diff --git a/CHANGELOG.md b/CHANGELOG.md index f486fccae6..ab020d5cc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,21 +6,36 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## 01/23/2021 - Release v0.1.5 ### Added -- +- `WiderFace` dataset format (, ) +- Function to transform annotations to labels () +- Dataset splits for classification, detection and re-id tasks (, ) +- `VGGFace2` dataset format (, ) +- Unique image count statistic () +- Installation with pip by name `datumaro` ### Changed -- +- `Dataset` class extended with new operations: `save`, `load`, `export`, `import_from`, `detect`, `run_model` () +- Allowed importing `Extractor`-only defined formats (in `Project.import_from`, `dataset.import_from` and CLI/`project import`) () +- `datum project ...` commands replaced with `datum ...` commands () +- Supported more image formats in `ImageNet` extractors () +- Allowed adding `Importer`-defined formats as project sources (`source add`) () +- Added max search depth in `ImageDir` format and importers () ### Deprecated -- +- `datum project ...` CLI context () ### Removed - ### Fixed -- +- Allow plugins inherited from `Extractor` (instead of only `SourceExtractor`) () +- Windows installation with `pip` for `pycocotools` () +- `YOLO` extractor path matching on Windows () +- Fixed inplace file copying when saving images () +- Fixed `labelmap` parameter type checking in `VOC` converter () +- Fixed model copying on addition in CLI () ### Security - diff --git a/README.md b/README.md index c8c153edd9..4694f4ec6b 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ CVAT annotations ---> Publication, statistics etc. - Convert only non-`occluded` annotations from a [CVAT](https://github.com/opencv/cvat) project to TFrecord: ```bash # export Datumaro dataset in CVAT UI, extract somewhere, go to the project dir - datum project filter -e '/item/annotation[occluded="False"]' \ + datum filter -e '/item/annotation[occluded="False"]' \ --mode items+anno --output-dir not_occluded - datum project export --project not_occluded \ + datum export --project not_occluded \ --format tf_detection_api -- --save-images ``` @@ -54,13 +54,13 @@ CVAT annotations ---> Publication, statistics etc. ```bash # Download COCO dataset http://cocodataset.org/#download # Put images to coco/images/ and annotations to coco/annotations/ - datum project import --format coco --input-path - datum project export --filter '/image[images_I_dont_like]' --format cvat \ + datum import --format coco --input-path + datum export --filter '/image[images_I_dont_like]' --format cvat \ --output-dir reannotation # import dataset and images to CVAT, re-annotate # export Datumaro project, extract to 'reannotation-upd' - datum project project merge reannotation-upd - datum project export --format coco + datum merge reannotation-upd + datum export --format coco ``` - Annotate instance polygons in [CVAT](https://github.com/opencv/cvat), export as masks in COCO: @@ -72,18 +72,18 @@ CVAT annotations ---> Publication, statistics etc. 
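The changelog above lists new `Dataset` operations (`save`, `load`, `export`, `import_from`, `detect`, `run_model`). As a rough Python counterpart to the first CLI example, here is a minimal, hedged sketch; the dataset path, the output directory and the `save_images` keyword are illustrative assumptions, not values taken from this patch:
```python
from datumaro.components.dataset import Dataset

# Load a COCO-like dataset (roughly 'datum import --format coco').
dataset = Dataset.import_from('path/to/coco_dataset', 'coco')  # path is illustrative

# Keep items that have a non-occluded annotation, mirroring the XPath used in the
# 'datum filter' example above (the CLI call additionally passes '--mode items+anno'
# to drop the filtered-out annotations themselves).
filtered = dataset.filter('/item/annotation[occluded="False"]')

# Export to TF Detection API format; 'save_images' is assumed to correspond to
# the converter's '--save-images' flag.
filtered.export('not_occluded', 'tf_detection_api', save_images=True)
```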
- Apply an OpenVINO detection model to some COCO-like dataset, then compare annotations with ground truth and visualize in TensorBoard: ```bash - datum project import --format coco --input-path + datum import --format coco --input-path # create model results interpretation script datum model add mymodel openvino \ --weights model.bin --description model.xml \ --interpretation-script parse_results.py datum model run --model mymodel --output-dir mymodel_inference/ - datum project diff mymodel_inference/ --format tensorboard --output-dir diff + datum diff mymodel_inference/ --format tensorboard --output-dir diff ``` - Change colors in PASCAL VOC-like `.png` masks: ```bash - datum project import --format voc --input-path + datum import --format voc --input-path # Create a color map file with desired colors: # @@ -93,12 +93,28 @@ CVAT annotations ---> Publication, statistics etc. # # Save as mycolormap.txt - datum project export --format voc_segmentation -- --label-map mycolormap.txt + datum export --format voc_segmentation -- --label-map mycolormap.txt # add "--apply-colormap=0" to save grayscale (indexed) masks # check "--help" option for more info # use "datum --loglevel debug" for extra conversion info ``` +- Create a custom COCO-like dataset: + ```python + import numpy as np + from datumaro.components.extractor import (DatasetItem, + Bbox, LabelCategories, AnnotationType) + from datumaro.components.dataset import Dataset + + dataset = Dataset(categories={ + AnnotationType.label: LabelCategories.from_iterable(['cat', 'dog']) + }) + dataset.put(DatasetItem(id=0, image=np.ones((5, 5, 3)), annotations=[ + Bbox(1, 2, 3, 4, label=0), + ])) + dataset.export('test_dataset', 'coco') + ``` + @@ -106,11 +122,13 @@ CVAT annotations ---> Publication, statistics etc. [(Back to top)](#table-of-contents) -- Dataset reading, writing, conversion in any direction. Supported formats: +- Dataset reading, writing, conversion in any direction. [Supported formats](docs/user_manual.md#supported-formats): - [COCO](http://cocodataset.org/#format-data) (`image_info`, `instances`, `person_keypoints`, `captions`, `labels`*) - [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) (`classification`, `detection`, `segmentation`, `action_classification`, `person_layout`) - [YOLO](https://github.com/AlexeyAB/darknet#how-to-train-pascal-voc-data) (`bboxes`) - [TF Detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md) (`bboxes`, `masks`) + - [WIDER Face](http://shuoyang1213.me/WIDERFACE/) (`bboxes`) + - [VGGFace2](https://github.com/ox-vgg/vgg_face2) (`landmarks`, `bboxes`) - [MOT sequences](https://arxiv.org/pdf/1906.04567.pdf) - [MOTS PNG](https://www.vision.rwth-aachen.de/page/mots) - [ImageNet](http://image-net.org/) @@ -129,6 +147,14 @@ CVAT annotations ---> Publication, statistics etc. 
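Two other additions from the changelog, format auto-detection (`Dataset.detect` and `Environment.detect_dataset`, further down in this patch) and the unique image count statistic, are also reachable from Python. A minimal sketch, assuming an illustrative dataset path:
```python
from datumaro.components.dataset import Dataset
from datumaro.components.operations import compute_ann_statistics

path = 'path/to/some_dataset'  # illustrative path

# Auto-detect the format, as 'datum import' does when '-f' is omitted;
# this raises if no format, or more than one format, matches the data.
fmt = Dataset.detect(path)
dataset = Dataset.import_from(path, fmt)

# Statistics now include the unique/repeated image counts added in this release.
stats = compute_ann_statistics(dataset)
print(stats['images count'], stats['unique images count'], stats['repeated images count'])
```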
- polygons to instance masks and vice versa - apply a custom colormap for mask annotations - rename or remove dataset labels + - Splitting a dataset into multiple subsets like `train`, `val`, and `test`: + - random split + - task-specific splits based on annotations, + which keep initial label and attribute distributions + - for classification task, based on labels + - for detection task, based on bboxes + - for re-identification task, based on labels, + avoiding having the same IDs in training and test splits - Dataset quality checking - Simple checking for errors - Comparison with model inference @@ -162,7 +188,7 @@ python -m virtualenv venv Install Datumaro package: ``` bash -pip install 'git+https://github.com/openvinotoolkit/datumaro' +pip install datumaro ``` ## Usage @@ -208,13 +234,14 @@ dataset = dataset.transform(project.env.transforms.get('remap_labels'), {'cat': 'dog', # rename cat to dog 'truck': 'car', # rename truck to car 'person': '', # remove this label - }, default='delete') + }, default='delete') # remove everything else +# iterate over dataset elements for item in dataset: print(item.id, item.annotations) # export the resulting dataset in COCO format -project.env.converters.get('coco').convert(dataset, save_dir='dst/dir') +dataset.export('dst/dir', 'coco') ``` > Check our [developer guide](docs/developer_guide.md) for additional information. diff --git a/datumaro/cli/__init__.py b/datumaro/cli/__init__.py index eb864e52b5..6ea3037239 100644 --- a/datumaro/cli/__init__.py +++ b/datumaro/cli/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/__main__.py b/datumaro/cli/__main__.py index 80a8805f56..005804cae0 100644 --- a/datumaro/cli/__main__.py +++ b/datumaro/cli/__main__.py @@ -1,5 +1,5 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -58,18 +58,25 @@ def make_parser(): _LogManager._define_loglevel_option(parser) known_contexts = [ - ('project', contexts.project, "Actions on projects (datasets)"), - ('source', contexts.source, "Actions on data sources"), - ('model', contexts.model, "Actions on models"), + ('project', contexts.project, "Actions with project (deprecated)"), + ('source', contexts.source, "Actions with data sources"), + ('model', contexts.model, "Actions with models"), ] known_commands = [ ('create', commands.create, "Create project"), - ('add', commands.add, "Add source to project"), - ('remove', commands.remove, "Remove source from project"), - ('export', commands.export, "Export project"), + ('import', commands.import_, "Create project from existing dataset"), + ('add', commands.add, "Add data source to project"), + ('remove', commands.remove, "Remove data source from project"), + ('export', commands.export, "Export project in some format"), + ('filter', commands.filter, "Filter project"), + ('transform', commands.transform, "Transform project"), + ('merge', commands.merge, "Merge projects"), + ('convert', commands.convert, "Convert dataset into another format"), + ('diff', commands.diff, "Compare projects with intersection"), + ('ediff', commands.ediff, "Compare projects for equality"), + ('stats', commands.stats, "Compute project statistics"), + ('info', commands.info, "Print project info"), ('explain', commands.explain, "Run Explainable AI algorithm for model"), - ('merge', commands.merge, "Merge datasets"), - ('convert', commands.convert, 
"Convert dataset"), ] # Argparse doesn't support subparser groups: diff --git a/datumaro/cli/commands/__init__.py b/datumaro/cli/commands/__init__.py index fe74bc2b09..2d87b945bb 100644 --- a/datumaro/cli/commands/__init__.py +++ b/datumaro/cli/commands/__init__.py @@ -1,6 +1,13 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT -from . import add, create, explain, export, remove, merge, convert +# pylint: disable=redefined-builtin + +from . import ( + create, add, remove, import_, + explain, + export, merge, convert, transform, filter, + diff, ediff, stats, + info +) diff --git a/datumaro/cli/commands/add.py b/datumaro/cli/commands/add.py index 288d7c047c..c43936ec81 100644 --- a/datumaro/cli/commands/add.py +++ b/datumaro/cli/commands/add.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/convert.py b/datumaro/cli/commands/convert.py index 6398bac73b..6d657d9b0b 100644 --- a/datumaro/cli/commands/convert.py +++ b/datumaro/cli/commands/convert.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -9,6 +8,7 @@ import os.path as osp from datumaro.components.project import Environment +from datumaro.components.dataset import Dataset from ..contexts.project import FilterModes from ..util import CliException, MultilineFormatter, make_file_name @@ -63,30 +63,17 @@ def convert_command(args): env = Environment() try: - converter = env.converters.get(args.output_format) + converter = env.converters[args.output_format] except KeyError: raise CliException("Converter for format '%s' is not found" % \ args.output_format) - extra_args = converter.from_cmdline(args.extra_args) - def converter_proxy(extractor, save_dir): - return converter.convert(extractor, save_dir, **extra_args) + extra_args = converter.parse_cmdline(args.extra_args) filter_args = FilterModes.make_filter_args(args.filter_mode) + fmt = args.input_format if not args.input_format: - matches = [] - for format_name in env.importers.items: - log.debug("Checking '%s' format...", format_name) - importer = env.make_importer(format_name) - try: - match = importer.detect(args.source) - if match: - log.debug("format matched") - matches.append((format_name, importer)) - except NotImplementedError: - log.debug("Format '%s' does not support auto detection.", - format_name) - + matches = env.detect_dataset(args.source) if len(matches) == 0: log.error("Failed to detect dataset format. " "Try to specify format with '-if/--input-format' parameter.") @@ -94,20 +81,11 @@ def converter_proxy(extractor, save_dir): elif len(matches) != 1: log.error("Multiple formats match the dataset: %s. 
" "Try to specify format with '-if/--input-format' parameter.", - ', '.join(m[0] for m in matches)) + ', '.join(matches)) return 2 - format_name, importer = matches[0] - args.input_format = format_name + fmt = matches[0] log.info("Source dataset format detected as '%s'", args.input_format) - else: - try: - importer = env.make_importer(args.input_format) - if hasattr(importer, 'from_cmdline'): - extra_args = importer.from_cmdline() - except KeyError: - raise CliException("Importer for format '%s' is not found" % \ - args.input_format) source = osp.abspath(args.source) @@ -121,15 +99,12 @@ def converter_proxy(extractor, save_dir): (osp.basename(source), make_file_name(args.output_format))) dst_dir = osp.abspath(dst_dir) - project = importer(source) - dataset = project.make_dataset() + dataset = Dataset.import_from(source, fmt) log.info("Exporting the dataset") - dataset.export_project( - save_dir=dst_dir, - converter=converter_proxy, - filter_expr=args.filter, - **filter_args) + if args.filter: + dataset = dataset.filter(args.filter, **filter_args) + dataset.export(format=args.output_format, save_dir=dst_dir, **extra_args) log.info("Dataset exported to '%s' as '%s'" % \ (dst_dir, args.output_format)) diff --git a/datumaro/cli/commands/create.py b/datumaro/cli/commands/create.py index 97e3c9b4cf..1396d5f9ed 100644 --- a/datumaro/cli/commands/create.py +++ b/datumaro/cli/commands/create.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/diff.py b/datumaro/cli/commands/diff.py new file mode 100644 index 0000000000..a50c8f0a4e --- /dev/null +++ b/datumaro/cli/commands/diff.py @@ -0,0 +1,7 @@ +# Copyright (C) 2019-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_diff_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/ediff.py b/datumaro/cli/commands/ediff.py new file mode 100644 index 0000000000..ac5ba8c467 --- /dev/null +++ b/datumaro/cli/commands/ediff.py @@ -0,0 +1,7 @@ +# Copyright (C) 2019-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_ediff_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/explain.py b/datumaro/cli/commands/explain.py index 4d5d16b2af..9c3e1d147a 100644 --- a/datumaro/cli/commands/explain.py +++ b/datumaro/cli/commands/explain.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/export.py b/datumaro/cli/commands/export.py index be47245d6b..1efb506459 100644 --- a/datumaro/cli/commands/export.py +++ b/datumaro/cli/commands/export.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/filter.py b/datumaro/cli/commands/filter.py new file mode 100644 index 0000000000..0b0d28cb9f --- /dev/null +++ b/datumaro/cli/commands/filter.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_filter_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/import_.py b/datumaro/cli/commands/import_.py new file mode 100644 index 
0000000000..74c47ab3cc --- /dev/null +++ b/datumaro/cli/commands/import_.py @@ -0,0 +1,7 @@ +# Copyright (C) 2019-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_import_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/info.py b/datumaro/cli/commands/info.py new file mode 100644 index 0000000000..fa2af5ddd9 --- /dev/null +++ b/datumaro/cli/commands/info.py @@ -0,0 +1,7 @@ +# Copyright (C) 2019-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_info_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/merge.py b/datumaro/cli/commands/merge.py index 2583cd8641..9e80b30971 100644 --- a/datumaro/cli/commands/merge.py +++ b/datumaro/cli/commands/merge.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/remove.py b/datumaro/cli/commands/remove.py index 7b9c0d3a2f..3ea0bbffa9 100644 --- a/datumaro/cli/commands/remove.py +++ b/datumaro/cli/commands/remove.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/commands/stats.py b/datumaro/cli/commands/stats.py new file mode 100644 index 0000000000..cb54eec394 --- /dev/null +++ b/datumaro/cli/commands/stats.py @@ -0,0 +1,7 @@ +# Copyright (C) 2019-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_stats_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/commands/transform.py b/datumaro/cli/commands/transform.py new file mode 100644 index 0000000000..474d6a2128 --- /dev/null +++ b/datumaro/cli/commands/transform.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +# pylint: disable=unused-import + +from ..contexts.project import build_transform_parser as build_parser \ No newline at end of file diff --git a/datumaro/cli/contexts/__init__.py b/datumaro/cli/contexts/__init__.py index 433efe9b86..b903435527 100644 --- a/datumaro/cli/contexts/__init__.py +++ b/datumaro/cli/contexts/__init__.py @@ -1,6 +1,6 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT -from . import project, source, model, item \ No newline at end of file +from . 
import project, source, model \ No newline at end of file diff --git a/datumaro/cli/contexts/item/__init__.py b/datumaro/cli/contexts/item/__init__.py deleted file mode 100644 index 8f74826d90..0000000000 --- a/datumaro/cli/contexts/item/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ - -# Copyright (C) 2019-2020 Intel Corporation -# -# SPDX-License-Identifier: MIT - -import argparse - -from ...util import add_subparser - - -def build_export_parser(parser_ctor=argparse.ArgumentParser): - parser = parser_ctor() - return parser - -def build_stats_parser(parser_ctor=argparse.ArgumentParser): - parser = parser_ctor() - return parser - -def build_diff_parser(parser_ctor=argparse.ArgumentParser): - parser = parser_ctor() - return parser - -def build_edit_parser(parser_ctor=argparse.ArgumentParser): - parser = parser_ctor() - return parser - -def build_parser(parser_ctor=argparse.ArgumentParser): - parser = parser_ctor() - - subparsers = parser.add_subparsers() - add_subparser(subparsers, 'export', build_export_parser) - add_subparser(subparsers, 'stats', build_stats_parser) - add_subparser(subparsers, 'diff', build_diff_parser) - add_subparser(subparsers, 'edit', build_edit_parser) - - return parser diff --git a/datumaro/cli/contexts/model/__init__.py b/datumaro/cli/contexts/model.py similarity index 93% rename from datumaro/cli/contexts/model/__init__.py rename to datumaro/cli/contexts/model.py index 69b7da1eae..9d625973da 100644 --- a/datumaro/cli/contexts/model/__init__.py +++ b/datumaro/cli/contexts/model.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -7,13 +6,11 @@ import logging as log import os import os.path as osp -import re -from datumaro.components.config import DEFAULT_FORMAT from datumaro.components.project import Environment -from ...util import CliException, MultilineFormatter, add_subparser -from ...util.project import load_project, \ +from ..util import CliException, MultilineFormatter, add_subparser +from ..util.project import load_project, \ generate_next_name, generate_next_file_name @@ -59,17 +56,18 @@ def add_command(args): assert args.name not in project.config.models, args.name try: - launcher = project.env.launchers.get(args.launcher) + launcher = project.env.launchers[args.launcher] except KeyError: raise CliException("Launcher '%s' is not found" % args.launcher) cli_plugin = getattr(launcher, 'cli_plugin', launcher) - model_args = cli_plugin.from_cmdline(args.extra_args) + model_args = cli_plugin.parse_cmdline(args.extra_args) if args.copy: log.info("Copying model data") - model_dir = project.local_model_dir(args.name) + model_dir = osp.join(project.config.project_dir, + project.local_model_dir(args.name)) os.makedirs(model_dir, exist_ok=False) try: diff --git a/datumaro/cli/contexts/project/__init__.py b/datumaro/cli/contexts/project/__init__.py index 8e805e9da4..3dbaeb6aa8 100644 --- a/datumaro/cli/contexts/project/__init__.py +++ b/datumaro/cli/contexts/project/__init__.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -87,7 +86,7 @@ def create_command(args): def build_import_parser(parser_ctor=argparse.ArgumentParser): builtins = sorted(Environment().importers.items) - parser = parser_ctor(help="Create project from existing dataset", + parser = parser_ctor(help="Create project from an existing dataset", description=""" Creates a project from an existing dataset. 
The source can be:|n - a dataset in a supported format (check 'formats' section below)|n @@ -172,50 +171,43 @@ def import_command(args): log.info("Importing project from '%s'" % args.source) extra_args = {} + fmt = args.format if not args.format: if args.extra_args: raise CliException("Extra args can not be used without format") log.info("Trying to detect dataset format...") - matches = [] - for format_name in env.importers.items: - log.debug("Checking '%s' format...", format_name) - importer = env.make_importer(format_name) - try: - match = importer.detect(args.source) - if match: - log.debug("format matched") - matches.append((format_name, importer)) - except NotImplementedError: - log.debug("Format '%s' does not support auto detection.", - format_name) - + matches = env.detect_dataset(args.source) if len(matches) == 0: - log.error("Failed to detect dataset format automatically. " + log.error("Failed to detect dataset format. " "Try to specify format with '-f/--format' parameter.") return 1 elif len(matches) != 1: log.error("Multiple formats match the dataset: %s. " "Try to specify format with '-f/--format' parameter.", - ', '.join(m[0] for m in matches)) - return 2 + ', '.join(matches)) + return 1 - format_name, importer = matches[0] - args.format = format_name - else: - try: - importer = env.make_importer(args.format) - if hasattr(importer, 'from_cmdline'): - extra_args = importer.from_cmdline(args.extra_args) - except KeyError: - raise CliException("Importer for format '%s' is not found" % \ - args.format) - - log.info("Importing project as '%s'" % args.format) - - source = osp.abspath(args.source) - project = importer(source, **extra_args) + fmt = matches[0] + elif args.extra_args: + if fmt in env.importers: + arg_parser = env.importers[fmt] + elif fmt in env.extractors: + arg_parser = env.extractors[fmt] + else: + raise CliException("Unknown format '%s'. 
A format can be added " + "by providing Extractor and Importer plugins" % fmt) + + if hasattr(arg_parser, 'parse_cmdline'): + extra_args = arg_parser.parse_cmdline(args.extra_args) + else: + raise CliException("Format '%s' does not accept " + "extra parameters" % fmt) + + log.info("Importing project as '%s'" % fmt) + + project = Project.import_from(osp.abspath(args.source), fmt, **extra_args) project.config.project_name = project_name project.config.project_dir = project_dir @@ -337,14 +329,11 @@ def export_command(args): dst_dir = osp.abspath(dst_dir) try: - converter = project.env.converters.get(args.format) + converter = project.env.converters[args.format] except KeyError: raise CliException("Converter for format '%s' is not found" % \ args.format) - - extra_args = converter.from_cmdline(args.extra_args) - def converter_proxy(extractor, save_dir): - return converter.convert(extractor, save_dir, **extra_args) + extra_args = converter.parse_cmdline(args.extra_args) filter_args = FilterModes.make_filter_args(args.filter_mode) @@ -352,13 +341,12 @@ def converter_proxy(extractor, save_dir): dataset = project.make_dataset() log.info("Exporting the project...") - dataset.export_project( - save_dir=dst_dir, - converter=converter_proxy, - filter_expr=args.filter, - **filter_args) - log.info("Project exported to '%s' as '%s'" % \ - (dst_dir, args.format)) + + if args.filter: + dataset = dataset.filter(args.filter, **filter_args) + dataset.export(format=args.format, save_dir=dst_dir, **extra_args) + + log.info("Project exported to '%s' as '%s'" % (dst_dir, args.format)) return 0 @@ -681,13 +669,13 @@ def transform_command(args): dst_dir = osp.abspath(dst_dir) try: - transform = project.env.transforms.get(args.transform) + transform = project.env.transforms[args.transform] except KeyError: raise CliException("Transform '%s' is not found" % args.transform) extra_args = {} - if hasattr(transform, 'from_cmdline'): - extra_args = transform.from_cmdline(args.extra_args) + if hasattr(transform, 'parse_cmdline'): + extra_args = transform.parse_cmdline(args.extra_args) log.info("Loading the project...") dataset = project.make_dataset() diff --git a/datumaro/cli/contexts/project/diff.py b/datumaro/cli/contexts/project/diff.py index 358f386057..7f638bbb0a 100644 --- a/datumaro/cli/contexts/project/diff.py +++ b/datumaro/cli/contexts/project/diff.py @@ -1,5 +1,5 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/contexts/source/__init__.py b/datumaro/cli/contexts/source.py similarity index 98% rename from datumaro/cli/contexts/source/__init__.py rename to datumaro/cli/contexts/source.py index 45dbdb1b52..caea28446c 100644 --- a/datumaro/cli/contexts/source/__init__.py +++ b/datumaro/cli/contexts/source.py @@ -1,5 +1,4 @@ - -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT @@ -10,8 +9,8 @@ import shutil from datumaro.components.project import Environment -from ...util import add_subparser, CliException, MultilineFormatter -from ...util.project import load_project +from ..util import add_subparser, CliException, MultilineFormatter +from ..util.project import load_project def build_add_parser(parser_ctor=argparse.ArgumentParser): diff --git a/datumaro/cli/util/__init__.py b/datumaro/cli/util/__init__.py index 4ee0b72b07..b9d496b154 100644 --- a/datumaro/cli/util/__init__.py +++ b/datumaro/cli/util/__init__.py @@ -1,5 +1,5 @@ -# Copyright 
(C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/cli/util/project.py b/datumaro/cli/util/project.py index 56590a4d1d..e157ded5ea 100644 --- a/datumaro/cli/util/project.py +++ b/datumaro/cli/util/project.py @@ -1,5 +1,5 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/components/cli_plugin.py b/datumaro/components/cli_plugin.py index e85f5c4f30..0346cddcea 100644 --- a/datumaro/components/cli_plugin.py +++ b/datumaro/components/cli_plugin.py @@ -31,7 +31,7 @@ def build_cmdline_parser(cls, **kwargs): return argparse.ArgumentParser(**args) @classmethod - def from_cmdline(cls, args=None): + def parse_cmdline(cls, args=None): if args and args[0] == '--': args = args[1:] parser = cls.build_cmdline_parser() diff --git a/datumaro/components/config.py b/datumaro/components/config.py index a79cda151b..72c461ae8f 100644 --- a/datumaro/components/config.py +++ b/datumaro/components/config.py @@ -150,14 +150,19 @@ def __setattr__(self, key, value): def __eq__(self, other): try: - for k, my_v in self.items(allow_internal=False): + keys = set(self.keys()) | set(other.keys()) + for k in keys: + my_v = self[k] other_v = other[k] if my_v != other_v: return False return True - except Exception: + except (KeyError, AttributeError): return False + def __repr__(self): + return repr(dict(self)) + def update(self, other): for k, v in other.items(): self.set(k, v) @@ -205,9 +210,12 @@ def set(self, key, value): return value @staticmethod - def parse(path): - with open(path, 'r') as f: - return Config(yaml.safe_load(f)) + def parse(path, *args, **kwargs): + if isinstance(path, str): + with open(path, 'r') as f: + return Config(yaml.safe_load(f), *args, **kwargs) + else: + return Config(yaml.safe_load(path), *args, **kwargs) @staticmethod def yaml_representer(dumper, value): @@ -215,13 +223,16 @@ def yaml_representer(dumper, value): value._items(allow_internal=False, allow_fallback=False)) def dump(self, path): - with open(path, 'w+') as f: - yaml.dump(self, f) + if isinstance(path, str): + with open(path, 'w') as f: + yaml.dump(self, f) + else: + yaml.dump(self, path) yaml.add_multi_representer(Config, Config.yaml_representer) -class DefaultConfig(Config): +class DictConfig(Config): def __init__(self, default=None): super().__init__() self.__dict__['_default'] = default @@ -232,6 +243,3 @@ def set(self, key, value): return super().set(key, value) else: return super().set(key, value) - - -DEFAULT_FORMAT = 'datumaro' \ No newline at end of file diff --git a/datumaro/components/config_model.py b/datumaro/components/config_model.py index c6f65179a6..49f85e9133 100644 --- a/datumaro/components/config_model.py +++ b/datumaro/components/config_model.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MIT from datumaro.components.config import Config, \ - DefaultConfig as _DefaultConfig, \ + DictConfig as _DictConfig, \ SchemaBuilder as _SchemaBuilder @@ -34,9 +34,9 @@ def __init__(self, config=None): .add('format_version', int) \ \ .add('subsets', list) \ - .add('sources', lambda: _DefaultConfig( + .add('sources', lambda: _DictConfig( lambda v=None: Source(v))) \ - .add('models', lambda: _DefaultConfig( + .add('models', lambda: _DictConfig( lambda v=None: Model(v))) \ \ .add('models_dir', str, internal=True) \ diff --git a/datumaro/components/converter.py b/datumaro/components/converter.py index 086e5d0d02..c6efc020ed 100644 --- 
a/datumaro/components/converter.py +++ b/datumaro/components/converter.py @@ -62,13 +62,15 @@ def _save_image(self, item, path=None): return path = path or self._make_image_filename(item) + path = osp.abspath(path) src_ext = item.image.ext.lower() dst_ext = osp.splitext(osp.basename(path))[1].lower() os.makedirs(osp.dirname(path), exist_ok=True) if src_ext == dst_ext and osp.isfile(item.image.path): - shutil.copyfile(item.image.path, path) + if item.image.path != path: + shutil.copyfile(item.image.path, path) elif src_ext == dst_ext and isinstance(item.image, ByteImage): with open(path, 'wb') as f: f.write(item.image.get_bytes()) diff --git a/datumaro/components/dataset.py b/datumaro/components/dataset.py index 7b0ff3f313..87693832cb 100644 --- a/datumaro/components/dataset.py +++ b/datumaro/components/dataset.py @@ -1,16 +1,25 @@ -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT from collections import OrderedDict, defaultdict from typing import Iterable, Union, Dict, List +import logging as log +import os +import os.path as osp +import shutil from datumaro.components.extractor import (Extractor, LabelCategories, AnnotationType, DatasetItem, DEFAULT_SUBSET_NAME) from datumaro.components.dataset_filter import \ XPathDatasetFilter, XPathAnnotationsFilter +from datumaro.components.environment import Environment +from datumaro.util import error_rollback +from datumaro.util.log_utils import logging_disabled +DEFAULT_FORMAT = 'datumaro' + class Dataset(Extractor): class Subset(Extractor): def __init__(self, parent): @@ -28,7 +37,8 @@ def categories(self): @classmethod def from_iterable(cls, iterable: Iterable[DatasetItem], - categories: Union[Dict, List[str]] = None): + categories: Union[Dict, List[str]] = None, + env: Environment = None): if isinstance(categories, list): categories = { AnnotationType.label: LabelCategories.from_iterable(categories) @@ -44,12 +54,12 @@ def __iter__(self): def categories(self): return categories - return cls.from_extractors(_extractor()) + return cls.from_extractors(_extractor(), env=env) @classmethod - def from_extractors(cls, *sources): + def from_extractors(cls, *sources, env=None): categories = cls._merge_categories(s.categories() for s in sources) - dataset = Dataset(categories=categories) + dataset = Dataset(categories=categories, env=env) # merge items subsets = defaultdict(lambda: cls.Subset(dataset)) @@ -67,9 +77,12 @@ def from_extractors(cls, *sources): dataset._subsets = dict(subsets) return dataset - def __init__(self, categories=None): + def __init__(self, categories=None, env=None): super().__init__() + assert env is None or isinstance(env, Environment), env + self._env = env + self._subsets = {} if not categories: @@ -183,4 +196,97 @@ def _merge_anno(a, b): def _merge_categories(sources): # TODO: implement properly with merging and annotations remapping from .operations import merge_categories - return merge_categories(sources) \ No newline at end of file + return merge_categories(sources) + + @error_rollback('on_error', implicit=True) + def export(self, save_dir, format, **kwargs): #pylint: disable=redefined-builtin + if isinstance(format, str): + converter = self.env.make_converter(format) + else: + converter = format + + save_dir = osp.abspath(save_dir) + if not osp.exists(save_dir): + on_error.do(shutil.rmtree, save_dir, ignore_errors=True) + os.makedirs(save_dir, exist_ok=True) + converter(self, save_dir=save_dir, **kwargs) + + def transform(self, method, *args, **kwargs): + 
if isinstance(method, str): + method = self.env.make_transform(method) + + result = super().transform(method, *args, **kwargs) + return Dataset.from_extractors(result, env=self._env) + + def run_model(self, model, batch_size=1): + from datumaro.components.launcher import Launcher, ModelTransform + if isinstance(model, Launcher): + return self.transform(ModelTransform, launcher=model, + batch_size=batch_size) + elif isinstance(model, ModelTransform): + return self.transform(model, batch_size=batch_size) + else: + raise TypeError('Unexpected model argument type: %s' % type(model)) + + @property + def env(self): + if not self._env: + self._env = Environment() + return self._env + + def save(self, save_dir, **kwargs): + self.export(save_dir, format=DEFAULT_FORMAT, **kwargs) + + @classmethod + def load(cls, path, **kwargs): + return cls.import_from(path, format=DEFAULT_FORMAT, **kwargs) + + @classmethod + def import_from(cls, path, format=None, env=None, **kwargs): #pylint: disable=redefined-builtin + from datumaro.components.config_model import Source + + if env is None: + env = Environment() + + if not format: + format = cls.detect(path, env) + + # TODO: remove importers, put this logic into extractors + if format in env.importers: + importer = env.make_importer(format) + with logging_disabled(log.INFO): + project = importer(path, **kwargs) + detected_sources = list(project.config.sources.values()) + elif format in env.extractors: + detected_sources = [{ + 'url': path, 'format': format, 'options': kwargs + }] + else: + raise Exception("Unknown source format '%s'. To make it " + "available, add the corresponding Extractor implementation " + "to the environment" % format) + + extractors = [] + for src_conf in detected_sources: + if not isinstance(src_conf, Source): + src_conf = Source(src_conf) + extractors.append(env.make_extractor( + src_conf.format, src_conf.url, **src_conf.options + )) + + return cls.from_extractors(*extractors) + + @staticmethod + def detect(path, env=None): + if env is None: + env = Environment() + + matches = env.detect_dataset(path) + if not matches: + raise Exception("Failed to detect dataset format automatically: " + "no matching formats found") + if 1 < len(matches): + raise Exception("Failed to detect dataset format automatically:" + " data matches more than one format: %s" % \ + ', '.join(matches)) + return matches[0] \ No newline at end of file diff --git a/datumaro/components/environment.py b/datumaro/components/environment.py new file mode 100644 index 0000000000..3db6b1b400 --- /dev/null +++ b/datumaro/components/environment.py @@ -0,0 +1,310 @@ +# Copyright (C) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: MIT + +from functools import partial +from glob import glob +import git +import inspect +import logging as log +import os +import os.path as osp + +from datumaro.components.config import Config +from datumaro.components.config_model import Model, Source +from datumaro.util.os_util import import_foreign_module + + +class Registry: + def __init__(self, config=None, item_type=None): + self.item_type = item_type + + self.items = {} + + if config is not None: + self.load(config) + + def load(self, config): + pass + + def register(self, name, value): + if self.item_type: + value = self.item_type(value) + self.items[name] = value + return value + + def unregister(self, name): + return self.items.pop(name, None) + + def get(self, key): + """Returns a class or a factory function""" + return self.items[key] + + def __getitem__(self, key): + return 
self.get(key) + + def __contains__(self, key): + return key in self.items + + +class ModelRegistry(Registry): + def __init__(self, config=None): + super().__init__(config, item_type=Model) + + def load(self, config): + # TODO: list default dir, insert values + if 'models' in config: + for name, model in config.models.items(): + self.register(name, model) + + +class SourceRegistry(Registry): + def __init__(self, config=None): + super().__init__(config, item_type=Source) + + def load(self, config): + # TODO: list default dir, insert values + if 'sources' in config: + for name, source in config.sources.items(): + self.register(name, source) + + +class PluginRegistry(Registry): + def __init__(self, config=None, builtin=None, local=None): + super().__init__(config) + + from datumaro.components.cli_plugin import CliPlugin + + if builtin is not None: + for v in builtin: + k = CliPlugin._get_name(v) + self.register(k, v) + if local is not None: + for v in local: + k = CliPlugin._get_name(v) + self.register(k, v) + + +class GitWrapper: + def __init__(self, config=None): + self.repo = None + + if config is not None and config.project_dir: + self.init(config.project_dir) + + @staticmethod + def _git_dir(base_path): + return osp.join(base_path, '.git') + + @classmethod + def spawn(cls, path): + spawn = not osp.isdir(cls._git_dir(path)) + repo = git.Repo.init(path=path) + if spawn: + repo.config_writer().set_value("user", "name", "User") \ + .set_value("user", "email", "user@nowhere.com") \ + .release() + # gitpython does not support init, use git directly + repo.git.init() + repo.git.commit('-m', 'Initial commit', '--allow-empty') + return repo + + def init(self, path): + self.repo = self.spawn(path) + return self.repo + + def is_initialized(self): + return self.repo is not None + + def create_submodule(self, name, dst_dir, **kwargs): + self.repo.create_submodule(name, dst_dir, **kwargs) + + def has_submodule(self, name): + return name in [submodule.name for submodule in self.repo.submodules] + + def remove_submodule(self, name, **kwargs): + return self.repo.submodule(name).remove(**kwargs) + + +class Environment: + _builtin_plugins = None + PROJECT_EXTRACTOR_NAME = 'datumaro_project' + + def __init__(self, config=None): + from datumaro.components.project import ( + PROJECT_DEFAULT_CONFIG, PROJECT_SCHEMA, load_project_as_dataset) + config = Config(config, + fallback=PROJECT_DEFAULT_CONFIG, schema=PROJECT_SCHEMA) + + self.models = ModelRegistry(config) + self.sources = SourceRegistry(config) + + self.git = GitWrapper(config) + + env_dir = osp.join(config.project_dir, config.env_dir) + builtin = self._load_builtin_plugins() + custom = self._load_plugins2(osp.join(env_dir, config.plugins_dir)) + select = lambda seq, t: [e for e in seq if issubclass(e, t)] + from datumaro.components.converter import Converter + from datumaro.components.extractor import (Importer, SourceExtractor, + Transform) + from datumaro.components.launcher import Launcher + self.extractors = PluginRegistry( + builtin=select(builtin, SourceExtractor), + local=select(custom, SourceExtractor) + ) + self.extractors.register(self.PROJECT_EXTRACTOR_NAME, + load_project_as_dataset) + + self.importers = PluginRegistry( + builtin=select(builtin, Importer), + local=select(custom, Importer) + ) + self.launchers = PluginRegistry( + builtin=select(builtin, Launcher), + local=select(custom, Launcher) + ) + self.converters = PluginRegistry( + builtin=select(builtin, Converter), + local=select(custom, Converter) + ) + self.transforms = 
PluginRegistry( + builtin=select(builtin, Transform), + local=select(custom, Transform) + ) + + @staticmethod + def _find_plugins(plugins_dir): + plugins = [] + if not osp.exists(plugins_dir): + return plugins + + for plugin_name in os.listdir(plugins_dir): + p = osp.join(plugins_dir, plugin_name) + if osp.isfile(p) and p.endswith('.py'): + plugins.append((plugins_dir, plugin_name, None)) + elif osp.isdir(p): + plugins += [(plugins_dir, + osp.splitext(plugin_name)[0] + '.' + osp.basename(p), + osp.splitext(plugin_name)[0] + ) + for p in glob(osp.join(p, '*.py'))] + return plugins + + @classmethod + def _import_module(cls, module_dir, module_name, types, package=None): + module = import_foreign_module(osp.splitext(module_name)[0], module_dir, + package=package) + + exports = [] + if hasattr(module, 'exports'): + exports = module.exports + else: + for symbol in dir(module): + if symbol.startswith('_'): + continue + exports.append(getattr(module, symbol)) + + exports = [s for s in exports + if inspect.isclass(s) and issubclass(s, types) and not s in types] + + return exports + + @classmethod + def _load_plugins(cls, plugins_dir, types): + types = tuple(types) + + plugins = cls._find_plugins(plugins_dir) + + all_exports = [] + for module_dir, module_name, package in plugins: + try: + exports = cls._import_module(module_dir, module_name, types, + package) + except Exception as e: + module_search_error = ImportError + try: + module_search_error = ModuleNotFoundError # python 3.6+ + except NameError: + pass + + message = ["Failed to import module '%s': %s", module_name, e] + if isinstance(e, module_search_error): + log.debug(*message) + else: + log.warning(*message) + continue + + log.debug("Imported the following symbols from %s: %s" % \ + ( + module_name, + ', '.join(s.__name__ for s in exports) + ) + ) + all_exports.extend(exports) + + return all_exports + + @classmethod + def _load_builtin_plugins(cls): + if not cls._builtin_plugins: + plugins_dir = osp.join( + __file__[: __file__.rfind(osp.join('datumaro', 'components'))], + osp.join('datumaro', 'plugins') + ) + assert osp.isdir(plugins_dir), plugins_dir + cls._builtin_plugins = cls._load_plugins2(plugins_dir) + return cls._builtin_plugins + + @classmethod + def _load_plugins2(cls, plugins_dir): + from datumaro.components.converter import Converter + from datumaro.components.extractor import (Extractor, Importer, + Transform) + from datumaro.components.launcher import Launcher + types = [Extractor, Converter, Importer, Launcher, Transform] + + return cls._load_plugins(plugins_dir, types) + + def make_extractor(self, name, *args, **kwargs): + return self.extractors.get(name)(*args, **kwargs) + + def make_importer(self, name, *args, **kwargs): + return self.importers.get(name)(*args, **kwargs) + + def make_launcher(self, name, *args, **kwargs): + return self.launchers.get(name)(*args, **kwargs) + + def make_converter(self, name, *args, **kwargs): + result = self.converters.get(name) + if inspect.isclass(result): + result = result.convert + return partial(result, *args, **kwargs) + + def make_transform(self, name, *args, **kwargs): + return partial(self.transforms.get(name), *args, **kwargs) + + def register_model(self, name, model): + self.models.register(name, model) + + def unregister_model(self, name): + self.models.unregister(name) + + def is_format_known(self, name): + return name in self.importers or name in self.extractors + + def detect_dataset(self, path): + matches = [] + + for format_name, importer in self.importers.items.items(): + 
log.debug("Checking '%s' format...", format_name) + try: + match = importer.detect(path) + if match: + log.debug("format matched") + matches.append(format_name) + except NotImplementedError: + log.debug("Format '%s' does not support auto detection.", + format_name) + + return matches diff --git a/datumaro/components/extractor.py b/datumaro/components/extractor.py index 7bf604ce53..1c81462779 100644 --- a/datumaro/components/extractor.py +++ b/datumaro/components/extractor.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MIT from enum import Enum -from glob import glob +from glob import iglob from typing import List, Dict import numpy as np import os.path as osp @@ -615,12 +615,19 @@ def __call__(self, path, **extra_params): return project @classmethod - def _find_sources_recursive(cls, path, ext, extractor_name, filename='*'): + def _find_sources_recursive(cls, path, ext, extractor_name, + filename='*', dirname='', file_filter=None, max_depth=3): if path.endswith(ext) and osp.isfile(path): sources = [{'url': path, 'format': extractor_name}] else: - sources = [{'url': p, 'format': extractor_name} for p in - glob(osp.join(path, '**', filename + ext), recursive=True)] + sources = [] + for d in range(max_depth + 1): + sources.extend({'url': p, 'format': extractor_name} for p in + iglob(osp.join(path, *('*' * d), dirname, filename + ext)) + if (callable(file_filter) and file_filter(p)) \ + or (not callable(file_filter))) + if sources: + break return sources class Transform(Extractor): diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index 38a074757a..78f19fe856 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -1063,8 +1063,14 @@ def compute_ann_statistics(dataset): def get_label(ann): return labels.items[ann.label].name if ann.label is not None else None + unique_images = find_unique_images(dataset) + repeated_images = [sorted(g) for g in unique_images.values() if 1 < len(g)] + stats = { 'images count': len(dataset), + 'unique images count': len(unique_images), + 'repeated images count': len(repeated_images), + 'repeated images': repeated_images, # [[id1, id2], [id3, id4, id5], ...] 
'annotations count': 0, 'unannotated images count': 0, 'unannotated images': [], @@ -1277,24 +1283,8 @@ def match_items_by_id(a, b): return matches, a_unmatched, b_unmatched def match_items_by_image_hash(a, b): - def _hash(item): - if not item.image.has_data: - log.warning("Image (%s, %s) has no image " - "data, counted as unmatched", item.id, item.subset) - return None - return hashlib.md5(item.image.data.tobytes()).hexdigest() - - def _build_hashmap(source): - d = {} - for item in source: - h = _hash(item) - if h is None: - h = str(id(item)) # anything unique - d.setdefault(h, []).append((item.id, item.subset)) - return d - - a_hash = _build_hashmap(a) - b_hash = _build_hashmap(b) + a_hash = find_unique_images(a) + b_hash = find_unique_images(b) a_items = set(a_hash) b_items = set(b_hash) @@ -1309,6 +1299,28 @@ def _build_hashmap(source): return matches, a_unmatched, b_unmatched +def find_unique_images(dataset, item_hash=None): + def _default_hash(item): + if not item.image or not item.image.has_data: + if item.image and item.image.path: + return hash(item.image.path) + + log.warning("Item (%s, %s) has no image " + "info, counted as unique", item.id, item.subset) + return None + return hashlib.md5(item.image.data.tobytes()).hexdigest() + + if item_hash is None: + item_hash = _default_hash + + unique = {} + for item in dataset: + h = item_hash(item) + if h is None: + h = str(id(item)) # anything unique + unique.setdefault(h, set()).add((item.id, item.subset)) + return unique + @attrs class ExactComparator: match_images = attrib(kw_only=True, type=bool, default=False) diff --git a/datumaro/components/project.py b/datumaro/components/project.py index 094dd22ffe..e1d6bc340b 100644 --- a/datumaro/components/project.py +++ b/datumaro/components/project.py @@ -1,330 +1,45 @@ -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT from collections import defaultdict -from glob import glob -import git -import importlib -import inspect import logging as log import os import os.path as osp import shutil -import sys -from datumaro.components.config import Config, DEFAULT_FORMAT +from datumaro.components.config import Config from datumaro.components.config_model import (Model, Source, PROJECT_DEFAULT_CONFIG, PROJECT_SCHEMA) +from datumaro.components.environment import Environment from datumaro.components.launcher import ModelTransform -from datumaro.components.dataset import Dataset +from datumaro.components.dataset import Dataset, DEFAULT_FORMAT -def import_foreign_module(name, path, package=None): - module = None - default_path = sys.path.copy() - try: - sys.path = [ osp.abspath(path), ] + default_path - sys.modules.pop(name, None) # remove from cache - module = importlib.import_module(name, package=package) - sys.modules.pop(name) # remove from cache - except Exception: - raise - finally: - sys.path = default_path - return module - - -class Registry: - def __init__(self, config=None, item_type=None): - self.item_type = item_type - - self.items = {} - - if config is not None: - self.load(config) - - def load(self, config): - pass - - def register(self, name, value): - if self.item_type: - value = self.item_type(value) - self.items[name] = value - return value - - def unregister(self, name): - return self.items.pop(name, None) - - def get(self, key): - return self.items[key] # returns a class / ctor - - -class ModelRegistry(Registry): - def __init__(self, config=None): - super().__init__(config, item_type=Model) - - def load(self, 
config): - # TODO: list default dir, insert values - if 'models' in config: - for name, model in config.models.items(): - self.register(name, model) - - -class SourceRegistry(Registry): - def __init__(self, config=None): - super().__init__(config, item_type=Source) - - def load(self, config): - # TODO: list default dir, insert values - if 'sources' in config: - for name, source in config.sources.items(): - self.register(name, source) - -class PluginRegistry(Registry): - def __init__(self, config=None, builtin=None, local=None): - super().__init__(config) - - from datumaro.components.cli_plugin import CliPlugin - - if builtin is not None: - for v in builtin: - k = CliPlugin._get_name(v) - self.register(k, v) - if local is not None: - for v in local: - k = CliPlugin._get_name(v) - self.register(k, v) - -class GitWrapper: - def __init__(self, config=None): - self.repo = None - - if config is not None and config.project_dir: - self.init(config.project_dir) - - @staticmethod - def _git_dir(base_path): - return osp.join(base_path, '.git') - - @classmethod - def spawn(cls, path): - spawn = not osp.isdir(cls._git_dir(path)) - repo = git.Repo.init(path=path) - if spawn: - repo.config_writer().set_value("user", "name", "User") \ - .set_value("user", "email", "user@nowhere.com") \ - .release() - # gitpython does not support init, use git directly - repo.git.init() - repo.git.commit('-m', 'Initial commit', '--allow-empty') - return repo - - def init(self, path): - self.repo = self.spawn(path) - return self.repo - - def is_initialized(self): - return self.repo is not None - - def create_submodule(self, name, dst_dir, **kwargs): - self.repo.create_submodule(name, dst_dir, **kwargs) - - def has_submodule(self, name): - return name in [submodule.name for submodule in self.repo.submodules] - - def remove_submodule(self, name, **kwargs): - return self.repo.submodule(name).remove(**kwargs) - -def load_project_as_dataset(url): - # symbol forward declaration - raise NotImplementedError() - -class Environment: - _builtin_plugins = None - PROJECT_EXTRACTOR_NAME = 'datumaro_project' - - def __init__(self, config=None): - config = Config(config, - fallback=PROJECT_DEFAULT_CONFIG, schema=PROJECT_SCHEMA) - - self.models = ModelRegistry(config) - self.sources = SourceRegistry(config) - - self.git = GitWrapper(config) - - env_dir = osp.join(config.project_dir, config.env_dir) - builtin = self._load_builtin_plugins() - custom = self._load_plugins2(osp.join(env_dir, config.plugins_dir)) - select = lambda seq, t: [e for e in seq if issubclass(e, t)] - from datumaro.components.extractor import Transform - from datumaro.components.extractor import SourceExtractor - from datumaro.components.extractor import Importer - from datumaro.components.converter import Converter - from datumaro.components.launcher import Launcher - self.extractors = PluginRegistry( - builtin=select(builtin, SourceExtractor), - local=select(custom, SourceExtractor) - ) - self.extractors.register(self.PROJECT_EXTRACTOR_NAME, - load_project_as_dataset) - - self.importers = PluginRegistry( - builtin=select(builtin, Importer), - local=select(custom, Importer) - ) - self.launchers = PluginRegistry( - builtin=select(builtin, Launcher), - local=select(custom, Launcher) - ) - self.converters = PluginRegistry( - builtin=select(builtin, Converter), - local=select(custom, Converter) - ) - self.transforms = PluginRegistry( - builtin=select(builtin, Transform), - local=select(custom, Transform) - ) - - @staticmethod - def _find_plugins(plugins_dir): - plugins = [] 
- if not osp.exists(plugins_dir): - return plugins - - for plugin_name in os.listdir(plugins_dir): - p = osp.join(plugins_dir, plugin_name) - if osp.isfile(p) and p.endswith('.py'): - plugins.append((plugins_dir, plugin_name, None)) - elif osp.isdir(p): - plugins += [(plugins_dir, - osp.splitext(plugin_name)[0] + '.' + osp.basename(p), - osp.splitext(plugin_name)[0] - ) - for p in glob(osp.join(p, '*.py'))] - return plugins - - @classmethod - def _import_module(cls, module_dir, module_name, types, package=None): - module = import_foreign_module(osp.splitext(module_name)[0], module_dir, - package=package) - - exports = [] - if hasattr(module, 'exports'): - exports = module.exports - else: - for symbol in dir(module): - if symbol.startswith('_'): - continue - exports.append(getattr(module, symbol)) - - exports = [s for s in exports - if inspect.isclass(s) and issubclass(s, types) and not s in types] - - return exports - - @classmethod - def _load_plugins(cls, plugins_dir, types): - types = tuple(types) - - plugins = cls._find_plugins(plugins_dir) - - all_exports = [] - for module_dir, module_name, package in plugins: - try: - exports = cls._import_module(module_dir, module_name, types, - package) - except Exception as e: - module_search_error = ImportError - try: - module_search_error = ModuleNotFoundError # python 3.6+ - except NameError: - pass - - message = ["Failed to import module '%s': %s", module_name, e] - if isinstance(e, module_search_error): - log.debug(*message) - else: - log.warning(*message) - continue - - log.debug("Imported the following symbols from %s: %s" % \ - ( - module_name, - ', '.join(s.__name__ for s in exports) - ) - ) - all_exports.extend(exports) - - return all_exports - - @classmethod - def _load_builtin_plugins(cls): - if not cls._builtin_plugins: - plugins_dir = osp.join( - __file__[: __file__.rfind(osp.join('datumaro', 'components'))], - osp.join('datumaro', 'plugins') - ) - assert osp.isdir(plugins_dir), plugins_dir - cls._builtin_plugins = cls._load_plugins2(plugins_dir) - return cls._builtin_plugins - - @classmethod - def _load_plugins2(cls, plugins_dir): - from datumaro.components.extractor import Transform - from datumaro.components.extractor import SourceExtractor - from datumaro.components.extractor import Importer - from datumaro.components.converter import Converter - from datumaro.components.launcher import Launcher - types = [SourceExtractor, Converter, Importer, Launcher, Transform] - - return cls._load_plugins(plugins_dir, types) - - def make_extractor(self, name, *args, **kwargs): - return self.extractors.get(name)(*args, **kwargs) - - def make_importer(self, name, *args, **kwargs): - return self.importers.get(name)(*args, **kwargs) - - def make_launcher(self, name, *args, **kwargs): - return self.launchers.get(name)(*args, **kwargs) - - def make_converter(self, name, *args, **kwargs): - return self.converters.get(name)(*args, **kwargs) - - def register_model(self, name, model): - self.models.register(name, model) - - def unregister_model(self, name): - self.models.unregister(name) - class ProjectDataset(Dataset): def __init__(self, project): super().__init__() self._project = project + self._env = project.env config = self.config env = self.env sources = {} for s_name, source in config.sources.items(): s_format = source.format or env.PROJECT_EXTRACTOR_NAME - options = {} - options.update(source.options) url = source.url if not source.url: url = osp.join(config.project_dir, config.sources_dir, s_name) - sources[s_name] = 
env.make_extractor(s_format, url, **options) + sources[s_name] = Dataset.import_from(url, + format=s_format, env=env, **source.options) self._sources = sources own_source = None own_source_dir = osp.join(config.project_dir, config.dataset_dir) if config.project_dir and osp.isdir(own_source_dir): - log.disable(log.INFO) - own_source = env.make_importer(DEFAULT_FORMAT)(own_source_dir) \ - .make_dataset() - log.disable(log.NOTSET) + own_source = Dataset.load(own_source_dir) # merge categories # TODO: implement properly with merging and annotations remapping @@ -458,10 +173,6 @@ def save(self, save_dir=None, merge=False, recursive=True, shutil.rmtree(save_dir, ignore_errors=True) raise - @property - def env(self): - return self._project.env - @property def config(self): return self._project.config @@ -471,7 +182,9 @@ def sources(self): return self._sources def _save_branch_project(self, extractor, save_dir=None): - extractor = Dataset.from_extractors(extractor) # apply lazy transforms + if not isinstance(extractor, Dataset): + extractor = Dataset.from_extractors( + extractor) # apply lazy transforms to avoid repeating traversals # NOTE: probably this function should be in the ViewModel layer save_dir = osp.abspath(save_dir) @@ -585,16 +298,45 @@ def generate(save_dir, config=None): return project @staticmethod - def import_from(path, dataset_format, env=None, **kwargs): + def import_from(path, dataset_format=None, env=None, **format_options): if env is None: env = Environment() - importer = env.make_importer(dataset_format) - return importer(path, **kwargs) - def __init__(self, config=None): + if not dataset_format: + matches = env.detect_dataset(path) + if not matches: + raise Exception("Failed to detect dataset format automatically") + if 1 < len(matches): + raise Exception("Failed to detect dataset format automatically:" + " data matches more than one format: %s" % \ + ', '.join(matches)) + dataset_format = matches[0] + elif not env.is_format_known(dataset_format): + raise KeyError("Unknown dataset format '%s'" % dataset_format) + + if dataset_format in env.importers: + project = env.make_importer(dataset_format)(path, **format_options) + elif dataset_format in env.extractors: + project = Project(env=env) + project.add_source('source', { + 'url': path, + 'format': dataset_format, + 'options': format_options, + }) + else: + raise Exception("Unknown format '%s'. 
To make it " + "available, add the corresponding Extractor implementation " + "to the environment" % dataset_format) + return project + + def __init__(self, config=None, env=None): self.config = Config(config, fallback=PROJECT_DEFAULT_CONFIG, schema=PROJECT_SCHEMA) - self.env = Environment(self.config) + if env is None: + env = Environment(self.config) + elif config is not None: + raise ValueError("env can only be provided when no config provided") + self.env = env def make_dataset(self): return ProjectDataset(self) @@ -643,7 +385,8 @@ def remove_model(self, name): def make_executable_model(self, name): model = self.get_model(name) return self.env.make_launcher(model.launcher, - **model.options, model_dir=self.local_model_dir(name)) + **model.options, model_dir=osp.join( + self.config.project_dir, self.local_model_dir(name))) def make_source_project(self, name): source = self.get_source(name) @@ -662,8 +405,5 @@ def local_model_dir(self, model_name): def local_source_dir(self, source_name): return osp.join(self.config.sources_dir, source_name) -# pylint: disable=function-redefined def load_project_as_dataset(url): - # implement the function declared above return Project.load(url).make_dataset() -# pylint: enable=function-redefined diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 6049ce6af4..ace780148b 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -191,13 +191,8 @@ def _lazy_extract_mask(mask, c): class CamvidImporter(Importer): @classmethod def find_sources(cls, path): - subset_paths = [p for p in glob(osp.join(path, '**.txt'), recursive=True) - if osp.basename(p) != CamvidPath.LABELMAP_FILE] - sources = [] - for subset_path in subset_paths: - sources += cls._find_sources_recursive( - subset_path, '.txt', 'camvid') - return sources + return cls._find_sources_recursive(path, '.txt', 'camvid', + file_filter=lambda p: osp.basename(p) != CamvidPath.LABELMAP_FILE) LabelmapType = Enum('LabelmapType', ['camvid', 'source']) diff --git a/datumaro/plugins/coco_format/converter.py b/datumaro/plugins/coco_format/converter.py index 71cb37dbc0..44d604ab09 100644 --- a/datumaro/plugins/coco_format/converter.py +++ b/datumaro/plugins/coco_format/converter.py @@ -16,7 +16,7 @@ import datumaro.util.mask_tools as mask_tools from datumaro.components.converter import Converter from datumaro.components.extractor import (_COORDINATE_ROUNDING_DIGITS, - DEFAULT_SUBSET_NAME, AnnotationType, Points) + AnnotationType, Points) from datumaro.util import cast, find, str_to_bool from .format import CocoPath, CocoTask @@ -144,7 +144,9 @@ def save_annotations(self, item): log.warning("Item '%s', ann #%s: failed to convert " "attribute 'score': %e" % (item.id, ann_idx, e)) if self._context._allow_attributes: - elem['attributes'] = self._convert_attributes(ann) + attrs = self._convert_attributes(ann) + if attrs: + elem['attributes'] = attrs self.annotations.append(elem) @@ -317,7 +319,9 @@ def convert_instance(self, instance, item): log.warning("Item '%s': failed to convert attribute " "'score': %e" % (item.id, e)) if self._context._allow_attributes: - elem['attributes'] = self._convert_attributes(ann) + attrs = self._convert_attributes(ann) + if attrs: + elem['attributes'] = attrs return elem @@ -434,7 +438,9 @@ def save_annotations(self, item): log.warning("Item '%s': failed to convert attribute " "'score': %e" % (item.id, e)) if self._context._allow_attributes: - elem['attributes'] = self._convert_attributes(ann) + attrs = 
self._convert_attributes(ann) + if attrs: + elem['attributes'] = attrs self.annotations.append(elem) diff --git a/datumaro/plugins/image_dir.py b/datumaro/plugins/image_dir.py index 9be3092944..f8a45baa67 100644 --- a/datumaro/plugins/image_dir.py +++ b/datumaro/plugins/image_dir.py @@ -9,7 +9,7 @@ from datumaro.components.extractor import DatasetItem, SourceExtractor, Importer from datumaro.components.converter import Converter -from datumaro.util.image import Image +from datumaro.util.os_util import walk class ImageDirImporter(Importer): @@ -20,23 +20,21 @@ def find_sources(cls, path): return [{ 'url': path, 'format': 'image_dir' }] class ImageDirExtractor(SourceExtractor): - def __init__(self, url): + IMAGE_EXT_FORMATS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', + '.pgm', '.tif', '.tiff'} + + def __init__(self, url, max_depth=10): super().__init__() assert osp.isdir(url), url - for dirpath, _, filenames in os.walk(url): + for dirpath, _, filenames in walk(url, max_depth=max_depth): for name in filenames: - path = osp.join(dirpath, name) - image = Image(path=path) - try: - # force loading - image.data # pylint: disable=pointless-statement - except Exception: + if not osp.splitext(name)[-1] in self.IMAGE_EXT_FORMATS: continue - + path = osp.join(dirpath, name) item_id = osp.relpath(osp.splitext(path)[0], url) - self._items.append(DatasetItem(id=item_id, image=image)) + self._items.append(DatasetItem(id=item_id, image=path)) class ImageDirConverter(Converter): DEFAULT_IMAGE_EXT = '.jpg' diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py index 0e0669a9db..9702262008 100644 --- a/datumaro/plugins/imagenet_format.py +++ b/datumaro/plugins/imagenet_format.py @@ -15,7 +15,9 @@ class ImagenetPath: - IMAGES_EXT = '.jpg' + DEFAULT_IMAGE_EXT = '.jpg' + IMAGE_EXT_FORMATS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', + '.pgm', '.tif', '.tiff'} IMAGES_DIR_NO_LABEL = 'no_label' @@ -37,7 +39,9 @@ def _load_categories(self, path): def _load_items(self, path): items = {} for image_path in glob(osp.join(path, '*', '*')): - if osp.splitext(image_path)[1] != ImagenetPath.IMAGES_EXT: + if not osp.isfile(image_path) or \ + osp.splitext(image_path)[-1] not in \ + ImagenetPath.IMAGE_EXT_FORMATS: continue label = osp.basename(osp.dirname(image_path)) image_name = osp.splitext(osp.basename(image_path))[0][len(label) + 1:] @@ -62,7 +66,7 @@ def find_sources(cls, path): class ImagenetConverter(Converter): - DEFAULT_IMAGE_EXT = ImagenetPath.IMAGES_EXT + DEFAULT_IMAGE_EXT = ImagenetPath.DEFAULT_IMAGE_EXT def apply(self): if 1 < len(self._extractor.subsets()): @@ -79,12 +83,10 @@ def apply(self): for label in labels[image_name]: label_name = extractor.categories()[AnnotationType.label][label].name self._save_image(item, osp.join(subset_dir, label_name, - '%s_%s%s' % \ - (label_name, image_name, ImagenetPath.IMAGES_EXT) - )) + '%s_%s' % (label_name, self._make_image_filename(item)))) if not labels[image_name]: self._save_image(item, osp.join(subset_dir, ImagenetPath.IMAGES_DIR_NO_LABEL, - ImagenetPath.IMAGES_DIR_NO_LABEL + '_' + - image_name + ImagenetPath.IMAGES_EXT)) + ImagenetPath.IMAGES_DIR_NO_LABEL + '_' + + self._make_image_filename(item))) diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py index 00ee4ae789..36ee68a7c5 100644 --- a/datumaro/plugins/imagenet_txt_format.py +++ b/datumaro/plugins/imagenet_txt_format.py @@ -14,6 +14,8 @@ class ImagenetTxtPath: + DEFAULT_IMAGE_EXT = '.jpg' + IMAGE_EXT_FORMAT = ['.jpg', '.jpeg', '.png', 
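To illustrate the updated `ImageDirExtractor` above, which now filters files by extension and limits the search depth instead of force-loading every file, a small sketch (the directory path is a placeholder):

```python
from datumaro.components.dataset import Dataset
from datumaro.plugins.image_dir import ImageDirExtractor

# only files with a known image extension are collected,
# descending at most 2 directory levels below the root
extractor = ImageDirExtractor('datasets/raw_images', max_depth=2)
dataset = Dataset.from_extractors(extractor)
for item in dataset:
    print(item.id, item.image.path)
```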
'.ppm', '.bmp', '.pgm', '.tif'] LABELS_FILE = 'synsets.txt' IMAGE_DIR = 'images' @@ -56,26 +58,27 @@ def _load_items(self, path): label < len(self._categories[AnnotationType.label]), \ "Image '%s': unknown label id '%s'" % (item_id, label) anno.append(Label(label)) + image_path = osp.join(self.image_dir, item_id + + ImagenetTxtPath.DEFAULT_IMAGE_EXT) + for path in glob(osp.join(self.image_dir, item_id + '*')): + if osp.splitext(path)[1] in ImagenetTxtPath.IMAGE_EXT_FORMAT: + image_path = path + break items[item_id] = DatasetItem(id=item_id, subset=self._subset, - image=osp.join(self.image_dir, item_id + '.jpg'), - annotations=anno) + image=image_path, annotations=anno) return items class ImagenetTxtImporter(Importer): @classmethod def find_sources(cls, path): - subset_paths = [p for p in glob(osp.join(path, '*.txt')) - if osp.basename(p) != ImagenetTxtPath.LABELS_FILE] - sources = [] - for subset_path in subset_paths: - sources += cls._find_sources_recursive( - subset_path, '.txt', 'imagenet_txt') - return sources + return cls._find_sources_recursive(path, '.txt', 'imagenet_txt', + file_filter=lambda p: \ + osp.basename(p) != ImagenetTxtPath.LABELS_FILE) class ImagenetTxtConverter(Converter): - DEFAULT_IMAGE_EXT = '.jpg' + DEFAULT_IMAGE_EXT = ImagenetTxtPath.DEFAULT_IMAGE_EXT def apply(self): subset_dir = self._save_dir diff --git a/datumaro/plugins/mot_format.py b/datumaro/plugins/mot_format.py index ba8c33a5ac..2fbc28001c 100644 --- a/datumaro/plugins/mot_format.py +++ b/datumaro/plugins/mot_format.py @@ -206,7 +206,7 @@ class MotSeqImporter(Importer): @classmethod def find_sources(cls, path): return cls._find_sources_recursive(path, '.txt', 'mot_seq', - filename=osp.join('gt', osp.splitext(MotPath.GT_FILENAME)[0])) + dirname='gt', filename=osp.splitext(MotPath.GT_FILENAME)[0]) class MotSeqGtConverter(Converter): DEFAULT_IMAGE_EXT = MotPath.IMAGE_EXT diff --git a/datumaro/plugins/splitter.py b/datumaro/plugins/splitter.py new file mode 100644 index 0000000000..02a2675124 --- /dev/null +++ b/datumaro/plugins/splitter.py @@ -0,0 +1,582 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import logging as log +import numpy as np + +from datumaro.components.extractor import (Transform, AnnotationType, + DEFAULT_SUBSET_NAME) +from datumaro.components.cli_plugin import CliPlugin + +NEAR_ZERO = 1e-7 + + +class _TaskSpecificSplit(Transform, CliPlugin): + _default_split = [('train', 0.5), ('val', 0.2), ('test', 0.3)] + + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('-s', '--subset', action='append', + type=cls._split_arg, dest='splits', + help="Subsets in the form: ':' " + "(repeatable, default: %s)" % dict(cls._default_split)) + parser.add_argument('--seed', type=int, help="Random seed") + return parser + + @staticmethod + def _split_arg(s): + parts = s.split(':') + if len(parts) != 2: + import argparse + raise argparse.ArgumentTypeError() + return (parts[0], float(parts[1])) + + def __init__(self, dataset, splits, seed): + super().__init__(dataset) + + if splits is None: + splits = self._default_split + + snames, sratio = self._validate_splits(splits) + + self._snames = snames + self._sratio = sratio + + self._seed = seed + + self._subsets = {"train", "val", "test"} # output subset names + self._parts = [] + self._length = "parent" + + self._initialized = False + + def _set_parts(self, by_splits): + self._parts = [] + for subset in self._subsets: + 
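The splitters below are meant to be used as regular transforms; subset ratios are passed as repeatable `name:ratio` pairs, which `_split_arg` turns into `(name, float)` tuples. A hedged CLI sketch — the transform name (`classification_split`) and the `-t` flag of `datum transform` are assumptions here:

``` bash
# split a classification dataset into 50/20/30 subsets inside a project
datum transform -t classification_split -- \
    --subset train:.5 --subset val:.2 --subset test:.3 --seed 42
```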
self._parts.append((set(by_splits[subset]), subset)) + + @staticmethod + def _get_uniq_annotations(dataset): + annotations = [] + for item in dataset: + labels = [a for a in item.annotations + if a.type == AnnotationType.label] + if len(labels) != 1: + raise Exception("Item '%s' contains %s labels, " + "but exactly one is expected" % (item.id, len(labels))) + annotations.append(labels[0]) + return annotations + + @staticmethod + def _validate_splits(splits, valid=None): + snames = [] + ratios = [] + if valid is None: + valid = ["train", "val", "test"] + for subset, ratio in splits: + assert subset in valid, \ + "Subset name must be one of %s, but got %s" % (valid, subset) + assert 0.0 <= ratio and ratio <= 1.0, \ + "Ratio is expected to be in the range " \ + "[0, 1], but got %s for %s" % (ratio, subset) + snames.append(subset) + ratios.append(float(ratio)) + ratios = np.array(ratios) + + total_ratio = np.sum(ratios) + if not abs(total_ratio - 1.0) <= NEAR_ZERO: + raise Exception( + "Sum of ratios is expected to be 1, got %s, which is %s" + % (splits, total_ratio) + ) + return snames, ratios + + @staticmethod + def _get_required(ratio): + min_value = np.max(ratio) + for i in ratio: + if NEAR_ZERO < i and i < min_value: + min_value = i + required = int(np.around(1.0) / min_value) + return required + + @staticmethod + def _get_sections(dataset_size, ratio): + n_splits = [int(np.around(dataset_size * r)) for r in ratio[:-1]] + n_splits.append(dataset_size - np.sum(n_splits)) + + # if there are splits with zero samples even if ratio is not 0, + # borrow one from the split who has one or more. + for ii, num_split in enumerate(n_splits): + if num_split == 0 and NEAR_ZERO < ratio[ii]: + midx = np.argmax(n_splits) + if n_splits[midx] > 0: + n_splits[ii] += 1 + n_splits[midx] -= 1 + sections = np.add.accumulate(n_splits[:-1]) + return sections + + @staticmethod + def _group_by_attr(items): + """ + Args: + items: list of (idx, ann). ann is the annotation from Label object. 
+ Returns: + by_attributes: dict of { combination-of-attrs : list of index } + """ + # group by attributes + by_attributes = dict() + for idx, ann in items: + attributes = tuple(sorted(ann.attributes.items())) + if attributes not in by_attributes: + by_attributes[attributes] = [] + by_attributes[attributes].append(idx) + return by_attributes + + def _split_by_attr(self, datasets, snames, ratio, out_splits, + dataset_key=None): + required = self._get_required(ratio) + if dataset_key is None: + dataset_key = "label" + for key, items in datasets.items(): + np.random.shuffle(items) + by_attributes = self._group_by_attr(items) + for attributes, indice in by_attributes.items(): + gname = "%s: %s, attrs: %s" % (dataset_key, key, attributes) + splits = self._split_indice(indice, gname, ratio, required) + for subset, split in zip(snames, splits): + if 0 < len(split): + out_splits[subset].extend(split) + + def _split_indice(self, indice, group_name, ratio, required): + filtered_size = len(indice) + if filtered_size < required: + log.warning("Not enough samples for a group, '%s'" % group_name) + sections = self._get_sections(filtered_size, ratio) + splits = np.array_split(indice, sections) + return splits + + def _find_split(self, index): + for subset_indices, subset in self._parts: + if index in subset_indices: + return subset + return DEFAULT_SUBSET_NAME # all the possible remainder --> default + + def _split_dataset(self): + raise NotImplementedError() + + def __iter__(self): + # lazy splitting + if self._initialized is False: + self._split_dataset() + self._initialized = True + for i, item in enumerate(self._extractor): + yield self.wrap_item(item, subset=self._find_split(i)) + + +class ClassificationSplit(_TaskSpecificSplit): + """ + Splits dataset into train/val/test set in class-wise manner. |n + Splits dataset images in the specified ratio, keeping the initial class + distribution.|n + |n + Notes:|n + - Each image is expected to have only one Label|n + - If Labels also have attributes, also splits by attribute values.|n + - If there is not enough images in some class or attributes group, + the split ratio can't be guaranteed.|n + |n + Example:|n + |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 + """ + def __init__(self, dataset, splits, seed=None): + """ + Parameters + ---------- + dataset : Dataset + splits : list + A list of (subset(str), ratio(float)) + Subset is expected to be one of ["train", "val", "test"]. + The sum of ratios is expected to be 1. + seed : int, optional + """ + super().__init__(dataset, splits, seed) + + def _split_dataset(self): + np.random.seed(self._seed) + + # support only single label for a DatasetItem + # 1. group by label + by_labels = dict() + annotations = self._get_uniq_annotations(self._extractor) + for idx, ann in enumerate(annotations): + label = getattr(ann, 'label', None) + if label not in by_labels: + by_labels[label] = [] + by_labels[label].append((idx, ann)) + + by_splits = dict() + for subset in self._subsets: + by_splits[subset] = [] + + # 2. group by attributes + self._split_by_attr(by_labels, self._snames, self._sratio, by_splits) + self._set_parts(by_splits) + + +class ReidentificationSplit(_TaskSpecificSplit): + """ + Splits a dataset for re-identification task.|n + Produces a split with a specified ratio of images, avoiding having same + labels in different subsets.|n + |n + In this task, the test set should consist of images of unseen + people or objects during the training phase. 
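A minimal Python sketch of the class-wise splitter defined above; the dataset path and format are placeholders, and each item is expected to carry exactly one `Label`:

```python
from datumaro.components.dataset import Dataset
from datumaro.plugins.splitter import ClassificationSplit

dataset = Dataset.import_from('datasets/my_classification', 'imagenet')
split = ClassificationSplit(dataset,
    splits=[('train', 0.5), ('val', 0.2), ('test', 0.3)], seed=42)
for item in split:
    print(item.id, item.subset)  # 'train', 'val' or 'test'
```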
|n + This function splits a dataset in the following way:|n + 1. Splits the dataset into 'train + val' and 'test' sets|n + |s|sbased on person or object ID.|n + 2. Splits 'test' set into 'test-gallery' and 'test-query' sets|n + |s|sin class-wise manner.|n + 3. Splits the 'train + val' set into 'train' and 'val' sets|n + |s|sin the same way.|n + The final subsets would be + 'train', 'val', 'test-gallery' and 'test-query'. |n + |n + Notes:|n + - Each image is expected to have a single Label|n + - Object ID can be described by Label, or by attribute (--attr parameter)|n + - The splits of the test set are controlled by '--query' parameter. |n + |s|sGallery ratio would be 1.0 - query.|n + |n + Example: split a dataset in the specified ratio, split the test set|n + |s|s|s|sinto gallery and query in 1:1 ratio|n + |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 --query .5|n + Example: use 'person_id' attribute for splitting|n + |s|s%(prog)s --attr person_id + """ + + _default_query_ratio = 0.5 + + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('--query', type=float, + help="Query ratio in the test set (default: %.3f)" + % cls._default_query_ratio) + parser.add_argument('--attr', type=str, dest='attr_for_id', + help="Attribute name representing the ID (default: use label)") + return parser + + def __init__(self, dataset, splits, query=None, + attr_for_id=None, seed=None): + """ + Parameters + ---------- + dataset : Dataset + splits : list + A list of (subset(str), ratio(float)) + Subset is expected to be one of ["train", "val", "test"]. + The sum of ratios is expected to be 1. + query : float + The ratio of 'test-query' set. + The ratio of 'test-gallery' set would be 1.0 - query. + attr_for_id: str + attribute name representing the person/object id. + if this is not specified, label would be used. + seed : int, optional + """ + super().__init__(dataset, splits, seed) + + if query is None: + query = self._default_query_ratio + + assert 0.0 <= query and query <= 1.0, \ + "Query ratio is expected to be in the range " \ + "[0, 1], but got %f" % query + test_splits = [('test-query', query), ('test-gallery', 1.0 - query)] + + # reset output subset names + self._subsets = {"train", "val", "test-gallery", "test-query"} + self._test_splits = test_splits + self._attr_for_id = attr_for_id + + def _split_dataset(self): + np.random.seed(self._seed) + + id_snames, id_ratio = self._snames, self._sratio + + attr_for_id = self._attr_for_id + dataset = self._extractor + + # group by ID(attr_for_id) + by_id = dict() + annotations = self._get_uniq_annotations(dataset) + if attr_for_id is None: # use label + for idx, ann in enumerate(annotations): + ID = getattr(ann, 'label', None) + if ID not in by_id: + by_id[ID] = [] + by_id[ID].append((idx, ann)) + else: # use attr_for_id + for idx, ann in enumerate(annotations): + attributes = dict(ann.attributes.items()) + assert attr_for_id in attributes, \ + "'%s' is expected as an attribute name" % attr_for_id + ID = attributes[attr_for_id] + if ID not in by_id: + by_id[ID] = [] + by_id[ID].append((idx, ann)) + + required = self._get_required(id_ratio) + if len(by_id) < required: + log.warning("There's not enough IDs, which is %s, " + "so train/val/test ratio can't be guaranteed." + % len(by_id) + ) + + # 1. split dataset into trval and test + # IDs in test set should not exist in train/val set. 
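A corresponding sketch for the re-identification splitter described above, grouping by a hypothetical `person_id` attribute (omit `attr_for_id` to group by label instead):

```python
from datumaro.components.dataset import Dataset
from datumaro.plugins.splitter import ReidentificationSplit

dataset = Dataset.import_from('datasets/my_reid', 'imagenet')
split = ReidentificationSplit(dataset,
    splits=[('train', 0.5), ('val', 0.2), ('test', 0.3)],
    query=0.5,                # test set is split into test-query / test-gallery
    attr_for_id='person_id',  # hypothetical attribute holding the person ID
    seed=42)
for item in split:
    print(item.id, item.subset)  # train, val, test-gallery or test-query
```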
+ test = id_ratio[id_snames.index("test")] if "test" in id_snames else 0 + if NEAR_ZERO < test: # has testset + split_ratio = np.array([test, 1.0 - test]) + IDs = list(by_id.keys()) + np.random.shuffle(IDs) + sections = self._get_sections(len(IDs), split_ratio) + splits = np.array_split(IDs, sections) + testset = {pid: by_id[pid] for pid in splits[0]} + trval = {pid: by_id[pid] for pid in splits[1]} + + # follow the ratio of datasetitems as possible. + # naive heuristic: exchange the best item one by one. + expected_count = int(len(self._extractor) * split_ratio[0]) + testset_total = int(np.sum([len(v) for v in testset.values()])) + self._rebalancing(testset, trval, expected_count, testset_total) + else: + testset = dict() + trval = by_id + + by_splits = dict() + for subset in self._subsets: + by_splits[subset] = [] + + # 2. split 'test' into 'test-gallery' and 'test-query' + if 0 < len(testset): + test_snames = [] + test_ratio = [] + for sname, ratio in self._test_splits: + test_snames.append(sname) + test_ratio.append(float(ratio)) + + self._split_by_attr(testset, test_snames, test_ratio, by_splits, + dataset_key=attr_for_id) + + # 3. split 'trval' into 'train' and 'val' + trval_snames = ["train", "val"] + trval_ratio = [] + for subset in trval_snames: + if subset in id_snames: + val = id_ratio[id_snames.index(subset)] + else: + val = 0.0 + trval_ratio.append(val) + trval_ratio = np.array(trval_ratio) + total_ratio = np.sum(trval_ratio) + if total_ratio < NEAR_ZERO: + trval_splits = list(zip(["train", "val"], trval_ratio)) + log.warning("Sum of ratios is expected to be positive, " + "got %s, which is %s" + % (trval_splits, total_ratio) + ) + else: + trval_ratio /= total_ratio # normalize + self._split_by_attr(trval, trval_snames, trval_ratio, by_splits, + dataset_key=attr_for_id) + + self._set_parts(by_splits) + + @staticmethod + def _rebalancing(test, trval, expected_count, testset_total): + diffs = dict() + for id_test, items_test in test.items(): + count_test = len(items_test) + for id_trval, items_trval in trval.items(): + count_trval = len(items_trval) + diff = count_trval - count_test + if diff == 0: + continue # exchange has no effect + if diff not in diffs: + diffs[diff] = [(id_test, id_trval)] + else: + diffs[diff].append((id_test, id_trval)) + if len(diffs) == 0: # nothing would be changed by exchange + return + + exchanges = [] + while True: + target_diff = expected_count - testset_total + # find nearest diff. 
+ keys = np.array(list(diffs.keys())) + idx = (np.abs(keys - target_diff)).argmin() + nearest = keys[idx] + if abs(target_diff) <= abs(target_diff - nearest): + break + choice = np.random.choice(range(len(diffs[nearest]))) + id_test, id_trval = diffs[nearest][choice] + testset_total += nearest + new_diffs = dict() + for diff, IDs in diffs.items(): + new_list = [] + for id1, id2 in IDs: + if id1 == id_test or id2 == id_trval: + continue + new_list.append((id1, id2)) + if 0 < len(new_list): + new_diffs[diff] = new_list + diffs = new_diffs + exchanges.append((id_test, id_trval)) + + # exchange + for id_test, id_trval in exchanges: + test[id_trval] = trval.pop(id_trval) + trval[id_test] = test.pop(id_test) + + +class DetectionSplit(_TaskSpecificSplit): + """ + Splits a dataset into train/val/test subsets for detection task, + using object annotations as a basis for splitting.|n + Tries to produce an image split with the specified ratio, keeping the + initial distribution of class objects.|n + |n + In a detection dataset, each image can have multiple object annotations - + instance bounding boxes. Since an image shouldn't be included + in multiple subsets at the same time, and image annotations + shoudln't be split, in general, dataset annotations are unlikely to be split + exactly in the specified ratio. |n + This split tries to split dataset images as close as possible + to the specified ratio, keeping the initial class distribution.|n + |n + Notes:|n + - Each image is expected to have one or more Bbox annotations.|n + - Only Bbox annotations are considered.|n + |n + Example: split dataset so that each object class annotations were split|n + |s|s|s|sin the specified ratio between subsets|n + |s|s%(prog)s --subset train:.5 --subset val:.2 --subset test:.3 + """ + def __init__(self, dataset, splits, seed=None): + """ + Parameters + ---------- + dataset : Dataset + splits : list + A list of (subset(str), ratio(float)) + Subset is expected to be one of ["train", "val", "test"]. + The sum of ratios is expected to be 1. + seed : int, optional + """ + super().__init__(dataset, splits, seed) + + @staticmethod + def _group_by_bbox_labels(dataset): + by_labels = dict() + for idx, item in enumerate(dataset): + bbox_anns = [a for a in item.annotations + if a.type == AnnotationType.bbox] + assert 0 < len(bbox_anns), \ + "Expected more than one bbox annotation in the dataset" + for ann in bbox_anns: + label = getattr(ann, 'label', None) + if label not in by_labels: + by_labels[label] = [(idx, ann)] + else: + by_labels[label].append((idx, ann)) + return by_labels + + def _split_dataset(self): + np.random.seed(self._seed) + + subsets, sratio = self._snames, self._sratio + + # 1. group by bbox label + by_labels = self._group_by_bbox_labels(self._extractor) + + # 2. group by attributes + by_combinations = dict() + for label, items in by_labels.items(): + by_attributes = self._group_by_attr(items) + for attributes, indice in by_attributes.items(): + gname = "label: %s, attributes: %s" % (label, attributes) + by_combinations[gname] = indice + + # total number of GT samples per label-attr combinations + n_combs = {k: len(v) for k, v in by_combinations.items()} + + # 3-1. 
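And a sketch for the detection-task splitter described above (format and path are placeholders; every item is expected to have at least one `Bbox`):

```python
from datumaro.components.dataset import Dataset
from datumaro.plugins.splitter import DetectionSplit

dataset = Dataset.import_from('datasets/my_detection', 'voc_detection')
split = DetectionSplit(dataset,
    splits=[('train', 0.5), ('val', 0.2), ('test', 0.3)], seed=42)
for item in split:
    print(item.id, item.subset)
```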
initially count per-image GT samples + scores_all = {} + init_scores = {} + for idx, _ in enumerate(self._extractor): + counts = {k: v.count(idx) for k, v in by_combinations.items()} + scores_all[idx] = counts + init_scores[idx] = np.sum( + [v / n_combs[k] for k, v in counts.items()] + ) + + by_splits = dict() + for sname in self._subsets: + by_splits[sname] = [] + + total = len(self._extractor) + target_size = dict() + expected = [] # expected numbers of per split GT samples + for sname, ratio in zip(subsets, sratio): + target_size[sname] = total * ratio + expected.append( + (sname, {k: v * ratio for k, v in n_combs.items()}) + ) + + # functions for keep the # of annotations not exceed the expected num + def compute_penalty(counts, n_combs): + p = 0 + for k, v in counts.items(): + p += max(0, (v / n_combs[k]) - 1.0) + return p + + def update_nc(counts, n_combs): + for k, v in counts.items(): + n_combs[k] = max(0, n_combs[k] - v) + if n_combs[k] == 0: + n_combs[k] = -1 + return n_combs + + # 3-2. assign each DatasetItem to a split, one by one + for idx, _ in sorted( + init_scores.items(), key=lambda item: item[1], reverse=True + ): + counts = scores_all[idx] + + # shuffling split order to add randomness + # when two or more splits have the same penalty value + np.random.shuffle(expected) + + pp = [] + for sname, nc in expected: + if target_size[sname] <= len(by_splits[sname]): + # the split has enough images, + # stop adding more images to this split + pp.append(1e08) + else: + # compute penalty based on the number of GT samples + # added in the split + pp.append(compute_penalty(counts, nc)) + + # we push an image to a split with the minimum penalty + midx = np.argmin(pp) + + sname, nc = expected[midx] + by_splits[sname].append(idx) + update_nc(counts, nc) + + self._set_parts(by_splits) diff --git a/datumaro/plugins/transforms.py b/datumaro/plugins/transforms.py index f50afae070..c5030251f2 100644 --- a/datumaro/plugins/transforms.py +++ b/datumaro/plugins/transforms.py @@ -12,7 +12,7 @@ import pycocotools.mask as mask_utils from datumaro.components.extractor import (Transform, AnnotationType, - RleMask, Polygon, Bbox, DEFAULT_SUBSET_NAME, + RleMask, Polygon, Bbox, Label, DEFAULT_SUBSET_NAME, LabelCategories, MaskCategories, PointsCategories ) from datumaro.components.cli_plugin import CliPlugin @@ -541,4 +541,19 @@ def transform_item(self, item): annotations.append(ann.wrap(label=conv_label)) else: annotations.append(ann.wrap()) + return item.wrap(annotations=annotations) + +class AnnsToLabels(Transform, CliPlugin): + """ + Collects all labels from annotations (of all types) and + transforms them into a set of annotations of type Label + """ + + def transform_item(self, item): + labels = set(p.label for p in item.annotations + if getattr(p, 'label') != None) + annotations = [] + for label in labels: + annotations.append(Label(label=label)) + return item.wrap(annotations=annotations) \ No newline at end of file diff --git a/datumaro/plugins/vgg_face2_format.py b/datumaro/plugins/vgg_face2_format.py new file mode 100644 index 0000000000..bd7a818be2 --- /dev/null +++ b/datumaro/plugins/vgg_face2_format.py @@ -0,0 +1,221 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import csv +import os +import os.path as osp + +from datumaro.components.converter import Converter +from datumaro.components.extractor import (AnnotationType, Bbox, DatasetItem, + Importer, Label, LabelCategories, Points, SourceExtractor) + + +class VggFace2Path: + ANNOTATION_DIR = "bb_landmark" 
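A small sketch of the new `AnnsToLabels` transform above, which reduces every item's annotations to a set of `Label`s; the registry name `anns_to_labels` is assumed to be derived from the class name:

```python
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import Bbox, DatasetItem

dataset = Dataset.from_iterable([
    DatasetItem(id='1', annotations=[
        Bbox(1, 2, 3, 4, label=0),
        Bbox(5, 6, 2, 2, label=1),
    ]),
], categories=['cat', 'dog'])

dataset = dataset.transform('anns_to_labels')
# item '1' now carries Label(label=0) and Label(label=1) instead of the boxes
```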
+ IMAGE_EXT = '.jpg' + BBOXES_FILE = 'loose_bb_' + LANDMARKS_FILE = 'loose_landmark_' + LABELS_FILE = 'labels.txt' + IMAGES_DIR_NO_LABEL = 'no_label' + +class VggFace2Extractor(SourceExtractor): + def __init__(self, path): + if not osp.isfile(path): + raise Exception("Can't read .csv annotation file '%s'" % path) + self._path = path + self._dataset_dir = osp.dirname(osp.dirname(path)) + + subset = osp.splitext(osp.basename(path))[0] + if subset.startswith(VggFace2Path.LANDMARKS_FILE): + subset = subset.split('_')[2] + super().__init__(subset=subset) + + self._categories = self._load_categories() + self._items = list(self._load_items(path).values()) + + def _load_categories(self): + label_cat = LabelCategories() + path = osp.join(self._dataset_dir, VggFace2Path.LABELS_FILE) + if osp.isfile(path): + with open(path, encoding='utf-8') as labels_file: + lines = [s.strip() for s in labels_file] + for line in lines: + objects = line.split() + label = objects[0] + class_name = None + if 1 < len(objects): + class_name = objects[1] + label_cat.add(label, parent=class_name) + else: + subset_path = osp.join(self._dataset_dir, self._subset) + if osp.isdir(subset_path): + for images_dir in sorted(os.listdir(subset_path)): + if osp.isdir(osp.join(subset_path, images_dir)) and \ + images_dir != VggFace2Path.IMAGES_DIR_NO_LABEL: + label_cat.add(images_dir) + return { AnnotationType.label: label_cat } + + def _load_items(self, path): + items = {} + with open(path) as content: + landmarks_table = list(csv.DictReader(content)) + + for row in landmarks_table: + item_id = row['NAME_ID'] + label = None + if '/' in item_id: + label_name = item_id.split('/')[0] + if label_name != VggFace2Path.IMAGES_DIR_NO_LABEL: + label = \ + self._categories[AnnotationType.label].find(label_name)[0] + item_id = item_id[len(label_name) + 1:] + if item_id not in items: + image_path = osp.join(self._dataset_dir, self._subset, + row['NAME_ID'] + VggFace2Path.IMAGE_EXT) + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=image_path) + annotations = items[item_id].annotations + if len([p for p in row if row[p] == '']) == 0 and len(row) == 11: + annotations.append(Points( + [float(row[p]) for p in row if p != 'NAME_ID'], label=label, + group=1)) + elif label is not None: + annotations.append(Label(label=label, group=1)) + + bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR, + VggFace2Path.BBOXES_FILE + self._subset + '.csv') + if osp.isfile(bboxes_path): + with open(bboxes_path) as content: + bboxes_table = list(csv.DictReader(content)) + for row in bboxes_table: + item_id = row['NAME_ID'] + label = None + if '/' in item_id: + label_name = item_id.split('/')[0] + if label_name != VggFace2Path.IMAGES_DIR_NO_LABEL: + label = \ + self._categories[AnnotationType.label].find(label_name)[0] + item_id = item_id[len(label_name) + 1:] + if item_id not in items: + image_path = osp.join(self._dataset_dir, self._subset, + row['NAME_ID'] + VggFace2Path.IMAGE_EXT) + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=image_path) + annotations = items[item_id].annotations + if len([p for p in row if row[p] == '']) == 0 and len(row) == 5: + annotations.append(Bbox(float(row['X']), float(row['Y']), + float(row['W']), float(row['H']), label=label, group=1)) + return items + +class VggFace2Importer(Importer): + @classmethod + def find_sources(cls, path): + return cls._find_sources_recursive(path, '.csv', 'vgg_face2', + dirname=VggFace2Path.ANNOTATION_DIR, + file_filter=lambda p: \ + not 
osp.basename(p).startswith(VggFace2Path.BBOXES_FILE)) + +class VggFace2Converter(Converter): + DEFAULT_IMAGE_EXT = '.jpg' + + def apply(self): + save_dir = self._save_dir + os.makedirs(save_dir, exist_ok=True) + + labels_path = osp.join(save_dir, VggFace2Path.LABELS_FILE) + labels_file = '' + for label in self._extractor.categories()[AnnotationType.label]: + labels_file += '%s' % label.name + if label.parent: + labels_file += ' %s' % label.parent + labels_file += '\n' + with open(labels_path, 'w', encoding='utf-8') as f: + f.write(labels_file) + + label_categories = self._extractor.categories()[AnnotationType.label] + + for subset_name, subset in self._extractor.subsets().items(): + subset_dir = osp.join(save_dir, subset_name) + bboxes_table = [] + landmarks_table = [] + for item in subset: + if item.has_image and self._save_images: + labels = set(p.label for p in item.annotations + if getattr(p, 'label') != None) + if labels: + for label in labels: + self._save_image(item, osp.join(subset_dir, + label_categories[label].name + '/' \ + + item.id + VggFace2Path.IMAGE_EXT)) + else: + self._save_image(item, osp.join(subset_dir, + VggFace2Path.IMAGES_DIR_NO_LABEL, + item.id + VggFace2Path.IMAGE_EXT)) + + landmarks = [a for a in item.annotations + if a.type == AnnotationType.points] + for landmark in landmarks: + if landmark.label is not None and \ + label_categories[landmark.label].name: + name_id = label_categories[landmark.label].name \ + + '/' + item.id + else: + name_id = VggFace2Path.IMAGES_DIR_NO_LABEL \ + + '/' + item.id + points = landmark.points + landmarks_table.append({'NAME_ID': name_id, + 'P1X': points[0], 'P1Y': points[1], + 'P2X': points[2], 'P2Y': points[3], + 'P3X': points[4], 'P3Y': points[5], + 'P4X': points[6], 'P4Y': points[7], + 'P5X': points[8], 'P5Y': points[9]}) + + bboxes = [a for a in item.annotations + if a.type == AnnotationType.bbox] + for bbox in bboxes: + if bbox.label is not None and \ + label_categories[bbox.label].name: + name_id = label_categories[bbox.label].name \ + + '/' + item.id + else: + name_id = VggFace2Path.IMAGES_DIR_NO_LABEL \ + + '/' + item.id + bboxes_table.append({'NAME_ID': name_id, 'X': bbox.x, + 'Y': bbox.y, 'W': bbox.w, 'H': bbox.h}) + + labels = [a for a in item.annotations + if a.type == AnnotationType.label] + for label in labels: + if label.label is not None and \ + label_categories[label.label].name: + name_id = label_categories[label.label].name \ + + '/' + item.id + else: + name_id = VggFace2Path.IMAGES_DIR_NO_LABEL \ + + '/' + item.id + landmarks_table.append({'NAME_ID': name_id}) + + if not landmarks and not bboxes and not labels: + landmarks_table.append({'NAME_ID': + VggFace2Path.IMAGES_DIR_NO_LABEL + '/' + item.id}) + + landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, + VggFace2Path.LANDMARKS_FILE + subset_name + '.csv') + os.makedirs(osp.dirname(landmarks_path), exist_ok=True) + with open(landmarks_path, 'w', newline='') as file: + columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y', + 'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y'] + writer = csv.DictWriter(file, fieldnames=columns) + writer.writeheader() + writer.writerows(landmarks_table) + + if bboxes_table: + bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, + VggFace2Path.BBOXES_FILE + subset_name + '.csv') + os.makedirs(osp.dirname(bboxes_path), exist_ok=True) + with open(bboxes_path, 'w', newline='') as file: + columns = ['NAME_ID', 'X', 'Y', 'W', 'H'] + writer = csv.DictWriter(file, fieldnames=columns) + writer.writeheader() + 
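A round-trip sketch for the new VGG Face2 plugin (directory names are placeholders; the importer expects the `bb_landmark` annotation files next to the image folders):

```python
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from('datasets/vgg_face2', 'vgg_face2')
# ... filter, transform, or merge with other data ...
dataset.export('out/vgg_face2', 'vgg_face2', save_images=True)
```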
writer.writerows(bboxes_table) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index c88e15d7ba..9de4e1e592 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -13,7 +13,7 @@ from lxml import etree as ET from datumaro.components.converter import Converter -from datumaro.components.extractor import (DEFAULT_SUBSET_NAME, AnnotationType, +from datumaro.components.extractor import (AnnotationType, CompiledMask, LabelCategories) from datumaro.util import find, str_to_bool from datumaro.util.image import save_image @@ -104,6 +104,7 @@ def __init__(self, extractor, save_dir, if label_map is None: label_map = LabelmapType.source.name + assert isinstance(label_map, (str, dict)), label_map self._load_categories(label_map) def apply(self): @@ -469,9 +470,9 @@ def _load_categories(self, label_map_source): label_map = parse_label_map(label_map_source) else: - raise Exception("Wrong labelmap specified, " + raise Exception("Wrong labelmap specified: '%s', " "expected one of %s or a file path" % \ - ', '.join(t.name for t in LabelmapType)) + (label_map_source, ', '.join(t.name for t in LabelmapType))) # There must always be a label with color (0, 0, 0) at index 0 bg_label = find(label_map.items(), lambda x: x[1][0] == (0, 0, 0)) diff --git a/datumaro/plugins/widerface_format.py b/datumaro/plugins/widerface_format.py new file mode 100644 index 0000000000..58c25540b1 --- /dev/null +++ b/datumaro/plugins/widerface_format.py @@ -0,0 +1,175 @@ + +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import os +import os.path as osp +import re + +from datumaro.components.converter import Converter +from datumaro.components.extractor import (AnnotationType, Bbox, DatasetItem, + Importer, Label, LabelCategories, SourceExtractor) + + +class WiderFacePath: + IMAGE_EXT = '.jpg' + ANNOTATIONS_DIR = 'wider_face_split' + IMAGES_DIR = 'images' + SUBSET_DIR = 'WIDER_' + LABELS_FILE = 'labels.txt' + IMAGES_DIR_NO_LABEL = 'no_label' + BBOX_ATTRIBUTES = ['blur', 'expression', 'illumination', + 'occluded', 'pose', 'invalid'] + +class WiderFaceExtractor(SourceExtractor): + def __init__(self, path): + if not osp.isfile(path): + raise Exception("Can't read annotation file '%s'" % path) + self._path = path + self._dataset_dir = osp.dirname(osp.dirname(path)) + + subset = osp.splitext(osp.basename(path))[0] + match = re.fullmatch(r'wider_face_\S+_bbx_gt', subset) + if match: + subset = subset.split('_')[2] + super().__init__(subset=subset) + + self._categories = self._load_categories() + self._items = list(self._load_items(path).values()) + + def _load_categories(self): + self._categories[AnnotationType.label] = LabelCategories() + label_cat = LabelCategories() + path = osp.join(self._dataset_dir, WiderFacePath.LABELS_FILE) + if osp.isfile(path): + with open(path, encoding='utf-8') as labels_file: + labels = [s.strip() for s in labels_file] + for label in labels: + label_cat.add(label) + else: + subset_path = osp.join(self._dataset_dir, + WiderFacePath.SUBSET_DIR + self._subset, + WiderFacePath.IMAGES_DIR) + if osp.isdir(subset_path): + for images_dir in sorted(os.listdir(subset_path)): + if osp.isdir(osp.join(subset_path, images_dir)) and \ + images_dir != WiderFacePath.IMAGES_DIR_NO_LABEL: + if '--' in images_dir: + images_dir = images_dir.split('--')[1] + label_cat.add(images_dir) + return { AnnotationType.label: label_cat } + + def _load_items(self, path): + items = {} + with open(path, 'r') as f: + 
lines = f.readlines() + + image_ids = [image_id for image_id, line in enumerate(lines) + if WiderFacePath.IMAGE_EXT in line] + + for image_id in image_ids: + image = lines[image_id] + image_path = osp.join(self._dataset_dir, WiderFacePath.SUBSET_DIR + + self._subset, WiderFacePath.IMAGES_DIR, image[:-1]) + item_id = image[:-(len(WiderFacePath.IMAGE_EXT) + 1)] + annotations = [] + if '/' in item_id: + label_name = item_id.split('/')[0] + if '--' in label_name: + label_name = label_name.split('--')[1] + if label_name != WiderFacePath.IMAGES_DIR_NO_LABEL: + label = \ + self._categories[AnnotationType.label].find(label_name)[0] + annotations.append(Label(label=label)) + item_id = item_id[len(item_id.split('/')[0]) + 1:] + + bbox_count = lines[image_id + 1] + bbox_lines = lines[image_id + 2 : image_id + int(bbox_count) + 2] + for bbox in bbox_lines: + bbox_list = bbox.split() + if len(bbox_list) >= 4: + attributes = {} + if len(bbox_list) == 10: + i = 4 + for attr in WiderFacePath.BBOX_ATTRIBUTES: + if bbox_list[i] != '-': + attributes[attr] = int(bbox_list[i]) + i += 1 + annotations.append(Bbox( + float(bbox_list[0]), float(bbox_list[1]), + float(bbox_list[2]), float(bbox_list[3]), + attributes = attributes + )) + + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=image_path, annotations=annotations) + return items + +class WiderFaceImporter(Importer): + @classmethod + def find_sources(cls, path): + return cls._find_sources_recursive(path, '.txt', 'wider_face', + dirname=WiderFacePath.ANNOTATIONS_DIR) + +class WiderFaceConverter(Converter): + DEFAULT_IMAGE_EXT = '.jpg' + + def apply(self): + save_dir = self._save_dir + os.makedirs(save_dir, exist_ok=True) + + label_categories = self._extractor.categories()[AnnotationType.label] + + labels_path = osp.join(save_dir, WiderFacePath.LABELS_FILE) + with open(labels_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(label.name for label in label_categories)) + + for subset_name, subset in self._extractor.subsets().items(): + subset_dir = osp.join(save_dir, WiderFacePath.SUBSET_DIR + subset_name) + + wider_annotation = '' + for item in subset: + labels = [a.label for a in item.annotations + if a.type == AnnotationType.label] + if labels: + wider_annotation += '%s\n' % (str(labels[0]) + '--' \ + + label_categories[labels[0]].name + '/' \ + + item.id + WiderFacePath.IMAGE_EXT) + if item.has_image and self._save_images: + self._save_image(item, osp.join(save_dir, subset_dir, + WiderFacePath.IMAGES_DIR, str(labels[0]) + '--' \ + + label_categories[labels[0]].name + '/' + item.id \ + + WiderFacePath.IMAGE_EXT)) + else: + wider_annotation += '%s\n' % (WiderFacePath.IMAGES_DIR_NO_LABEL \ + + '/' + item.id + WiderFacePath.IMAGE_EXT) + if item.has_image and self._save_images: + self._save_image(item, osp.join(save_dir, subset_dir, + WiderFacePath.IMAGES_DIR, WiderFacePath.IMAGES_DIR_NO_LABEL \ + + '/' + item.id + WiderFacePath.IMAGE_EXT)) + + bboxes = [a for a in item.annotations + if a.type == AnnotationType.bbox] + + wider_annotation += '%s\n' % len(bboxes) + for bbox in bboxes: + wider_bb = ' '.join('%d' % p for p in bbox.get_bbox()) + wider_annotation += '%s ' % wider_bb + if bbox.attributes: + wider_attr = '' + attr_counter = 0 + for attr in WiderFacePath.BBOX_ATTRIBUTES: + if attr in bbox.attributes: + wider_attr += '%s ' % bbox.attributes[attr] + attr_counter += 1 + else: + wider_attr += '- ' + if attr_counter > 0: + wider_annotation += wider_attr + wider_annotation += '\n' + annotation_path = osp.join(save_dir, 
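Similarly, a minimal sketch for the new WIDER Face plugin (the dataset root is a placeholder and should contain the `wider_face_split` annotation directory):

```python
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from('datasets/wider_face', 'wider_face')
dataset.export('out/wider_face', 'wider_face', save_images=True)
```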
WiderFacePath.ANNOTATIONS_DIR, + 'wider_face_' + subset_name + '_bbx_gt.txt') + os.makedirs(osp.dirname(annotation_path), exist_ok=True) + with open(annotation_path, 'w') as f: + f.write(wider_annotation) diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py index be9b2ba807..54774f08cb 100644 --- a/datumaro/plugins/yolo_format/extractor.py +++ b/datumaro/plugins/yolo_format/extractor.py @@ -106,7 +106,7 @@ def __init__(self, config_path, image_info=None): @staticmethod def localize_path(path): - path = path.strip() + path = osp.normpath(path).strip() default_base = osp.join('data', '') if path.startswith(default_base): # default path path = path[len(default_base) : ] diff --git a/datumaro/util/__init__.py b/datumaro/util/__init__.py index 0a75756bd2..b7e56890ae 100644 --- a/datumaro/util/__init__.py +++ b/datumaro/util/__init__.py @@ -3,8 +3,11 @@ # # SPDX-License-Identifier: MIT +import attr import os import os.path as osp +from contextlib import ExitStack +from functools import partial, wraps from itertools import islice @@ -90,4 +93,105 @@ def str_to_bool(s): raise ValueError("Can't convert value '%s' to bool" % s) def filter_dict(d, exclude_keys): - return { k: v for k, v in d.items() if k not in exclude_keys } \ No newline at end of file + return { k: v for k, v in d.items() if k not in exclude_keys } + +def optional_arg_decorator(fn): + @wraps(fn) + def wrapped_decorator(*args, **kwargs): + if len(args) == 1 and callable(args[0]) and not kwargs: + return fn(args[0], **kwargs) + + else: + def real_decorator(decoratee): + return fn(decoratee, *args, **kwargs) + + return real_decorator + + return wrapped_decorator + +class Rollback: + @attr.attrs + class Handler: + callback = attr.attrib() + enabled = attr.attrib(default=True) + ignore_errors = attr.attrib(default=False) + + def __call__(self): + if self.enabled: + try: + self.callback() + except: # pylint: disable=bare-except + if not self.ignore_errors: + raise + + def __init__(self): + self._handlers = {} + self._stack = ExitStack() + self.enabled = True + + def add(self, callback, *args, + name=None, enabled=True, ignore_errors=False, + fwd_kwargs=None, **kwargs): + if args or kwargs or fwd_kwargs: + if fwd_kwargs: + kwargs.update(fwd_kwargs) + callback = partial(callback, *args, **kwargs) + name = name or hash(callback) + assert name not in self._handlers + handler = self.Handler(callback, + enabled=enabled, ignore_errors=ignore_errors) + self._handlers[name] = handler + self._stack.callback(handler) + return name + + do = add # readability alias + + def enable(self, name=None): + if name: + self._handlers[name].enabled = True + else: + self.enabled = True + + def disable(self, name=None): + if name: + self._handlers[name].enabled = False + else: + self.enabled = False + + def clean(self): + self.__exit__(None, None, None) + + def __enter__(self): + return self + + # pylint: disable=redefined-builtin + def __exit__(self, type=None, value=None, traceback=None): + if type is None: + return + if not self.enabled: + return + self._stack.__exit__(type, value, traceback) + # pylint: enable=redefined-builtin + +@optional_arg_decorator +def error_rollback(func, arg_name='on_error', implicit=False): + @wraps(func) + def wrapped_func(*args, **kwargs): + with Rollback() as manager: + if implicit: + fglobals = func.__globals__ + + has_arg = arg_name in fglobals + old_val = fglobals.get(arg_name) + fglobals[arg_name] = manager + try: + func(*args, **kwargs) + finally: + if has_arg: + 
func.__globals__[arg_name] = old_val + else: + func.__globals__.pop(arg_name) + else: + kwargs[arg_name] = manager + func(*args, **kwargs) + return wrapped_func diff --git a/datumaro/util/image.py b/datumaro/util/image.py index c653adf687..e2f086e74b 100644 --- a/datumaro/util/image.py +++ b/datumaro/util/image.py @@ -205,6 +205,8 @@ def __init__(self, data=None, path=None, loader=None, cache=None, assert path is None or isinstance(path, str) if path is None: path = '' + elif path: + path = osp.abspath(path) self._path = path assert data is not None or path or loader, "Image can not be empty" diff --git a/datumaro/util/os_util.py b/datumaro/util/os_util.py index b4d05e376d..c090dced14 100644 --- a/datumaro/util/os_util.py +++ b/datumaro/util/os_util.py @@ -1,9 +1,12 @@ - -# Copyright (C) 2020 Intel Corporation +# Copyright (C) 2020-2021 Intel Corporation # # SPDX-License-Identifier: MIT +import importlib +import os +import os.path as osp import subprocess +import sys def check_instruction_set(instruction): @@ -14,4 +17,27 @@ def check_instruction_set(instruction): subprocess.check_output( 'lscpu | grep -o "%s" | head -1' % instruction, shell=True).decode('utf-8') # nosec - ) \ No newline at end of file + ) + +def import_foreign_module(name, path, package=None): + module = None + default_path = sys.path.copy() + try: + sys.path = [ osp.abspath(path), ] + default_path + sys.modules.pop(name, None) # remove from cache + module = importlib.import_module(name, package=package) + sys.modules.pop(name) # remove from cache + except Exception: + raise + finally: + sys.path = default_path + return module + +def walk(path, max_depth=None): + baselevel = path.count(osp.sep) + for dirpath, dirnames, filenames in os.walk(path, topdown=True): + curlevel = dirpath.count(osp.sep) + if baselevel + max_depth <= curlevel: + dirnames.clear() # topdown=True allows to modify the list + + yield dirpath, dirnames, filenames \ No newline at end of file diff --git a/datumaro/util/test_utils.py b/datumaro/util/test_utils.py index 08ffa4ed53..63bd4222c0 100644 --- a/datumaro/util/test_utils.py +++ b/datumaro/util/test_utils.py @@ -10,7 +10,7 @@ import tempfile from datumaro.components.extractor import AnnotationType -from datumaro.components.project import Project +from datumaro.components.dataset import Dataset from datumaro.util import find @@ -132,8 +132,7 @@ def test_save_and_load(test, source_dataset, converter, test_dir, importer, if importer_args is None: importer_args = {} - parsed_dataset = Project.import_from(test_dir, importer, **importer_args) \ - .make_dataset() + parsed_dataset = Dataset.import_from(test_dir, importer, **importer_args) if target_dataset is None: target_dataset = source_dataset diff --git a/datumaro/version.py b/datumaro/version.py index a0901263b2..0fd5744b0f 100644 --- a/datumaro/version.py +++ b/datumaro/version.py @@ -1 +1 @@ -VERSION = '0.1.4' \ No newline at end of file +VERSION = '0.1.5.1' \ No newline at end of file diff --git a/docs/design.md b/docs/design.md index 528b2adf75..1e520400c0 100644 --- a/docs/design.md +++ b/docs/design.md @@ -73,9 +73,11 @@ Datumaro is: ## RC 1 vision -In the first version Datumaro should be a project manager for CVAT. -It should only consume data from CVAT. The collected dataset -can be downloaded by user to be operated on with Datumaro CLI. +*CVAT integration* + +Datumaro needs to be integrated with [CVAT](https://github.com/openvinotoolkit/cvat), +extending CVAT UI capabilities regarding task and project operations. 
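To illustrate the `Rollback`/`error_rollback` helpers added to `datumaro.util` above: registered callbacks fire only when an exception escapes the guarded block, and the decorator injects a fresh manager through the named argument. A minimal sketch:

```python
import os

from datumaro.util import Rollback, error_rollback

# context-manager form: handlers run only if the block raises
try:
    with Rollback() as on_error:
        on_error.do(print, 'rolling back step 1')
        raise RuntimeError('simulated failure')
except RuntimeError:
    pass  # 'rolling back step 1' has already been printed by the handler

# decorator form: a fresh Rollback is passed in as the 'on_error' argument
@error_rollback('on_error')
def make_dir(path, on_error=None):
    os.makedirs(path)
    on_error.do(os.rmdir, path)  # undone if anything below fails
    # ... further steps that may raise ...
```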
+It should be capable of downloading and processing data from CVAT. ``` @@ -94,6 +96,7 @@ can be downloaded by user to be operated on with Datumaro CLI. - [x] Python API for user code - [x] Installation as a package + - [x] Installation with `pip` by name - [x] A command-line tool for dataset manipulations ### Features @@ -106,7 +109,7 @@ can be downloaded by user to be operated on with Datumaro CLI. - [x] YOLO - [x] TF Detection API - [ ] Cityscapes - - [ ] ImageNet + - [x] ImageNet - Dataset visualization (`show`) - [ ] Ability to visualize a dataset @@ -117,7 +120,7 @@ can be downloaded by user to be operated on with Datumaro CLI. - [x] Object counts (detection scenario) - [x] Image-Class distribution (classification scenario) - [x] Pixel-Class distribution (segmentation scenario) - - [ ] Image similarity clusters + - [x] Image similarity clusters - [ ] Custom statistics - Dataset building @@ -164,7 +167,7 @@ can be downloaded by user to be operated on with Datumaro CLI. ### Optional features - Dataset publishing - - [ ] Versioning (for annotations, subsets, sources, etc.) + - [x] Versioning (for annotations, subsets, sources, etc.) - [ ] Blur sensitive areas on images - [ ] Tracking of legal information - [ ] Documentation generation @@ -175,7 +178,7 @@ can be downloaded by user to be operated on with Datumaro CLI. - Dataset and model debugging - [ ] Training visualization - - [ ] Inference explanation (`explain`) + - [x] Inference explanation (`explain`) - [ ] White-box approach ### Properties diff --git a/docs/developer_guide.md b/docs/developer_guide.md index 6317cee999..52cb09e327 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -38,28 +38,27 @@ Datumaro has a number of dataset and annotation features: - various annotation operations ```python -from datumaro.components.project import Environment, Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import Bbox, Polygon, DatasetItem -# Import and save a dataset -env = Environment() -dataset = env.make_importer('voc')('src/dir').make_dataset() -env.converters.get('coco').convert(dataset, save_dir='dst/dir') +# Import and export a dataset +dataset = Dataset.import_from('src/dir', 'voc') +dataset.export('dst/dir', 'coco') # Create a dataset, convert polygons to masks, save in PASCAL VOC format dataset = Dataset.from_iterable([ - DatasetItem(id='image1', annotations=[ - Bbox(x=1, y=2, w=3, h=4, label=1), - Polygon([1, 2, 3, 2, 4, 4], label=2, attributes={'occluded': True}), - ]), + DatasetItem(id='image1', annotations=[ + Bbox(x=1, y=2, w=3, h=4, label=1), + Polygon([1, 2, 3, 2, 4, 4], label=2, attributes={'occluded': True}), + ]), ], categories=['cat', 'dog', 'person']) -dataset = dataset.transform(env.transforms.get('polygons_to_masks')) -env.converters.get('voc').convert(dataset, save_dir='dst/dir') +dataset = dataset.transform('polygons_to_masks') +dataset.export('dst/dir', 'voc') ``` ### The Dataset class -The `Dataset` class from the `datumaro.components.project` module represents +The `Dataset` class from the `datumaro.components.dataset` module represents a dataset, consisting of multiple `DatasetItem`s. Annotations are represented by members of the `datumaro.components.extractor` module, such as `Label`, `Mask` or `Polygon`. A dataset can contain items from one or @@ -80,16 +79,19 @@ The main operation for a dataset is iteration over its elements. An item corresponds to a single image, a video sequence, etc. 
There are also few other operations available, such as filtration (`dataset.select`) and transformations (`dataset.transform`). A dataset can be created from extractors -or other datasets with `dataset.from_extractors` and directly from items with -`dataset.from_iterable`. A dataset is an extractor itself. If it is created from -multiple extractors, their categories must match, and their contents will be -merged. +or other datasets with `Dataset.from_extractors()` and directly from items with +`Dataset.from_iterable()`. A dataset is an extractor itself. If it is created +from multiple extractors, their categories must match, and their contents +will be merged. A dataset item is an element of a dataset. Its `id` is a name of a corresponding image. There can be some image `attributes`, an `image` and `annotations`. ```python +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import Bbox, Polygon, DatasetItem + # create a dataset from other datasets dataset = Dataset.from_extractors(dataset1, dataset2) @@ -105,7 +107,7 @@ dataset = Dataset.from_iterable([ dataset = dataset.select(lambda item: len(item.annotations) != 0) # change dataset labels -dataset = dataset.transform(project.env.transforms.get('remap_labels'), +dataset = dataset.transform('remap_labels', {'cat': 'dog', # rename cat to dog 'truck': 'car', # rename truck to car 'person': '', # remove this label @@ -116,8 +118,7 @@ for item in dataset: print(item.id, item.annotations) # iterate over subsets -for subset_name in dataset.subsets(): - subset = dataset.get_subset(subset_name) # a dataset, again +for subset_name, subset in dataset.subsets().items(): for item in subset: print(item.id, item.annotations) ``` @@ -129,6 +130,7 @@ persistence, of extending, and CLI operation for Datasets. A project can be converted to a Dataset with `project.make_dataset`. Project datasets can have multiple data sources, which are merged on dataset creation. They can have a hierarchy. Project configuration is available in `project.config`. +A dataset can be saved in `datumaro_project` format. The `Environment` class is responsible for accessing built-in and project-specific plugins. For a project, there is an instance of @@ -204,11 +206,12 @@ YoloConverter.convert(dataset, save_dir=dst_dir) ### Writing a plugin -A plugin is a Python module with any name, which exports some symbols. -To export a symbol, inherit it from one of special classes: +A plugin is a Python module with any name, which exports some symbols. Symbols, +starting with `_` are not exported by default. 
To export a symbol, +inherit it from one of the special classes: ```python -from datumaro.components.extractor import Importer, SourceExtractor, Transform +from datumaro.components.extractor import Importer, Extractor, Transform from datumaro.components.launcher import Launcher from datumaro.components.converter import Converter ``` @@ -224,6 +227,19 @@ There is also an additional class to modify plugin appearance in command line: ```python from datumaro.components.cli_plugin import CliPlugin + +class MyPlugin(Converter, CliPlugin): + """ + Optional documentation text, which will appear in command-line help + """ + + NAME = 'optional_custom_plugin_name' + + def build_cmdline_parser(self, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + # set up argparse.ArgumentParser instance + # the parsed args are supposed to be used as invocation options + return parser ``` #### Plugin example @@ -269,13 +285,14 @@ class MyTransform(Transform, CliPlugin): `my_plugin2.py` contents: ```python -from datumaro.components.extractor import SourceExtractor +from datumaro.components.extractor import Extractor class MyFormat: ... -class MyFormatExtractor(SourceExtractor): ... +class _MyFormatConverter(Converter): ... +class MyFormatExtractor(Extractor): ... exports = [MyFormat] # explicit exports declaration -# MyFormatExtractor won't be exported +# MyFormatExtractor and _MyFormatConverter won't be exported ``` ## Command-line diff --git a/docs/user_manual.md b/docs/user_manual.md index 9afd63e74f..675c7c9242 100644 --- a/docs/user_manual.md +++ b/docs/user_manual.md @@ -20,7 +20,7 @@ - [Obtaining project info](#get-project-info) - [Obtaining project statistics](#get-project-statistics) - [Register model](#register-model) - - [Run inference](#run-inference) + - [Run inference](#run-model) - [Run inference explanation](#explain-inference) - [Transform project](#transform-project) - [Extending](#extending) @@ -45,11 +45,15 @@ python -m virtualenv venv Install: ``` bash +# From PyPI: +pip install datumaro + +# From the GitHub repository: pip install 'git+https://github.com/openvinotoolkit/datumaro' ``` > You can change the installation branch with `...@` -> Also note `--force-reinstall` parameter in this case. +> Also use `--force-reinstall` parameter in this case. ## Interfaces @@ -91,6 +95,12 @@ List of supported formats: - TF Detection API (`bboxes`, `masks`) - Format specifications: [bboxes](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md), [masks](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/instance_segmentation.md) - [Dataset example](../tests/assets/tf_detection_api_dataset) +- WIDER Face (`bboxes`) + - [Format specification](http://shuoyang1213.me/WIDERFACE/) + - [Dataset example](../tests/assets/wider_dataset) +- VGGFace2 (`landmarks`, `bboxes`) + - [Format specification](https://github.com/ox-vgg/vgg_face2) + - [Dataset example](../tests/assets/vgg_face2_dataset) - MOT sequences - [Format specification](https://arxiv.org/pdf/1906.04567.pdf) - [Dataset example](../tests/assets/mot_dataset) @@ -176,11 +186,11 @@ Usage: datum convert --help datum convert \ - -i \ - -if \ - -o \ - -f \ - -- [extra parameters for output format] + -i \ + -if \ + -o \ + -f \ + -- [extra parameters for output format] ``` Example: convert a VOC-like dataset to a COCO-like one: @@ -200,21 +210,21 @@ for information on extra format support. 
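The same import can also be done from Python with the `Dataset` API described in the developer guide; a minimal sketch (the directory and format below are placeholders) is:

```python
from datumaro.components.dataset import Dataset

# programmatic counterpart of the CLI import (placeholder path and format)
dataset = Dataset.import_from('/home/coco_dir', 'coco')
print(len(dataset))
```
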
Usage: ``` bash -datum project import --help +datum import --help -datum project import \ - -i \ - -o \ - -f +datum import \ + -i \ + -o \ + -f ``` Example: create a project from COCO-like dataset ``` bash -datum project import \ - -i /home/coco_dir \ - -o /home/project_dir \ - -f coco +datum import \ + -i /home/coco_dir \ + -o /home/project_dir \ + -f coco ``` An _MS COCO_-like dataset should have the following directory structure: @@ -241,16 +251,16 @@ a few options to interact with it. Usage: ``` bash -datum project create --help +datum create --help -datum project create \ - -o +datum create \ + -o ``` Example: create an empty project `my_dataset` ``` bash -datum project create -o my_dataset/ +datum create -o my_dataset/ ``` ### Add and remove data @@ -278,31 +288,32 @@ for information on extra format support. Usage: ``` bash -datum source add --help -datum source remove --help - -datum source add \ - path \ - -p \ - -n - -datum source remove \ - -p \ - -n +datum add --help +datum remove --help + +datum add \ + path \ + -p \ + -f \ + -n + +datum remove \ + -p \ + -n ``` Example: create a project from a bunch of different annotations and images, and generate TFrecord for TF Detection API for model training ``` bash -datum project create +datum create # 'default' is the name of the subset below -datum source add path -f coco_instances -datum source add path -f cvat -datum source add path -f voc_detection -datum source add path -f datumaro -datum source add path -f image_dir -datum project export -f tf_detection_api +datum add path -f coco_instances +datum add path -f cvat +datum add path -f voc_detection +datum add path -f datumaro +datum add path -f image_dir +datum export -f tf_detection_api ``` ### Filter project @@ -325,35 +336,42 @@ returns `annotation` elements (see examples). Usage: ``` bash -datum project filter --help +datum filter --help -datum project filter \ - -p \ - -e '' +datum filter \ + -p \ + -e '' ``` Example: extract a dataset with only images which `width` < `height` +``` bash +datum filter \ + -p test_project \ + -e '/item[image/width < image/height]' +``` + +Example: extract a dataset with only images of subset `train`. 
``` bash datum project filter \ - -p test_project \ - -e '/item[image/width < image/height]' + -p test_project \ + -e '/item[subset="train"]' ``` Example: extract a dataset with only large annotations of class `cat` and any non-`persons` ``` bash -datum project filter \ - -p test_project \ - --mode annotations -e '/item/annotation[(label="cat" and area > 99.5) or label!="person"]' +datum filter \ + -p test_project \ + --mode annotations -e '/item/annotation[(label="cat" and area > 99.5) or label!="person"]' ``` Example: extract a dataset with only occluded annotations, remove empty images ``` bash -datum project filter \ - -p test_project \ - -m i+a -e '/item/annotation[occluded="True"]' +datum filter \ + -p test_project \ + -m i+a -e '/item/annotation[occluded="True"]' ``` Item representations are available with `--dry-run` parameter: @@ -399,22 +417,22 @@ This command updates items in a project from another one Usage: ``` bash -datum project merge --help +datum merge --help -datum project merge \ - -p \ - -o \ - +datum merge \ + -p \ + -o \ + ``` Example: update annotations in the `first_project` with annotations from the `second_project` and save the result as `merged_project` ``` bash -datum project merge \ - -p first_project \ - -o merged_project \ - second_project +datum merge \ + -p first_project \ + -o merged_project \ + second_project ``` ### Merge projects @@ -440,9 +458,9 @@ Example: merge 4 (partially-)intersecting projects, ``` bash datum merge project1/ project2/ project3/ project4/ \ - --quorum 3 \ - -iou 0.6 \ - --groups 'person,hand?,head,foot?' + --quorum 3 \ + -iou 0.6 \ + --groups 'person,hand?,head,foot?' ``` ### Export project @@ -455,23 +473,23 @@ for information on extra format support. Usage: ``` bash -datum project export --help +datum export --help -datum project export \ - -p \ - -o \ - -f \ - -- [additional format parameters] +datum export \ + -p \ + -o \ + -f \ + -- [additional format parameters] ``` Example: save project as VOC-like dataset, include images, convert images to `PNG` ``` bash -datum project export \ - -p test_project \ - -o test_project-export \ - -f voc \ - -- --save-images --image-ext='.png' +datum export \ + -p test_project \ + -o test_project-export \ + -f voc \ + -- --save-images --image-ext='.png' ``` ### Get project info @@ -481,16 +499,16 @@ This command outputs project status information. Usage: ``` bash -datum project info --help +datum info --help -datum project info \ - -p +datum info \ + -p ``` Example: ``` bash -datum project info -p /test_project +datum info -p /test_project Project: name: test_project @@ -525,10 +543,10 @@ This command computes various project statistics, such as: Usage: ``` bash -datum project stats --help +datum stats --help -datum project stats \ - -p +datum stats \ + -p ``` Example: @@ -536,7 +554,7 @@ Example:
``` bash -datum project stats -p /test_project +datum stats -p test_project { "annotations": { @@ -784,7 +802,14 @@ datum project stats -p /test_project "img00054", "img00055", ], - "unannotated images count": 5 + "unannotated images count": 5, + "unique images count": 97, + "repeating images count": 3, + "repeating images": [ + [("img00057", "default"), ("img00058", "default")], + [("img00059", "default"), ("img00060", "default")], + [("img00061", "default"), ("img00062", "default")], + ], } ``` @@ -808,10 +833,10 @@ A model consists of a graph description and weights. There is also a script used to convert model outputs to internal data structures. ``` bash -datum project create +datum create datum model add \ - -n openvino \ - -d -w -i + -n -l open_vino -- \ + -d -w -i ``` Interpretation script for an OpenVINO detection model (`convert.py`): @@ -823,38 +848,38 @@ max_det = 10 conf_thresh = 0.1 def process_outputs(inputs, outputs): - # inputs = model input, array or images, shape = (N, C, H, W) - # outputs = model output, shape = (N, 1, K, 7) - # results = conversion result, [ [ Annotation, ... ], ... ] - results = [] - for input, output in zip(inputs, outputs): - input_height, input_width = input.shape[:2] - detections = output[0] - image_results = [] - for i, det in enumerate(detections): - label = int(det[1]) - conf = det[2] - if conf <= conf_thresh: - continue - - x = max(int(det[3] * input_width), 0) - y = max(int(det[4] * input_height), 0) - w = min(int(det[5] * input_width - x), input_width) - h = min(int(det[6] * input_height - y), input_height) - image_results.append(Bbox(x, y, w, h, - label=label, attributes={'score': conf} )) - - results.append(image_results[:max_det]) - - return results + # inputs = model input, array or images, shape = (N, C, H, W) + # outputs = model output, shape = (N, 1, K, 7) + # results = conversion result, [ [ Annotation, ... ], ... ] + results = [] + for input, output in zip(inputs, outputs): + input_height, input_width = input.shape[:2] + detections = output[0] + image_results = [] + for i, det in enumerate(detections): + label = int(det[1]) + conf = float(det[2]) + if conf <= conf_thresh: + continue + + x = max(int(det[3] * input_width), 0) + y = max(int(det[4] * input_height), 0) + w = min(int(det[5] * input_width - x), input_width) + h = min(int(det[6] * input_height - y), input_height) + image_results.append(Bbox(x, y, w, h, + label=label, attributes={'score': conf} )) + + results.append(image_results[:max_det]) + + return results def get_categories(): - # Optionally, provide output categories - label map etc. - # Example: - label_categories = LabelCategories() - label_categories.add('person') - label_categories.add('car') - return { AnnotationType.label: label_categories } + # Optionally, provide output categories - label map etc. + # Example: + label_categories = LabelCategories() + label_categories.add('person') + label_categories.add('car') + return { AnnotationType.label: label_categories } ``` ### Run model @@ -867,15 +892,15 @@ Usage: datum model run --help datum model run \ - -p \ - -m \ - -o + -p \ + -m \ + -o ``` Example: launch inference on a dataset ``` bash -datum project import <...> +datum import <...> datum model add mymodel <...> datum model run -m mymodel -o inference ``` @@ -887,18 +912,18 @@ specified directory. The current project is considered to be "ground truth". 
``` bash -datum project diff --help +datum diff --help -datum project diff -o +datum diff -o ``` Example: compare a dataset with model inference ``` bash -datum project import <...> +datum import <...> datum model add mymodel <...> -datum project transform <...> -o inference -datum project diff inference -o diff +datum transform <...> -o inference +datum diff inference -o diff ``` ### Explain inference @@ -909,23 +934,23 @@ Usage: datum explain --help datum explain \ - -m \ - -o \ - -t \ - \ - + -m \ + -o \ + -t \ + \ + ``` Example: run inference explanation on a single image with visualization ``` bash -datum project create <...> +datum create <...> datum model add mymodel <...> datum explain \ - -m mymodel \ - -t 'image.png' \ - rise \ - -s 1000 --progressive + -m mymodel \ + -t 'image.png' \ + rise \ + -s 1000 --progressive ``` ### Transform Project @@ -933,36 +958,50 @@ datum explain \ This command allows to modify images or annotations in a project all at once. ``` bash -datum project transform --help +datum transform --help -datum project transform \ - -p \ - -o \ - -t \ - -- [extra transform options] +datum transform \ + -p \ + -o \ + -t \ + -- [extra transform options] ``` Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1 ``` bash -datum project transform -t random_split -- --subset train:.67 --subset test:.33 +datum transform -t random_split -- --subset train:.67 --subset test:.33 +``` + +Example: split a dataset in task-specific manner. Supported tasks are +classification, detection, and re-identification. + +``` bash +datum transform -t classification_split -- \ + --subset train:.5 --subset val:.2 --subset test:.3 + +datum transform -t detection_split -- \ + --subset train:.5 --subset val:.2 --subset test:.3 + +datum transform -t reidentification_split -- \ + --subset train:.5 --subset val:.2 --subset test:.3 --query .5 ``` Example: convert polygons to masks, masks to boxes etc.: ``` bash -datum project transform -t boxes_to_masks -datum project transform -t masks_to_polygons -datum project transform -t polygons_to_masks -datum project transform -t shapes_to_boxes +datum transform -t boxes_to_masks +datum transform -t masks_to_polygons +datum transform -t polygons_to_masks +datum transform -t shapes_to_boxes ``` Example: remap dataset labels, `person` to `car` and `cat` to `dog`, keep `bus`, remove others ``` bash -datum project transform -t remap_labels -- \ - -l person:car -l bus:bus -l cat:dog \ - --default delete +datum transform -t remap_labels -- \ + -l person:car -l bus:bus -l cat:dog \ + --default delete ``` Example: rename dataset items by a regular expression @@ -970,8 +1009,8 @@ Example: rename dataset items by a regular expression - Remove `frame_` from item ids ``` bash -datum project transform -t rename -- -e '|pattern|replacement|' -datum project transform -t rename -- -e '|frame_(\d+)|\\1|' +datum transform -t rename -- -e '|pattern|replacement|' +datum transform -t rename -- -e '|frame_(\d+)|\\1|' ``` ## Extending diff --git a/setup.py b/setup.py index 7e08249839..d1e5ff0152 100644 --- a/setup.py +++ b/setup.py @@ -9,11 +9,15 @@ import re import setuptools +# Snyk scan integration +here = None -def find_version(file_path=None): - if not file_path: - file_path = osp.join(osp.dirname(osp.abspath(__file__)), - 'datumaro', 'version.py') + +def find_version(project_dir=None): + if not project_dir: + project_dir = osp.dirname(osp.abspath(__file__)) + + file_path = osp.join(project_dir, 'datumaro', 'version.py') with open(file_path, 'r') as 
version_file: version_text = version_file.read() @@ -38,7 +42,8 @@ def get_requirements(): 'matplotlib', 'numpy>=1.17.3', 'Pillow', - 'pycocotools', + 'pycocotools; platform_system != "Windows"', + 'pycocotools-windows; platform_system == "Windows"', 'PyYAML', 'scikit-image', 'tensorboardX', @@ -59,20 +64,20 @@ def get_requirements(): setuptools.setup( name="datumaro", - version=find_version(), + version=find_version(here), author="Intel", author_email="maxim.zhiltsov@intel.com", description="Dataset Management Framework (Datumaro)", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/opencv/cvat/datumaro", + url="https://github.com/openvinotoolkit/datumaro", packages=setuptools.find_packages(exclude=['tests*']), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires='>=3.5', + python_requires='>=3.6', install_requires=get_requirements(), extras_require={ 'tf': ['tensorflow'], diff --git a/tests/assets/coco_dataset/coco_captions/annotations/captions_train.json b/tests/assets/coco_dataset/coco_captions/annotations/captions_train.json new file mode 100644 index 0000000000..e360262e0c --- /dev/null +++ b/tests/assets/coco_dataset/coco_captions/annotations/captions_train.json @@ -0,0 +1,54 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [], + "images": [{ + "id": 1, + "width": 0, + "height": 0, + "file_name": "1.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }, { + "id": 2, + "width": 0, + "height": 0, + "file_name": "2.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }], + "annotations": [{ + "id": 1, + "image_id": 1, + "category_id": 0, + "caption": "hello", + "attributes": {} + }, { + "id": 2, + "image_id": 1, + "category_id": 0, + "caption": "world", + "attributes": {} + }, { + "id": 3, + "image_id": 2, + "category_id": 0, + "caption": "test", + "attributes": {} + }] +} \ No newline at end of file diff --git a/tests/assets/coco_dataset/coco_captions/annotations/captions_val.json b/tests/assets/coco_dataset/coco_captions/annotations/captions_val.json new file mode 100644 index 0000000000..47d071a57d --- /dev/null +++ b/tests/assets/coco_dataset/coco_captions/annotations/captions_val.json @@ -0,0 +1,33 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [], + "images": [{ + "id": 1, + "width": 0, + "height": 0, + "file_name": "3.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }], + "annotations": [{ + "id": 1, + "image_id": 1, + "category_id": 0, + "caption": "word", + "attributes": {} + }] +} \ No newline at end of file diff --git a/tests/assets/coco_dataset/coco_image_info/annotations/image_info_default.json b/tests/assets/coco_dataset/coco_image_info/annotations/image_info_default.json new file mode 100644 index 0000000000..f2fc85a73f --- /dev/null +++ b/tests/assets/coco_dataset/coco_image_info/annotations/image_info_default.json @@ -0,0 +1,27 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": 
"", + "year": "" + }, + "categories": [], + "images": [{ + "id": 1, + "width": 15, + "height": 10, + "file_name": "1.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }], + "annotations": [] +} \ No newline at end of file diff --git a/tests/assets/coco_dataset/annotations/instances_val.json b/tests/assets/coco_dataset/coco_instances/annotations/instances_val.json similarity index 92% rename from tests/assets/coco_dataset/annotations/instances_val.json rename to tests/assets/coco_dataset/coco_instances/annotations/instances_val.json index b5d9bd8697..74de288d8e 100644 --- a/tests/assets/coco_dataset/annotations/instances_val.json +++ b/tests/assets/coco_dataset/coco_instances/annotations/instances_val.json @@ -41,7 +41,10 @@ "segmentation": [[0, 0, 1, 0, 1, 2, 0, 2]], "area": 2, "bbox": [0, 0, 1, 2], - "iscrowd": 0 + "iscrowd": 0, + "attributes": { + "x": 1, "y": "hello" + } }, { "id": 2, diff --git a/tests/assets/coco_dataset/images/val/000000000001.jpg b/tests/assets/coco_dataset/coco_instances/images/val/000000000001.jpg similarity index 100% rename from tests/assets/coco_dataset/images/val/000000000001.jpg rename to tests/assets/coco_dataset/coco_instances/images/val/000000000001.jpg diff --git a/tests/assets/coco_dataset/coco_labels/annotations/labels_train.json b/tests/assets/coco_dataset/coco_labels/annotations/labels_train.json new file mode 100644 index 0000000000..1f790645c3 --- /dev/null +++ b/tests/assets/coco_dataset/coco_labels/annotations/labels_train.json @@ -0,0 +1,44 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [{ + "id": 1, + "name": "a", + "supercategory": "" + }, { + "id": 2, + "name": "b", + "supercategory": "" + }], + "images": [{ + "id": 1, + "width": 0, + "height": 0, + "file_name": "1.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }], + "annotations": [{ + "id": 1, + "image_id": 1, + "category_id": 2 + }, { + "id": 2, + "image_id": 1, + "category_id": 1, + "attributes": {} + }] +} \ No newline at end of file diff --git a/tests/assets/coco_dataset/coco_person_keypoints/annotations/person_keypoints_train.json b/tests/assets/coco_dataset/coco_person_keypoints/annotations/person_keypoints_train.json new file mode 100644 index 0000000000..e5c2238d17 --- /dev/null +++ b/tests/assets/coco_dataset/coco_person_keypoints/annotations/person_keypoints_train.json @@ -0,0 +1,87 @@ +{ + "licenses": [{ + "name": "", + "id": 0, + "url": "" + }], + "info": { + "contributor": "", + "date_created": "", + "description": "", + "url": "", + "version": "", + "year": "" + }, + "categories": [{ + "id": 1, + "name": "a", + "supercategory": "", + "keypoints": [], + "skeleton": [ + [0, 1], + [1, 2] + ] + }, { + "id": 2, + "name": "b", + "supercategory": "", + "keypoints": [], + "skeleton": [ + [0, 1], + [1, 2] + ] + }], + "images": [{ + "id": 1, + "width": 5, + "height": 5, + "file_name": "1.jpg", + "license": 0, + "flickr_url": "", + "coco_url": "", + "date_captured": 0 + }], + "annotations": [{ + "id": 3, + "image_id": 1, + "category_id": 1, + "segmentation": [], + "area": 4.0, + "bbox": [0.0, 1.0, 4.0, 1.0], + "iscrowd": 0, + "keypoints": [1, 2, 2, 0, 2, 2, 4, 1, 2], + "num_keypoints": 3 + }, { + "id": 5, + "image_id": 1, + "category_id": 0, + "segmentation": [], + "area": 4.0, + "bbox": [1.0, 2.0, 2.0, 2.0], + "iscrowd": 0, + "keypoints": [0, 0, 0, 1, 2, 1, 3, 4, 
2], + "num_keypoints": 2 + }, { + "id": 1, + "image_id": 1, + "category_id": 2, + "segmentation": [ + [0.0, 0.0, 4.0, 0.0, 4.0, 4.0] + ], + "area": 6.0, + "bbox": [0.0, 0.0, 4.0, 4.0], + "iscrowd": 0, + "keypoints": [0, 0, 0, 0, 2, 1, 4, 1, 2], + "num_keypoints": 2 + }, { + "id": 2, + "image_id": 1, + "category_id": 0, + "segmentation": [], + "area": 4.0, + "bbox": [1.0, 2.0, 2.0, 2.0], + "iscrowd": 0, + "keypoints": [1, 2, 2, 3, 4, 2, 2, 3, 2], + "num_keypoints": 3 + }] +} \ No newline at end of file diff --git a/tests/assets/vgg_face2_dataset/bb_landmark/loose_bb_train.csv b/tests/assets/vgg_face2_dataset/bb_landmark/loose_bb_train.csv new file mode 100644 index 0000000000..365734f280 --- /dev/null +++ b/tests/assets/vgg_face2_dataset/bb_landmark/loose_bb_train.csv @@ -0,0 +1,3 @@ +NAME_ID,X,Y,W,H +n000001/0001_01,2,2,1,2 +n000002/0002_01,1,3,1,1 diff --git a/tests/assets/vgg_face2_dataset/bb_landmark/loose_landmark_train.csv b/tests/assets/vgg_face2_dataset/bb_landmark/loose_landmark_train.csv new file mode 100644 index 0000000000..7ca5c1a3b0 --- /dev/null +++ b/tests/assets/vgg_face2_dataset/bb_landmark/loose_landmark_train.csv @@ -0,0 +1,3 @@ +NAME_ID,P1X,P1Y,P2X,P2Y,P3X,P3Y,P4X,P4Y,P5X,P5Y +n000001/0001_01,2.787,2.898,2.965,2.79,2.8,2.456,2.81,2.32,2.89,2.3 +n000002/0002_01,1.2,3.8,1.8,3.82,1.51,3.634,1.43,3.34,1.65,3.32 diff --git a/tests/assets/vgg_face2_dataset/labels.txt b/tests/assets/vgg_face2_dataset/labels.txt new file mode 100644 index 0000000000..cdd15b2026 --- /dev/null +++ b/tests/assets/vgg_face2_dataset/labels.txt @@ -0,0 +1,2 @@ +n000001 car +n000002 person \ No newline at end of file diff --git a/tests/assets/vgg_face2_dataset/train/n000001/0001_01.jpg b/tests/assets/vgg_face2_dataset/train/n000001/0001_01.jpg new file mode 100644 index 0000000000..8689b95631 Binary files /dev/null and b/tests/assets/vgg_face2_dataset/train/n000001/0001_01.jpg differ diff --git a/tests/assets/vgg_face2_dataset/train/n000002/0002_01.jpg b/tests/assets/vgg_face2_dataset/train/n000002/0002_01.jpg new file mode 100644 index 0000000000..8689b95631 Binary files /dev/null and b/tests/assets/vgg_face2_dataset/train/n000002/0002_01.jpg differ diff --git a/tests/assets/widerface_dataset/WIDER_train/images/0--Parade/0_Parade_image_01.jpg b/tests/assets/widerface_dataset/WIDER_train/images/0--Parade/0_Parade_image_01.jpg new file mode 100644 index 0000000000..8689b95631 Binary files /dev/null and b/tests/assets/widerface_dataset/WIDER_train/images/0--Parade/0_Parade_image_01.jpg differ diff --git a/tests/assets/widerface_dataset/WIDER_train/images/1--Handshaking/1_Handshaking_image_02.jpg b/tests/assets/widerface_dataset/WIDER_train/images/1--Handshaking/1_Handshaking_image_02.jpg new file mode 100644 index 0000000000..8689b95631 Binary files /dev/null and b/tests/assets/widerface_dataset/WIDER_train/images/1--Handshaking/1_Handshaking_image_02.jpg differ diff --git a/tests/assets/widerface_dataset/WIDER_val/images/0--Parade/0_Parade_image_03.jpg b/tests/assets/widerface_dataset/WIDER_val/images/0--Parade/0_Parade_image_03.jpg new file mode 100644 index 0000000000..8689b95631 Binary files /dev/null and b/tests/assets/widerface_dataset/WIDER_val/images/0--Parade/0_Parade_image_03.jpg differ diff --git a/tests/assets/widerface_dataset/labels.txt b/tests/assets/widerface_dataset/labels.txt new file mode 100644 index 0000000000..d5ed8e2445 --- /dev/null +++ b/tests/assets/widerface_dataset/labels.txt @@ -0,0 +1,2 @@ +Parade +Handshaking \ No newline at end of file diff --git 
a/tests/assets/widerface_dataset/wider_face_split/wider_face_train_bbx_gt.txt b/tests/assets/widerface_dataset/wider_face_split/wider_face_train_bbx_gt.txt new file mode 100644 index 0000000000..09109f7010 --- /dev/null +++ b/tests/assets/widerface_dataset/wider_face_split/wider_face_train_bbx_gt.txt @@ -0,0 +1,7 @@ +0--Parade/0_Parade_image_01.jpg +1 +1 2 2 2 0 0 0 0 0 0 +1--Handshaking/1_Handshaking_image_02.jpg +2 +1 1 2 2 0 0 1 0 0 0 +5 1 2 2 0 0 1 0 0 0 \ No newline at end of file diff --git a/tests/assets/widerface_dataset/wider_face_split/wider_face_val_bbx_gt.txt b/tests/assets/widerface_dataset/wider_face_split/wider_face_val_bbx_gt.txt new file mode 100644 index 0000000000..04573e8268 --- /dev/null +++ b/tests/assets/widerface_dataset/wider_face_split/wider_face_val_bbx_gt.txt @@ -0,0 +1,5 @@ +0--Parade/0_Parade_image_03.jpg +3 +0 0 1 1 2 0 0 0 2 0 +3 2 1 2 0 0 0 1 0 0 +5 6 1 1 2 0 0 0 2 0 \ No newline at end of file diff --git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 12192279b9..85e0b6e7d9 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -6,11 +6,11 @@ import datumaro.plugins.camvid_format as Camvid import numpy as np from datumaro.components.extractor import (AnnotationType, DatasetItem, - Extractor, LabelCategories, Mask) -from datumaro.components.project import Dataset, Project + Extractor, LabelCategories, Mask) +from datumaro.components.dataset import Dataset from datumaro.plugins.camvid_format import CamvidConverter, CamvidImporter from datumaro.util.test_utils import (TestDir, compare_datasets, - test_save_and_load) + test_save_and_load) class CamvidFormatTest(TestCase): @@ -68,7 +68,7 @@ def test_can_import(self): ), ], categories=Camvid.make_camvid_categories()) - parsed_dataset = Project.import_from(DUMMY_DATASET_DIR, 'camvid').make_dataset() + parsed_dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'camvid') compare_datasets(self, source_dataset, parsed_dataset) diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py index a25f15f561..9fcd26d30d 100644 --- a/tests/test_coco_format.py +++ b/tests/test_coco_format.py @@ -4,7 +4,7 @@ from unittest import TestCase -from datumaro.components.project import Project, Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, AnnotationType, Label, Mask, Points, Polygon, Bbox, Caption, LabelCategories, PointsCategories @@ -26,13 +26,14 @@ DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'coco_dataset') class CocoImporterTest(TestCase): - def test_can_import(self): + def test_can_import_instances(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id='000000000001', image=np.ones((10, 5, 3)), subset='val', attributes={'id': 1}, annotations=[ Polygon([0, 0, 1, 0, 1, 2, 0, 2], label=0, - id=1, group=1, attributes={'is_crowd': False}), + id=1, group=1, attributes={'is_crowd': False, + 'x': 1, 'y': 'hello'}), Mask(np.array( [[1, 0, 0, 1, 0]] * 5 + [[1, 1, 1, 1, 0]] * 5 @@ -42,13 +43,107 @@ def test_can_import(self): ), ], categories=['TEST',]) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'coco') \ - .make_dataset() + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_instances'), 'coco') + + compare_datasets(self, expected_dataset, dataset) + + def test_can_import_captions(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', + annotations=[ + Caption('hello', id=1, group=1), + Caption('world', id=2, group=2), + ], attributes={'id': 1}), 
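+            # items 2 (train) and 3 (val) below match the new
+            # captions_train.json / captions_val.json test assets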
+ DatasetItem(id=2, subset='train', + annotations=[ + Caption('test', id=3, group=3), + ], attributes={'id': 2}), + + DatasetItem(id=3, subset='val', + annotations=[ + Caption('word', id=1, group=1), + ], attributes={'id': 1}), + ]) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_captions'), 'coco') + + compare_datasets(self, expected_dataset, dataset) + + def test_can_import_labels(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', + annotations=[ + Label(1, id=1, group=1), + Label(0, id=2, group=2), + ], attributes={'id': 1}), + ], categories=['a', 'b']) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_labels'), 'coco') + + compare_datasets(self, expected_dataset, dataset) + + def test_can_import_points(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', + image=Image(path='1.jpg', size=(5, 5)), + annotations=[ + Points([0, 0, 0, 2, 4, 1], [0, 1, 2], + label=1, group=1, id=1, + attributes={'is_crowd': False}), + Polygon([0, 0, 4, 0, 4, 4], + label=1, group=1, id=1, + attributes={'is_crowd': False}), + + Points([1, 2, 3, 4, 2, 3], + group=2, id=2, + attributes={'is_crowd': False}), + Bbox(1, 2, 2, 2, + group=2, id=2, + attributes={'is_crowd': False}), + + Points([1, 2, 0, 2, 4, 1], + label=0, group=3, id=3, + attributes={'is_crowd': False}), + Bbox(0, 1, 4, 1, + label=0, group=3, id=3, + attributes={'is_crowd': False}), + + Points([0, 0, 1, 2, 3, 4], [0, 1, 2], + group=5, id=5, + attributes={'is_crowd': False}), + Bbox(1, 2, 2, 2, + group=5, id=5, + attributes={'is_crowd': False}), + ], attributes={'id': 1}), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable(['a', 'b']), + AnnotationType.points: PointsCategories.from_iterable( + (i, None, [[0, 1], [1, 2]]) for i in range(2) + ), + }) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_person_keypoints'), 'coco') + + compare_datasets(self, expected_dataset, dataset) + + def test_can_import_image_info(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id=1, image=Image(path='1.jpg', size=(10, 15)), + attributes={'id': 1}), + ]) + + dataset = Dataset.import_from( + osp.join(DUMMY_DATASET_DIR, 'coco_image_info'), 'coco') compare_datasets(self, expected_dataset, dataset) def test_can_detect(self): - self.assertTrue(CocoImporter.detect(DUMMY_DATASET_DIR)) + self.assertTrue(CocoImporter.detect( + osp.join(DUMMY_DATASET_DIR, 'coco_instances'))) class CocoConverterTest(TestCase): def _test_save_and_load(self, source_dataset, converter, test_dir, diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000000..32332b3545 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,32 @@ +from unittest import TestCase + +from datumaro.components.config import Config, DictConfig, SchemaBuilder + + +class ConfigTest(TestCase): + def test_can_produce_multilayer_config_from_dict(self): + schema_low = SchemaBuilder() \ + .add('options', dict) \ + .build() + schema_mid = SchemaBuilder() \ + .add('desc', lambda: Config(schema=schema_low)) \ + .build() + schema_top = SchemaBuilder() \ + .add('container', lambda: DictConfig( + lambda v: Config(v, schema=schema_mid))) \ + .build() + + value = 1 + source = Config({ + 'container': { + 'elem': { + 'desc': { + 'options': { + 'k': value + } + } + } + } + }, schema=schema_top) + + self.assertEqual(value, source.container['elem'].desc.options['k']) diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 
100644 index 0000000000..3ae1879e84 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,312 @@ +import numpy as np + +from unittest import TestCase + +from datumaro.components.environment import Environment +from datumaro.components.extractor import (Extractor, DatasetItem, + Label, Mask, Points, Polygon, PolyLine, Bbox, Caption, + LabelCategories, AnnotationType, Transform +) +from datumaro.util.image import Image +from datumaro.components.dataset_filter import \ + XPathDatasetFilter, XPathAnnotationsFilter, DatasetItemEncoder +from datumaro.components.dataset import Dataset, DEFAULT_FORMAT +from datumaro.util.test_utils import TestDir, compare_datasets + + +class DatasetTest(TestCase): + def test_create_from_extractors(self): + class SrcExtractor1(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=1, subset='train', annotations=[ + Bbox(1, 2, 3, 4), + Label(4), + ]), + DatasetItem(id=1, subset='val', annotations=[ + Label(4), + ]), + ]) + + class SrcExtractor2(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=1, subset='val', annotations=[ + Label(5), + ]), + ]) + + class DstExtractor(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=1, subset='train', annotations=[ + Bbox(1, 2, 3, 4), + Label(4), + ]), + DatasetItem(id=1, subset='val', annotations=[ + Label(4), + Label(5), + ]), + ]) + + dataset = Dataset.from_extractors(SrcExtractor1(), SrcExtractor2()) + + compare_datasets(self, DstExtractor(), dataset) + + def test_can_create_from_iterable(self): + class TestExtractor(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=1, subset='train', annotations=[ + Bbox(1, 2, 3, 4, label=2), + Label(4), + ]), + DatasetItem(id=1, subset='val', annotations=[ + Label(3), + ]), + ]) + + def categories(self): + return { AnnotationType.label: LabelCategories.from_iterable( + ['a', 'b', 'c', 'd', 'e']) + } + + actual = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', annotations=[ + Bbox(1, 2, 3, 4, label=2), + Label(4), + ]), + DatasetItem(id=1, subset='val', annotations=[ + Label(3), + ]), + ], categories=['a', 'b', 'c', 'd', 'e']) + + compare_datasets(self, TestExtractor(), actual) + + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c']) + + with TestDir() as test_dir: + source_dataset.save(test_dir) + + loaded_dataset = Dataset.load(test_dir) + + compare_datasets(self, source_dataset, loaded_dataset) + + def test_can_detect(self): + env = Environment() + env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} + env.extractors.items = {DEFAULT_FORMAT: env.extractors[DEFAULT_FORMAT]} + + dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c']) + + with TestDir() as test_dir: + dataset.save(test_dir) + + detected_format = Dataset.detect(test_dir, env=env) + + self.assertEqual(DEFAULT_FORMAT, detected_format) + + def test_can_detect_and_import(self): + env = Environment() + env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} + env.extractors.items = {DEFAULT_FORMAT: env.extractors[DEFAULT_FORMAT]} + + source_dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c']) + + with TestDir() as test_dir: + source_dataset.save(test_dir) + + imported_dataset = Dataset.import_from(test_dir, env=env) + + compare_datasets(self, source_dataset, imported_dataset) + + def 
test_can_export_by_string_format_name(self): + env = Environment() + env.converters.items = {'qq': env.converters[DEFAULT_FORMAT]} + + dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c'], env=env) + + with TestDir() as test_dir: + dataset.export(format='qq', save_dir=test_dir) + + def test_can_transform_by_string_name(self): + expected = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ], attributes={'qq': 1}), + ], categories=['a', 'b', 'c']) + + class TestTransform(Transform): + def transform_item(self, item): + return self.wrap_item(item, attributes={'qq': 1}) + + env = Environment() + env.transforms.items = {'qq': TestTransform} + + dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c'], env=env) + + actual = dataset.transform('qq') + + compare_datasets(self, expected, actual) + + def test_can_join_annotations(self): + a = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', annotations=[ + Label(1, id=3), + Label(2, attributes={ 'x': 1 }), + ]) + ], categories=['a', 'b', 'c', 'd']) + + b = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', annotations=[ + Label(2, attributes={ 'x': 1 }), + Label(3, id=4), + ]) + ], categories=['a', 'b', 'c', 'd']) + + expected = Dataset.from_iterable([ + DatasetItem(id=1, subset='train', annotations=[ + Label(1, id=3), + Label(2, attributes={ 'x': 1 }), + Label(3, id=4), + ]) + ], categories=['a', 'b', 'c', 'd']) + + merged = Dataset.from_extractors(a, b) + + compare_datasets(self, expected, merged) + + def test_cant_join_different_categories(self): + s1 = Dataset.from_iterable([], categories=['a', 'b']) + s2 = Dataset.from_iterable([], categories=['b', 'a']) + + with self.assertRaisesRegex(Exception, "different categories"): + Dataset.from_extractors(s1, s2) + + def test_can_join_datasets(self): + s1 = Dataset.from_iterable([ DatasetItem(0), DatasetItem(1) ]) + s2 = Dataset.from_iterable([ DatasetItem(1), DatasetItem(2) ]) + + dataset = Dataset.from_extractors(s1, s2) + + self.assertEqual(3, len(dataset)) + + +class DatasetItemTest(TestCase): + def test_ctor_requires_id(self): + with self.assertRaises(Exception): + # pylint: disable=no-value-for-parameter + DatasetItem() + # pylint: enable=no-value-for-parameter + + @staticmethod + def test_ctors_with_image(): + for args in [ + { 'id': 0, 'image': None }, + { 'id': 0, 'image': 'path.jpg' }, + { 'id': 0, 'image': np.array([1, 2, 3]) }, + { 'id': 0, 'image': lambda f: np.array([1, 2, 3]) }, + { 'id': 0, 'image': Image(data=np.array([1, 2, 3])) }, + ]: + DatasetItem(**args) + + +class DatasetFilterTest(TestCase): + @staticmethod + def test_item_representations(): + item = DatasetItem(id=1, subset='subset', path=['a', 'b'], + image=np.ones((5, 4, 3)), + annotations=[ + Label(0, attributes={'a1': 1, 'a2': '2'}, id=1, group=2), + Caption('hello', id=1), + Caption('world', group=5), + Label(2, id=3, attributes={ 'x': 1, 'y': '2' }), + Bbox(1, 2, 3, 4, label=4, id=4, attributes={ 'a': 1.0 }), + Bbox(5, 6, 7, 8, id=5, group=5), + Points([1, 2, 2, 0, 1, 1], label=0, id=5), + Mask(id=5, image=np.ones((3, 2))), + Mask(label=3, id=5, image=np.ones((2, 3))), + PolyLine([1, 2, 3, 4, 5, 6, 7, 8], id=11), + Polygon([1, 2, 3, 4, 5, 6, 7, 8]), + ] + ) + + encoded = DatasetItemEncoder.encode(item) + DatasetItemEncoder.to_string(encoded) + + def test_item_filter_can_be_applied(self): + class TestExtractor(Extractor): + def __iter__(self): + for i in range(4): + yield 
DatasetItem(id=i, subset='train') + + extractor = TestExtractor() + + filtered = XPathDatasetFilter(extractor, '/item[id > 1]') + + self.assertEqual(2, len(filtered)) + + def test_annotations_filter_can_be_applied(self): + class SrcExtractor(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=0), + DatasetItem(id=1, annotations=[ + Label(0), + Label(1), + ]), + DatasetItem(id=2, annotations=[ + Label(0), + Label(2), + ]), + ]) + + class DstExtractor(Extractor): + def __iter__(self): + return iter([ + DatasetItem(id=0), + DatasetItem(id=1, annotations=[ + Label(0), + ]), + DatasetItem(id=2, annotations=[ + Label(0), + ]), + ]) + + extractor = SrcExtractor() + + filtered = XPathAnnotationsFilter(extractor, + '/item/annotation[label_id = 0]') + + self.assertListEqual(list(filtered), list(DstExtractor())) + + def test_annotations_filter_can_remove_empty_items(self): + source = Dataset.from_iterable([ + DatasetItem(id=0), + DatasetItem(id=1, annotations=[ + Label(0), + Label(1), + ]), + DatasetItem(id=2, annotations=[ + Label(0), + Label(2), + ]), + ], categories=['a', 'b', 'c']) + + expected = Dataset.from_iterable([ + DatasetItem(id=2, annotations=[Label(2)]), + ], categories=['a', 'b', 'c']) + + filtered = XPathAnnotationsFilter(source, + '/item/annotation[label_id = 2]', remove_empty=True) + + compare_datasets(self, expected, filtered) diff --git a/tests/test_imagenet_format.py b/tests/test_imagenet_format.py index b6bd4e0c97..779e46b310 100644 --- a/tests/test_imagenet_format.py +++ b/tests/test_imagenet_format.py @@ -3,12 +3,11 @@ import numpy as np import os.path as osp -from datumaro.components.project import Project, Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, Label, LabelCategories, AnnotationType ) -from datumaro.plugins.imagenet_format import ImagenetConverter -from datumaro.plugins.imagenet_format import ImagenetImporter +from datumaro.plugins.imagenet_format import ImagenetConverter, ImagenetImporter from datumaro.util.test_utils import TestDir, compare_datasets class ImagenetFormatTest(TestCase): @@ -103,7 +102,7 @@ def test_can_import(self): 'label_' + str(label) for label in range(2)), }) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'imagenet').make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'imagenet') compare_datasets(self, expected_dataset, dataset, require_images=True) diff --git a/tests/test_imagenet_txt_format.py b/tests/test_imagenet_txt_format.py index 251c71fcc9..1cd5159923 100644 --- a/tests/test_imagenet_txt_format.py +++ b/tests/test_imagenet_txt_format.py @@ -3,11 +3,12 @@ import numpy as np import os.path as osp -from datumaro.components.project import Project, Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, Label, LabelCategories, AnnotationType ) -from datumaro.plugins.imagenet_txt_format import ImagenetTxtConverter, ImagenetTxtImporter +from datumaro.plugins.imagenet_txt_format import \ + ImagenetTxtConverter, ImagenetTxtImporter from datumaro.util.test_utils import TestDir, compare_datasets @@ -111,8 +112,7 @@ def test_can_import(self): 'label_%s' % label for label in range(10)), }) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'imagenet_txt') \ - .make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'imagenet_txt') compare_datasets(self, expected_dataset, dataset, require_images=True) diff --git a/tests/test_labelme_format.py b/tests/test_labelme_format.py index 
f51922224f..244a590b07 100644 --- a/tests/test_labelme_format.py +++ b/tests/test_labelme_format.py @@ -3,11 +3,10 @@ import os.path as osp from unittest import TestCase -from datumaro.components.project import Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, AnnotationType, Bbox, Mask, Polygon, LabelCategories ) -from datumaro.components.project import Project from datumaro.plugins.labelme_format import LabelMeImporter, LabelMeConverter from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) @@ -181,6 +180,5 @@ def test_can_import(self): ]), }) - parsed = Project.import_from(DUMMY_DATASET_DIR, 'label_me') \ - .make_dataset() + parsed = Dataset.import_from(DUMMY_DATASET_DIR, 'label_me') compare_datasets(self, expected=target_dataset, actual=parsed) \ No newline at end of file diff --git a/tests/test_mot_format.py b/tests/test_mot_format.py index fd647426bb..259bde298a 100644 --- a/tests/test_mot_format.py +++ b/tests/test_mot_format.py @@ -3,11 +3,10 @@ import os.path as osp from unittest import TestCase -from datumaro.components.project import Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, AnnotationType, Bbox, LabelCategories ) -from datumaro.components.project import Project from datumaro.plugins.mot_format import MotSeqGtConverter, MotSeqImporter from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) @@ -123,7 +122,6 @@ def test_can_import(self): 'label_' + str(label) for label in range(10)), }) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'mot_seq') \ - .make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'mot_seq') compare_datasets(self, expected_dataset, dataset) \ No newline at end of file diff --git a/tests/test_mots_format.py b/tests/test_mots_format.py index 2a6ec057e4..f8358dda3c 100644 --- a/tests/test_mots_format.py +++ b/tests/test_mots_format.py @@ -5,7 +5,7 @@ from unittest import TestCase from datumaro.components.extractor import DatasetItem, Mask -from datumaro.components.project import Dataset, Project +from datumaro.components.dataset import Dataset from datumaro.plugins.mots_format import MotsPngConverter, MotsImporter from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) @@ -90,5 +90,5 @@ def test_can_import(self): ]), ], categories=['a', 'b', 'c', 'd']) - parsed = Project.import_from(DUMMY_DATASET_DIR, 'mots').make_dataset() + parsed = Dataset.import_from(DUMMY_DATASET_DIR, 'mots') compare_datasets(self, expected=target, actual=parsed) \ No newline at end of file diff --git a/tests/test_ops.py b/tests/test_ops.py index c0c206067c..f6af22f53e 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -3,12 +3,12 @@ import numpy as np from datumaro.components.extractor import (Bbox, Caption, DatasetItem, - Extractor, Label, Mask, Points, Polygon, PolyLine, DEFAULT_SUBSET_NAME, + Label, Mask, Points, Polygon, PolyLine, DEFAULT_SUBSET_NAME, LabelCategories, PointsCategories, MaskCategories, AnnotationType) from datumaro.components.operations import (FailedAttrVotingError, IntersectMerge, NoMatchingAnnError, NoMatchingItemError, WrongGroupError, - compute_ann_statistics, mean_std) -from datumaro.components.project import Dataset + compute_ann_statistics, mean_std, find_unique_images) +from datumaro.components.dataset import Dataset from datumaro.util.test_utils import compare_datasets @@ -60,13 +60,17 @@ def test_stats(self): }), ]), 
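+            # item '2.2' added below shares its pixel data with item '2'
+            # to exercise the new unique/repeated image statistics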
DatasetItem(id=3), + DatasetItem(id='2.2', image=np.ones((2, 4, 3))), ], categories=['label_%s' % i for i in range(4)]) expected = { - 'images count': 3, + 'images count': 4, + 'unique images count': 3, + 'repeated images count': 1, + 'repeated images': [[('2', 'default'), ('2.2', 'default')]], 'annotations count': 10, - 'unannotated images count': 1, - 'unannotated images': ['3'], + 'unannotated images count': 2, + 'unannotated images': ['3', '2.2'], 'annotations by type': { 'label': { 'count': 2, }, 'polygon': { 'count': 0, }, @@ -142,6 +146,9 @@ def test_stats_with_empty_dataset(self): expected = { 'images count': 2, + 'unique images count': 2, + 'repeated images count': 0, + 'repeated images': [], 'annotations count': 0, 'unannotated images count': 2, 'unannotated images': ['1', '3'], @@ -182,6 +189,31 @@ def test_stats_with_empty_dataset(self): self.assertEqual(expected, actual) + def test_unique_image_count(self): + expected = { + frozenset([('1', 'a'), ('1', 'b')]), + frozenset([('2', DEFAULT_SUBSET_NAME), ('3', DEFAULT_SUBSET_NAME)]), + frozenset([('4', DEFAULT_SUBSET_NAME)]), + } + + dataset = Dataset.from_iterable([ + # no image data, but the same path + DatasetItem(1, subset='a', image='1.jpg'), + DatasetItem(1, subset='b', image='1.jpg'), + + # same images + DatasetItem(2, image=np.array([1])), + DatasetItem(3, image=np.array([1])), + + # no image is always a unique image + DatasetItem(4), + ]) + + groups = find_unique_images(dataset) + + self.assertEqual(expected, set(frozenset(s) for s in groups.values())) + + class TestMultimerge(TestCase): def test_can_match_items(self): # items 1 and 3 are unique, item 2 is common and should be merged diff --git a/tests/test_project.py b/tests/test_project.py index 00751da087..b4ab7bbf58 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -4,17 +4,13 @@ from unittest import TestCase -from datumaro.components.project import Project, Environment, Dataset +from datumaro.components.project import Project, Environment from datumaro.components.config_model import Source, Model from datumaro.components.launcher import Launcher, ModelTransform from datumaro.components.extractor import (Extractor, DatasetItem, - Label, Mask, Points, Polygon, PolyLine, Bbox, Caption, - LabelCategories, AnnotationType -) -from datumaro.util.image import Image -from datumaro.components.config import Config, DefaultConfig, SchemaBuilder -from datumaro.components.dataset_filter import \ - XPathDatasetFilter, XPathAnnotationsFilter, DatasetItemEncoder + Label, LabelCategories, AnnotationType) +from datumaro.components.config import Config +from datumaro.components.dataset import Dataset, DEFAULT_FORMAT from datumaro.util.test_utils import TestDir, compare_datasets @@ -363,134 +359,25 @@ def __iter__(self): item = next(iter(merged)) self.assertEqual(3, len(item.annotations)) -class DatasetFilterTest(TestCase): - @staticmethod - def test_item_representations(): - item = DatasetItem(id=1, subset='subset', path=['a', 'b'], - image=np.ones((5, 4, 3)), - annotations=[ - Label(0, attributes={'a1': 1, 'a2': '2'}, id=1, group=2), - Caption('hello', id=1), - Caption('world', group=5), - Label(2, id=3, attributes={ 'x': 1, 'y': '2' }), - Bbox(1, 2, 3, 4, label=4, id=4, attributes={ 'a': 1.0 }), - Bbox(5, 6, 7, 8, id=5, group=5), - Points([1, 2, 2, 0, 1, 1], label=0, id=5), - Mask(id=5, image=np.ones((3, 2))), - Mask(label=3, id=5, image=np.ones((2, 3))), - PolyLine([1, 2, 3, 4, 5, 6, 7, 8], id=11), - Polygon([1, 2, 3, 4, 5, 6, 7, 8]), - ] - ) - - encoded = 
DatasetItemEncoder.encode(item) - DatasetItemEncoder.to_string(encoded) - - def test_item_filter_can_be_applied(self): - class TestExtractor(Extractor): - def __iter__(self): - for i in range(4): - yield DatasetItem(id=i, subset='train') - - extractor = TestExtractor() - - filtered = XPathDatasetFilter(extractor, '/item[id > 1]') - - self.assertEqual(2, len(filtered)) - - def test_annotations_filter_can_be_applied(self): - class SrcExtractor(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=0), - DatasetItem(id=1, annotations=[ - Label(0), - Label(1), - ]), - DatasetItem(id=2, annotations=[ - Label(0), - Label(2), - ]), - ]) - - class DstExtractor(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=0), - DatasetItem(id=1, annotations=[ - Label(0), - ]), - DatasetItem(id=2, annotations=[ - Label(0), - ]), - ]) - - extractor = SrcExtractor() + def test_can_detect_and_import(self): + env = Environment() + env.importers.items = {DEFAULT_FORMAT: env.importers[DEFAULT_FORMAT]} + env.extractors.items = {DEFAULT_FORMAT: env.extractors[DEFAULT_FORMAT]} - filtered = XPathAnnotationsFilter(extractor, - '/item/annotation[label_id = 0]') + source_dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ Label(2) ]), + ], categories=['a', 'b', 'c']) - self.assertListEqual(list(filtered), list(DstExtractor())) + with TestDir() as test_dir: + source_dataset.save(test_dir) - def test_annotations_filter_can_remove_empty_items(self): - class SrcExtractor(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=0), - DatasetItem(id=1, annotations=[ - Label(0), - Label(1), - ]), - DatasetItem(id=2, annotations=[ - Label(0), - Label(2), - ]), - ]) + project = Project.import_from(test_dir, env=env) + imported_dataset = project.make_dataset() - class DstExtractor(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=2, annotations=[ - Label(2), - ]), - ]) + self.assertEqual(next(iter(project.config.sources.values())).format, + DEFAULT_FORMAT) + compare_datasets(self, source_dataset, imported_dataset) - extractor = SrcExtractor() - - filtered = XPathAnnotationsFilter(extractor, - '/item/annotation[label_id = 2]', remove_empty=True) - - self.assertListEqual(list(filtered), list(DstExtractor())) - -class ConfigTest(TestCase): - def test_can_produce_multilayer_config_from_dict(self): - schema_low = SchemaBuilder() \ - .add('options', dict) \ - .build() - schema_mid = SchemaBuilder() \ - .add('desc', lambda: Config(schema=schema_low)) \ - .build() - schema_top = SchemaBuilder() \ - .add('container', lambda: DefaultConfig( - lambda v: Config(v, schema=schema_mid))) \ - .build() - - value = 1 - source = Config({ - 'container': { - 'elem': { - 'desc': { - 'options': { - 'k': value - } - } - } - } - }, schema=schema_top) - - self.assertEqual(value, source.container['elem'].desc.options['k']) - -class ExtractorTest(TestCase): def test_custom_extractor_can_be_created(self): class CustomExtractor(Extractor): def __iter__(self): @@ -518,61 +405,3 @@ def __iter__(self): dataset = project.make_dataset() compare_datasets(self, CustomExtractor(), dataset) - -class DatasetTest(TestCase): - def test_create_from_extractors(self): - class SrcExtractor1(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=1, subset='train', annotations=[ - Bbox(1, 2, 3, 4), - Label(4), - ]), - DatasetItem(id=1, subset='val', annotations=[ - Label(4), - ]), - ]) - - class SrcExtractor2(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=1, subset='val', 
annotations=[ - Label(5), - ]), - ]) - - class DstExtractor(Extractor): - def __iter__(self): - return iter([ - DatasetItem(id=1, subset='train', annotations=[ - Bbox(1, 2, 3, 4), - Label(4), - ]), - DatasetItem(id=1, subset='val', annotations=[ - Label(4), - Label(5), - ]), - ]) - - dataset = Dataset.from_extractors(SrcExtractor1(), SrcExtractor2()) - - compare_datasets(self, DstExtractor(), dataset) - - -class DatasetItemTest(TestCase): - def test_ctor_requires_id(self): - with self.assertRaises(Exception): - # pylint: disable=no-value-for-parameter - DatasetItem() - # pylint: enable=no-value-for-parameter - - @staticmethod - def test_ctors_with_image(): - for args in [ - { 'id': 0, 'image': None }, - { 'id': 0, 'image': 'path.jpg' }, - { 'id': 0, 'image': np.array([1, 2, 3]) }, - { 'id': 0, 'image': lambda f: np.array([1, 2, 3]) }, - { 'id': 0, 'image': Image(data=np.array([1, 2, 3])) }, - ]: - DatasetItem(**args) \ No newline at end of file diff --git a/tests/test_splitter.py b/tests/test_splitter.py new file mode 100644 index 0000000000..ba3cb5a174 --- /dev/null +++ b/tests/test_splitter.py @@ -0,0 +1,643 @@ +import numpy as np + +from unittest import TestCase + +from datumaro.components.project import Dataset +from datumaro.components.extractor import (DatasetItem, Label, Bbox, + LabelCategories, AnnotationType) + +import datumaro.plugins.splitter as splitter +from datumaro.components.operations import compute_ann_statistics + + +class SplitterTest(TestCase): + @staticmethod + def _get_subset(idx): + subsets = ["", "a", "b", "", "", "a", "", "b", "", "a"] + return subsets[idx % len(subsets)] + + def _generate_dataset(self, config): + # counts = {(0,0):20, (0,1):20, (0,2):30, (1,0):20, (1,1):10, (1,2):20} + # attr1 = ['attr1', 'attr2'] + # attr2 = ['attr1', 'attr3'] + # config = { "label1": { "attrs": attr1, "counts": counts }, + # "label2": { "attrs": attr2, "counts": counts }} + iterable = [] + label_cat = LabelCategories() + idx = 0 + for label_id, label in enumerate(config.keys()): + anames = config[label]["attrs"] + counts = config[label]["counts"] + label_cat.add(label, attributes=anames) + if isinstance(counts, dict): + for attrs, count in counts.items(): + attributes = dict() + if isinstance(attrs, tuple): + for aname, value in zip(anames, attrs): + attributes[aname] = value + else: + attributes[anames[0]] = attrs + for _ in range(count): + idx += 1 + iterable.append( + DatasetItem(idx, subset=self._get_subset(idx), + annotations=[ + Label(label_id, attributes=attributes) + ], + ) + ) + else: + for _ in range(counts): + idx += 1 + iterable.append( + DatasetItem(idx, subset=self._get_subset(idx), + annotations=[Label(label_id)]) + ) + categories = {AnnotationType.label: label_cat} + dataset = Dataset.from_iterable(iterable, categories) + return dataset + + def test_split_for_classification_multi_class_no_attr(self): + config = { + "label1": {"attrs": None, "counts": 10}, + "label2": {"attrs": None, "counts": 20}, + "label3": {"attrs": None, "counts": 30}, + } + source = self._generate_dataset(config) + + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + + self.assertEqual(42, len(actual.get_subset("train"))) + self.assertEqual(18, len(actual.get_subset("test"))) + + # check stats for train + stat_train = compute_ann_statistics(actual.get_subset("train")) + dist_train = stat_train["annotations"]["labels"]["distribution"] + self.assertEqual(7, dist_train["label1"][0]) + self.assertEqual(14, dist_train["label2"][0]) + 
self.assertEqual(21, dist_train["label3"][0]) + + # check stats for test + stat_test = compute_ann_statistics(actual.get_subset("test")) + dist_test = stat_test["annotations"]["labels"]["distribution"] + self.assertEqual(3, dist_test["label1"][0]) + self.assertEqual(6, dist_test["label2"][0]) + self.assertEqual(9, dist_test["label3"][0]) + + def test_split_for_classification_single_class_single_attr(self): + counts = {0: 10, 1: 20, 2: 30} + config = {"label": {"attrs": ["attr"], "counts": counts}} + source = self._generate_dataset(config) + + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + + self.assertEqual(42, len(actual.get_subset("train"))) + self.assertEqual(18, len(actual.get_subset("test"))) + + # check stats for train + stat_train = compute_ann_statistics(actual.get_subset("train")) + attr_train = stat_train["annotations"]["labels"]["attributes"] + self.assertEqual(7, attr_train["attr"]["distribution"]["0"][0]) + self.assertEqual(14, attr_train["attr"]["distribution"]["1"][0]) + self.assertEqual(21, attr_train["attr"]["distribution"]["2"][0]) + + # check stats for test + stat_test = compute_ann_statistics(actual.get_subset("test")) + attr_test = stat_test["annotations"]["labels"]["attributes"] + self.assertEqual(3, attr_test["attr"]["distribution"]["0"][0]) + self.assertEqual(6, attr_test["attr"]["distribution"]["1"][0]) + self.assertEqual(9, attr_test["attr"]["distribution"]["2"][0]) + + def test_split_for_classification_single_class_multi_attr(self): + counts = { + (0, 0): 20, + (0, 1): 20, + (0, 2): 30, + (1, 0): 20, + (1, 1): 10, + (1, 2): 20, + } + attrs = ["attr1", "attr2"] + config = {"label": {"attrs": attrs, "counts": counts}} + source = self._generate_dataset(config) + + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + + self.assertEqual(84, len(actual.get_subset("train"))) + self.assertEqual(36, len(actual.get_subset("test"))) + + # check stats for train + stat_train = compute_ann_statistics(actual.get_subset("train")) + attr_train = stat_train["annotations"]["labels"]["attributes"] + self.assertEqual(49, attr_train["attr1"]["distribution"]["0"][0]) + self.assertEqual(35, attr_train["attr1"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) + self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) + + # check stats for test + stat_test = compute_ann_statistics(actual.get_subset("test")) + attr_test = stat_test["annotations"]["labels"]["attributes"] + self.assertEqual(21, attr_test["attr1"]["distribution"]["0"][0]) + self.assertEqual(15, attr_test["attr1"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) + self.assertEqual(9, attr_test["attr2"]["distribution"]["1"][0]) + self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) + + def test_split_for_classification_multi_label_with_attr(self): + counts = { + (0, 0): 20, + (0, 1): 20, + (0, 2): 30, + (1, 0): 20, + (1, 1): 10, + (1, 2): 20, + } + attr1 = ["attr1", "attr2"] + attr2 = ["attr1", "attr3"] + config = { + "label1": {"attrs": attr1, "counts": counts}, + "label2": {"attrs": attr2, "counts": counts}, + } + source = self._generate_dataset(config) + + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + + train = actual.get_subset("train") + test = actual.get_subset("test") + self.assertEqual(168, 
len(train)) + self.assertEqual(72, len(test)) + + # check stats for train + stat_train = compute_ann_statistics(train) + dist_train = stat_train["annotations"]["labels"]["distribution"] + self.assertEqual(84, dist_train["label1"][0]) + self.assertEqual(84, dist_train["label2"][0]) + attr_train = stat_train["annotations"]["labels"]["attributes"] + self.assertEqual(49 * 2, attr_train["attr1"]["distribution"]["0"][0]) + self.assertEqual(35 * 2, attr_train["attr1"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) + self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) + self.assertEqual(28, attr_train["attr3"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr3"]["distribution"]["1"][0]) + self.assertEqual(35, attr_train["attr3"]["distribution"]["2"][0]) + + # check stats for test + stat_test = compute_ann_statistics(test) + dist_test = stat_test["annotations"]["labels"]["distribution"] + self.assertEqual(36, dist_test["label1"][0]) + self.assertEqual(36, dist_test["label2"][0]) + attr_test = stat_test["annotations"]["labels"]["attributes"] + self.assertEqual(21 * 2, attr_test["attr1"]["distribution"]["0"][0]) + self.assertEqual(15 * 2, attr_test["attr1"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) + self.assertEqual(9, attr_test["attr2"]["distribution"]["1"][0]) + self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) + self.assertEqual(12, attr_test["attr3"]["distribution"]["0"][0]) + self.assertEqual(9, attr_test["attr3"]["distribution"]["1"][0]) + self.assertEqual(15, attr_test["attr3"]["distribution"]["2"][0]) + + with self.subTest("random seed test"): + r1 = splitter.ClassificationSplit(source, splits, seed=1234) + r2 = splitter.ClassificationSplit(source, splits, seed=1234) + r3 = splitter.ClassificationSplit(source, splits, seed=4321) + self.assertEqual( + list(r1.get_subset("test")), list(r2.get_subset("test")) + ) + self.assertNotEqual( + list(r1.get_subset("test")), list(r3.get_subset("test")) + ) + + def test_split_for_classification_gives_error(self): + with self.subTest("no label"): + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[]), + DatasetItem(2, annotations=[]), + ], categories=["a", "b", "c"]) + + with self.assertRaisesRegex(Exception, "exactly one is expected"): + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + len(actual.get_subset("train")) + + with self.subTest("multi label"): + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[Label(0), Label(1)]), + DatasetItem(2, annotations=[Label(0), Label(2)]), + ], categories=["a", "b", "c"]) + + with self.assertRaisesRegex(Exception, "exactly one is expected"): + splits = [("train", 0.7), ("test", 0.3)] + actual = splitter.ClassificationSplit(source, splits) + len(actual.get_subset("train")) + + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[Label(0)]), + DatasetItem(2, annotations=[Label(1)]), + ], categories=["a", "b", "c"]) + + with self.subTest("wrong ratio"): + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", -0.5), ("test", 1.5)] + splitter.ClassificationSplit(source, splits) + + with self.assertRaisesRegex(Exception, "Sum of ratios"): + splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] + splitter.ClassificationSplit(source, splits) + + with self.subTest("wrong subset name"): + with 
self.assertRaisesRegex(Exception, "Subset name"): + splits = [("train_", 0.5), ("val", 0.2), ("test", 0.3)] + splitter.ClassificationSplit(source, splits) + + def test_split_for_reidentification(self): + ''' + Test ReidentificationSplit using Dataset with label (ImageNet style) + ''' + def _get_present(stat): + values_present = [] + for label, dist in stat["distribution"].items(): + if dist[0] > 0: + values_present.append(label) + return set(values_present) + + for with_attr in [True, False]: + if with_attr: + counts = {i: (i % 3 + 1) * 7 for i in range(10)} + config = {"person": {"attrs": ["PID"], "counts": counts}} + attr_for_id = "PID" + else: + counts = {} + config = dict() + for i in range(10): + label = "label%d" % i + count = (i % 3 + 1) * 7 + counts[label] = count + config[label] = {"attrs": None, "counts": count} + attr_for_id = None + source = self._generate_dataset(config) + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + query = 0.4 / 0.7 + actual = splitter.ReidentificationSplit(source, + splits, query, attr_for_id) + + stats = dict() + for sname in ["train", "val", "test-query", "test-gallery"]: + subset = actual.get_subset(sname) + stat = compute_ann_statistics(subset)["annotations"]["labels"] + if with_attr: + stat = stat["attributes"]["PID"] + stats[sname] = stat + + # check size of subsets + self.assertEqual(65, stats["train"]["count"]) + self.assertEqual(26, stats["val"]["count"]) + self.assertEqual(18, stats["test-gallery"]["count"]) + self.assertEqual(24, stats["test-query"]["count"]) + + # check ID separation between test set and others + train_ids = _get_present(stats["train"]) + test_ids = _get_present(stats["test-gallery"]) + for pid in train_ids: + assert pid not in test_ids + self.assertEqual(7, len(train_ids)) + self.assertEqual(3, len(test_ids)) + self.assertEqual(train_ids, _get_present(stats["val"])) + self.assertEqual(test_ids, _get_present(stats["test-query"])) + + # check trainval set statistics + trainval = stats["train"]["count"] + stats["val"]["count"] + expected_train_count = int(trainval * 0.5 / 0.7) + expected_val_count = int(trainval * 0.2 / 0.7) + self.assertEqual(expected_train_count, stats["train"]["count"]) + self.assertEqual(expected_val_count, stats["val"]["count"]) + dist_train = stats["train"]["distribution"] + dist_val = stats["val"]["distribution"] + for pid in train_ids: + total = counts[int(pid)] if with_attr else counts[pid] + self.assertEqual(int(total * 0.5 / 0.7), dist_train[pid][0]) + self.assertEqual(int(total * 0.2 / 0.7), dist_val[pid][0]) + + # check teset set statistics + dist_gallery = stats["test-gallery"]["distribution"] + dist_query = stats["test-query"]["distribution"] + for pid in test_ids: + total = counts[int(pid)] if with_attr else counts[pid] + self.assertEqual(int(total * 0.3 / 0.7), dist_gallery[pid][0]) + self.assertEqual(int(total * 0.4 / 0.7), dist_query[pid][0]) + + def test_split_for_reidentification_randomseed(self): + ''' + Test randomseed for reidentification + ''' + counts = {} + config = dict() + for i in range(10): + label = "label%d" % i + count = (i % 3 + 1) * 7 + counts[label] = count + config[label] = {"attrs": None, "counts": count} + source = self._generate_dataset(config) + splits = [("train", 0.5), ("test", 0.5)] + query = 0.4 / 0.7 + r1 = splitter.ReidentificationSplit(source, splits, query, seed=1234) + r2 = splitter.ReidentificationSplit(source, splits, query, seed=1234) + r3 = splitter.ReidentificationSplit(source, splits, query, seed=4321) + self.assertEqual( + 
list(r1.get_subset("train")), list(r2.get_subset("train")) + ) + self.assertNotEqual( + list(r1.get_subset("train")), list(r3.get_subset("train")) + ) + + def test_split_for_reidentification_rebalance(self): + ''' + rebalance function shouldn't gives error when there's no exchange + ''' + config = dict() + for i in range(100): + label = "label%03d" % i + config[label] = {"attrs": None, "counts": 7} + source = self._generate_dataset(config) + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + query = 0.4 / 0.7 + actual = splitter.ReidentificationSplit(source, splits, query) + + self.assertEqual(350, len(actual.get_subset("train"))) + self.assertEqual(140, len(actual.get_subset("val"))) + self.assertEqual(90, len(actual.get_subset("test-gallery"))) + self.assertEqual(120, len(actual.get_subset("test-query"))) + + def test_split_for_reidentification_gives_error(self): + query = 0.4 / 0.7 # valid query ratio + + with self.subTest("no label"): + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[]), + DatasetItem(2, annotations=[]), + ], categories=["a", "b", "c"]) + + with self.assertRaisesRegex(Exception, "exactly one is expected"): + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + actual = splitter.ReidentificationSplit(source, splits, query) + len(actual.get_subset("train")) + + with self.subTest(msg="multi label"): + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[Label(0), Label(1)]), + DatasetItem(2, annotations=[Label(0), Label(2)]), + ], categories=["a", "b", "c"]) + + with self.assertRaisesRegex(Exception, "exactly one is expected"): + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + actual = splitter.ReidentificationSplit(source, splits, query) + len(actual.get_subset("train")) + + counts = {i: (i % 3 + 1) * 7 for i in range(10)} + config = {"person": {"attrs": ["PID"], "counts": counts}} + source = self._generate_dataset(config) + with self.subTest("wrong ratio"): + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", -0.5), ("val", 0.2), ("test", 0.3)] + splitter.ReidentificationSplit(source, splits, query) + + with self.assertRaisesRegex(Exception, "Sum of ratios"): + splits = [("train", 0.6), ("val", 0.2), ("test", 0.3)] + splitter.ReidentificationSplit(source, splits, query) + + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + actual = splitter.ReidentificationSplit(source, splits, -query) + + with self.subTest("wrong subset name"): + with self.assertRaisesRegex(Exception, "Subset name"): + splits = [("_train", 0.5), ("val", 0.2), ("test", 0.3)] + splitter.ReidentificationSplit(source, splits, query) + + with self.subTest("wrong attribute name for person id"): + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + actual = splitter.ReidentificationSplit(source, splits, query) + + with self.assertRaisesRegex(Exception, "Unknown subset"): + actual.get_subset("test") + + def _generate_detection_dataset(self, **kwargs): + append_bbox = kwargs.get("append_bbox") + with_attr = kwargs.get("with_attr", False) + nimages = kwargs.get("nimages", 10) + + label_cat = LabelCategories() + for i in range(6): + label = "label%d" % (i + 1) + if with_attr is True: + attributes = {"attr0", "attr%d" % (i + 1)} + else: + attributes = {} + label_cat.add(label, attributes=attributes) + categories = {AnnotationType.label: label_cat} + + iterable = [] + attr_val = 0 + totals = np.zeros(3) + objects = [(1, 5, 2), (3, 4, 1), (2, 3, 4), (1, 1, 1), (2, 4, 2)] + for 
img_id in range(nimages): + cnts = objects[img_id % len(objects)] + totals += cnts + annotations = [] + for label_id, count in enumerate(cnts): + attributes = {} + if with_attr: + attr_val += 1 + attributes["attr0"] = attr_val % 3 + attributes["attr%d" % (label_id + 1)] = attr_val % 2 + for ann_id in range(count): + append_bbox(annotations, label_id=label_id, ann_id=ann_id, + attributes=attributes) + item = DatasetItem(img_id, subset=self._get_subset(img_id), + annotations=annotations, attributes={"id": img_id}) + iterable.append(item) + + dataset = Dataset.from_iterable(iterable, categories) + return dataset, totals + + @staticmethod + def _get_append_bbox(dataset_type): + def append_bbox_coco(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_bbox_voc(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"], + id=kwargs["ann_id"] + 1, + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + ) + ) # obj + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"] + 3, + group=kwargs["ann_id"], + ) + ) # part + annotations.append( + Label(kwargs["label_id"] + 3, attributes=kwargs["attributes"]) + ) + + def append_bbox_yolo(annotations, **kwargs): + annotations.append(Bbox(1, 1, 2, 2, label=kwargs["label_id"])) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_bbox_cvat(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + group=kwargs["ann_id"], + z_order=kwargs["ann_id"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_bbox_labelme(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"], + id=kwargs["ann_id"], + attributes=kwargs["attributes"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_bbox_mot(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, label=kwargs["label_id"], + attributes=kwargs["attributes"], + ) + ) + annotations.append( + Label(kwargs["label_id"], attributes=kwargs["attributes"]) + ) + + def append_bbox_widerface(annotations, **kwargs): + annotations.append( + Bbox(1, 1, 2, 2, attributes=kwargs["attributes"]) + ) + annotations.append(Label(0, attributes=kwargs["attributes"])) + + functions = { + "coco": append_bbox_coco, + "voc": append_bbox_voc, + "yolo": append_bbox_yolo, + "cvat": append_bbox_cvat, + "labelme": append_bbox_labelme, + "mot": append_bbox_mot, + "widerface": append_bbox_widerface, + } + + func = functions.get(dataset_type, append_bbox_cvat) + return func + + def test_split_for_detection(self): + dtypes = ["coco", "voc", "yolo", "cvat", "labelme", "mot", "widerface"] + params = [] + for dtype in dtypes: + for with_attr in [False, True]: + params.append((dtype, with_attr, 10, 5, 3, 2)) + params.append((dtype, with_attr, 10, 7, 0, 3)) + + for dtype, with_attr, nimages, train, val, test in params: + source, _ = self._generate_detection_dataset( + append_bbox=self._get_append_bbox(dtype), + with_attr=with_attr, + nimages=nimages, + ) + total = np.sum([train, val, test]) + splits = [ + 
("train", train / total), + ("val", val / total), + ("test", test / total), + ] + with self.subTest( + dtype=dtype, + with_attr=with_attr, + nimage=nimages, + train=train, + val=val, + test=test, + ): + actual = splitter.DetectionSplit(source, splits) + + self.assertEqual(train, len(actual.get_subset("train"))) + self.assertEqual(val, len(actual.get_subset("val"))) + self.assertEqual(test, len(actual.get_subset("test"))) + + # random seed test + source, _ = self._generate_detection_dataset( + append_bbox=self._get_append_bbox("cvat"), + with_attr=True, + nimages=10, + ) + + splits = [("train", 0.5), ("test", 0.5)] + r1 = splitter.DetectionSplit(source, splits, seed=1234) + r2 = splitter.DetectionSplit(source, splits, seed=1234) + r3 = splitter.DetectionSplit(source, splits, seed=4321) + self.assertEqual( + list(r1.get_subset("test")), list(r2.get_subset("test")) + ) + self.assertNotEqual( + list(r1.get_subset("test")), list(r3.get_subset("test")) + ) + + def test_split_for_detection_gives_error(self): + with self.subTest(msg="bbox annotation"): + source = Dataset.from_iterable([ + DatasetItem(1, annotations=[Label(0), Label(1)]), + DatasetItem(2, annotations=[Label(0), Label(2)]), + ], categories=["a", "b", "c"]) + + with self.assertRaisesRegex(Exception, "more than one bbox"): + splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] + actual = splitter.DetectionSplit(source, splits) + len(actual.get_subset("train")) + + source, _ = self._generate_detection_dataset( + append_bbox=self._get_append_bbox("cvat"), + with_attr=True, + nimages=5, + ) + + with self.subTest("wrong ratio"): + with self.assertRaisesRegex(Exception, "in the range"): + splits = [("train", -0.5), ("test", 1.5)] + splitter.DetectionSplit(source, splits) + + with self.assertRaisesRegex(Exception, "Sum of ratios"): + splits = [("train", 0.5), ("test", 0.5), ("val", 0.5)] + splitter.DetectionSplit(source, splits) + + with self.subTest("wrong subset name"): + with self.assertRaisesRegex(Exception, "Subset name"): + splits = [("train_", 0.5), ("val", 0.2), ("test", 0.3)] + splitter.DetectionSplit(source, splits) diff --git a/tests/test_tfrecord_format.py b/tests/test_tfrecord_format.py index f9491c39ef..39cbe7c9b4 100644 --- a/tests/test_tfrecord_format.py +++ b/tests/test_tfrecord_format.py @@ -4,11 +4,10 @@ from unittest import TestCase, skipIf -from datumaro.components.project import Dataset +from datumaro.components.dataset import Dataset from datumaro.components.extractor import (DatasetItem, AnnotationType, Bbox, Mask, LabelCategories ) -from datumaro.components.project import Project from datumaro.util.image import Image, ByteImage, encode_image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) @@ -218,7 +217,6 @@ def test_can_import(self): 'label_' + str(label) for label in range(10)), }) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'tf_detection_api') \ - .make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'tf_detection_api') compare_datasets(self, target_dataset, dataset) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 9660ccac5d..de3cd66943 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -386,3 +386,29 @@ def test_remap_labels_delete_unspecified(self): mapping={}, default='delete') compare_datasets(self, target_dataset, actual) + + def test_transform_labels(self): + src_dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ + Label(1), + Bbox(1, 2, 3, 4, label=2), + Bbox(1, 3, 3, 3), + 
Mask(image=np.array([1]), label=3), + Polygon([1, 1, 2, 2, 3, 4], label=4), + PolyLine([1, 3, 4, 2, 5, 6], label=5) + ]) + ], categories=['label%s' % i for i in range(6)]) + + dst_dataset = Dataset.from_iterable([ + DatasetItem(id=1, annotations=[ + Label(1), + Label(2), + Label(3), + Label(4), + Label(5) + ]), + ], categories=['label%s' % i for i in range(6)]) + + actual = transforms.AnnsToLabels(src_dataset) + + compare_datasets(self, dst_dataset, actual) diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000000..f19e5d4f95 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,123 @@ +import os +import os.path as osp + +from unittest import TestCase + +from datumaro.util import Rollback, error_rollback +from datumaro.util.test_utils import TestDir +from datumaro.util.os_util import walk + + +class TestRollback(TestCase): + def test_does_not_call_on_no_error(self): + success = True + def cb(): + nonlocal success + success = False + + with Rollback() as on_error: + on_error.do(cb) + + self.assertTrue(success) + + def test_calls_on_error(self): + success = False + def cb(): + nonlocal success + success = True + + try: + with Rollback() as on_error: + on_error.do(cb) + raise Exception('err') + except Exception: + pass + finally: + self.assertTrue(success) + + def test_decorator_calls_on_error(self): + success = False + def cb(): + nonlocal success + success = True + + @error_rollback('on_error') + def foo(on_error=None): + on_error.do(cb) + raise Exception('err') + + try: + foo() + except Exception: + pass + finally: + self.assertTrue(success) + + def test_decorator_does_not_call_on_no_error(self): + success = True + def cb(): + nonlocal success + success = False + + @error_rollback('on_error') + def foo(on_error=None): + on_error.do(cb) + + foo() + + self.assertTrue(success) + + def test_decorator_supports_implicit_arg(self): + success = False + def cb(): + nonlocal success + success = True + + @error_rollback('on_error', implicit=True) + def foo(): + on_error.do(cb) # noqa: F821 + raise Exception('err') + + try: + foo() + except Exception: + pass + finally: + self.assertTrue(success) + + def test_can_fowrard_args(self): + success1 = False + def cb1(a1, a2=None, ignore_errors=None): + nonlocal success1 + if a1 == 5 and a2 == 2 and ignore_errors == None: + success1 = True + + success2 = False + def cb2(a1, a2=None, ignore_errors=None): + nonlocal success2 + if a1 == 5 and a2 == 2 and ignore_errors == 4: + success2 = True + + try: + with Rollback() as on_error: + on_error.do(cb1, 5, a2=2, ignore_errors=True) + on_error.do(cb2, 5, a2=2, ignore_errors=True, + fwd_kwargs={'ignore_errors': 4}) + raise Exception('err') + except Exception: + pass + finally: + self.assertTrue(success1) + self.assertTrue(success2) + +class TestOsUtils(TestCase): + def test_can_walk_with_maxdepth(self): + with TestDir() as rootdir: + os.makedirs(osp.join(rootdir, '1', '2', '3', '4')) + + visited = set(d for d, _, _ in walk(rootdir, max_depth=2)) + self.assertEqual({ + osp.join(rootdir), + osp.join(rootdir, '1'), + osp.join(rootdir, '1', '2'), + }, visited) \ No newline at end of file diff --git a/tests/test_vgg_face2_format.py b/tests/test_vgg_face2_format.py new file mode 100644 index 0000000000..4e260d6bcb --- /dev/null +++ b/tests/test_vgg_face2_format.py @@ -0,0 +1,144 @@ +import os.path as osp +from unittest import TestCase + +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import (AnnotationType, Bbox, DatasetItem, + Label, 
LabelCategories, Points) +from datumaro.plugins.vgg_face2_format import (VggFace2Converter, + VggFace2Importer) +from datumaro.util.test_utils import TestDir, compare_datasets + + +class VggFace2FormatTest(TestCase): + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=0, group=1), + Points([3.2, 3.12, 4.11, 3.2, 2.11, + 2.5, 3.5, 2.11, 3.8, 2.13], label=0, group=1), + ] + ), + DatasetItem(id='2', subset='train', image=np.ones((10, 10, 3)), + annotations=[ + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], label=1, group=1), + ] + ), + DatasetItem(id='3', subset='train', image=np.ones((8, 8, 3)), + annotations=[Label(2, group=1)] + ), + DatasetItem(id='4', subset='train', image=np.ones((10, 10, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=3, group=1), + Points([3.2, 3.12, 4.11, 3.2, 2.11, + 2.5, 3.5, 2.11, 3.8, 2.13], label=3, group=1), + ] + ), + DatasetItem(id='a/5', subset='train', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(2, 2, 2, 2, group=1), + ] + ), + DatasetItem(id='label_0', subset='train', image=np.ones((8, 8, 3)), + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + [('label_%s' % i, 'class_%s' % i) for i in range(5)]), + }) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_dataset_with_no_subsets(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='b/1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=0, group=1), + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], label=0, group=1), + ] + ), + ], categories=['a']) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_dataset_with_no_save_images(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=0, group=1), + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], label=0, group=1), + ] + ), + ], categories=['label_0']) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=False) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_dataset_with_no_labels(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, group=1), + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], group=1), + ] + ), + DatasetItem(id='2', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(2, 2, 4, 2, group=1), + ] + ), + ], categories=[]) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=False) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'vgg_face2_dataset') + +class VggFace2ImporterTest(TestCase): + def test_can_detect(self): + self.assertTrue(VggFace2Importer.detect(DUMMY_DATASET_DIR)) + + def test_can_import(self): + 
expected_dataset = Dataset.from_iterable([ + DatasetItem(id='0001_01', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(2, 2, 1, 2, label=0, group=1), + Points([2.787, 2.898, 2.965, 2.79, 2.8, + 2.456, 2.81, 2.32, 2.89, 2.3], label=0, group=1), + ] + ), + DatasetItem(id='0002_01', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(1, 3, 1, 1, label=1, group=1), + Points([1.2, 3.8, 1.8, 3.82, 1.51, + 3.634, 1.43, 3.34, 1.65, 3.32], label=1, group=1) + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + [('n000001', 'car'), ('n000002', 'person')]), + }) + + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'vgg_face2') + + compare_datasets(self, expected_dataset, dataset) diff --git a/tests/test_voc_format.py b/tests/test_voc_format.py index b33aaa125f..9a4502f6e9 100644 --- a/tests/test_voc_format.py +++ b/tests/test_voc_format.py @@ -18,7 +18,7 @@ VocSegmentationConverter, ) from datumaro.plugins.voc_format.importer import VocImporter -from datumaro.components.project import Project +from datumaro.components.dataset import Dataset from datumaro.util.image import Image from datumaro.util.test_utils import (TestDir, compare_datasets, test_save_and_load) @@ -122,7 +122,7 @@ def __iter__(self): image=np.ones((10, 20, 3))), ]) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'voc').make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'voc') compare_datasets(self, DstExtractor(), dataset) diff --git a/tests/test_widerface_format.py b/tests/test_widerface_format.py new file mode 100644 index 0000000000..d93f7bf240 --- /dev/null +++ b/tests/test_widerface_format.py @@ -0,0 +1,161 @@ +import os.path as osp +from unittest import TestCase + +import numpy as np +from datumaro.components.extractor import (AnnotationType, Bbox, DatasetItem, + Label, LabelCategories) +from datumaro.components.dataset import Dataset +from datumaro.plugins.widerface_format import WiderFaceConverter, WiderFaceImporter +from datumaro.util.test_utils import TestDir, compare_datasets + + +class WiderFaceFormatTest(TestCase): + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='1', subset='train', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2), + Bbox(0, 1, 2, 3, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 2, 'invalid': 0}), + Label(0), + ] + ), + DatasetItem(id='2', subset='train', image=np.ones((10, 10, 3)), + annotations=[ + Bbox(0, 2, 4, 2, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 1, + 'occluded': 0, 'pose': 1, 'invalid': 0}), + Bbox(3, 3, 2, 3, attributes = { + 'blur': 0, 'expression': 1, 'illumination': 0, + 'occluded': 0, 'pose': 2, 'invalid': 0}), + Bbox(2, 1, 2, 3, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 0, 'invalid': 1}), + Label(1), + ] + ), + + DatasetItem(id='3', subset='val', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 1, 5, 2, attributes = { + 'blur': 2, 'expression': 1, 'illumination': 0, + 'occluded': 0, 'pose': 1, 'invalid': 0}), + Bbox(0, 2, 3, 2), + Bbox(0, 2, 4, 2), + Bbox(0, 7, 3, 2, attributes = { + 'blur': 2, 'expression': 1, 'illumination': 0, + 'occluded': 0, 'pose': 1, 'invalid': 0}), + ] + ), + + DatasetItem(id='4', subset='val', image=np.ones((8, 8, 3))), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(i) for i in range(3)), + }) + + with TestDir() as test_dir: + 
WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'wider_face') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_dataset_with_no_subsets(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2), + Bbox(0, 1, 2, 3, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 2, 'invalid': 0}), + ] + ), + ], categories=[]) + + with TestDir() as test_dir: + WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'wider_face') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_dataset_with_non_widerface_attributes(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2), + Bbox(0, 1, 2, 3, attributes = { + 'non-widerface attribute': 0, + 'blur': 1, 'invalid': 1}), + Bbox(1, 1, 2, 2, attributes = { + 'non-widerface attribute': 0}), + ] + ), + ], categories=[]) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2), + Bbox(0, 1, 2, 3, attributes = { + 'blur': 1, 'invalid': 1}), + Bbox(1, 1, 2, 2), + ] + ), + ], categories=[]) + + with TestDir() as test_dir: + WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'wider_face') + + compare_datasets(self, target_dataset, parsed_dataset) + +DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'widerface_dataset') + +class WiderFaceImporterTest(TestCase): + def test_can_detect(self): + self.assertTrue(WiderFaceImporter.detect(DUMMY_DATASET_DIR)) + + def test_can_import(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='0_Parade_image_01', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(1, 2, 2, 2, attributes = { + 'blur': 0, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 0, 'invalid': 0}), + Label(0), + ] + ), + DatasetItem(id='1_Handshaking_image_02', subset='train', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(1, 1, 2, 2, attributes = { + 'blur': 0, 'expression': 0, 'illumination': 1, + 'occluded': 0, 'pose': 0, 'invalid': 0}), + Bbox(5, 1, 2, 2, attributes = { + 'blur': 0, 'expression': 0, 'illumination': 1, + 'occluded': 0, 'pose': 0, 'invalid': 0}), + Label(1), + ] + ), + DatasetItem(id='0_Parade_image_03', subset='val', + image=np.ones((10, 15, 3)), + annotations=[ + Bbox(0, 0, 1, 1, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 2, 'invalid': 0}), + Bbox(3, 2, 1, 2, attributes = { + 'blur': 0, 'expression': 0, 'illumination': 0, + 'occluded': 1, 'pose': 0, 'invalid': 0}), + Bbox(5, 6, 1, 1, attributes = { + 'blur': 2, 'expression': 0, 'illumination': 0, + 'occluded': 0, 'pose': 2, 'invalid': 0}), + Label(0), + ] + ), + ], categories= ['Parade', 'Handshaking']) + + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'wider_face') + + compare_datasets(self, expected_dataset, dataset) diff --git a/tests/test_yolo_format.py b/tests/test_yolo_format.py index 549811fd1c..615bb6d19f 100644 --- a/tests/test_yolo_format.py +++ b/tests/test_yolo_format.py @@ -6,7 +6,7 @@ from datumaro.components.extractor import (DatasetItem, AnnotationType, Bbox, LabelCategories, ) -from datumaro.components.project import Project, Dataset +from 
datumaro.components.dataset import Dataset from datumaro.plugins.yolo_format.extractor import YoloImporter from datumaro.plugins.yolo_format.converter import YoloConverter from datumaro.util.image import Image, save_image @@ -130,7 +130,6 @@ def test_can_import(self): 'label_' + str(i) for i in range(10)), }) - dataset = Project.import_from(DUMMY_DATASET_DIR, 'yolo') \ - .make_dataset() + dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'yolo') compare_datasets(self, expected_dataset, dataset)