Skip to content

Commit

Permalink
Add dataset export facility (#813)
Browse files Browse the repository at this point in the history
* Add datumaro django application
* Add cvat task datumaro bindings
* Add REST api for task export
* Add scheduler service
* Updated CHANGELOG.md
  • Loading branch information
zhiltsov-max authored and nmanovic committed Nov 22, 2019
1 parent 3aa4abf commit 74f720a
Show file tree
Hide file tree
Showing 82 changed files with 10,370 additions and 0 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ before_script:

script:
- docker exec -it cvat /bin/bash -c 'python3 manage.py test cvat/apps utils/cli'
- docker exec -it cvat /bin/bash -c 'python3 manage.py test datumaro/'
- docker exec -it cvat /bin/bash -c 'cd cvat-core && npm install && npm run test && npm run coveralls'
17 changes: 17 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,22 @@
"env": {},
"console": "internalConsole"
},
{
"name": "server: RQ - scheduler",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"justMyCode": false,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/manage.py",
"args": [
"rqscheduler",
],
"django": true,
"cwd": "${workspaceFolder}",
"env": {},
"console": "internalConsole"
},
{
"name": "server: RQ - low",
"type": "python",
Expand Down Expand Up @@ -177,6 +193,7 @@
"server: django",
"server: RQ - default",
"server: RQ - low",
"server: RQ - scheduler",
"server: git",
]
}
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ https://github.com/opencv/cvat/issues/750).
- Auto segmentation using Mask_RCNN component (Keras+Tensorflow Mask R-CNN Segmentation)
- Added MOT CSV format support
- Ability to dump/load annotations in LabelMe format from UI
- REST API to export an annotation task (images + annotations)
- Datumaro, an experimental framework to build, analyze, debug and visualize datasets for DL algorithms

### Changed
-
Expand Down
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ COPY utils ${HOME}/utils
COPY cvat/ ${HOME}/cvat
COPY cvat-core/ ${HOME}/cvat-core
COPY tests ${HOME}/tests
COPY datumaro/ ${HOME}/datumaro

RUN sed -r "s/^(.*)#.*$/\1/g" ${HOME}/datumaro/requirements.txt | xargs -n 1 -L 1 pip3 install --no-cache-dir

# Binary option is necessary to correctly apply the patch on Windows platform.
# https://unix.stackexchange.com/questions/239364/how-to-fix-hunk-1-failed-at-1-different-line-endings-message
RUN patch --binary -p1 < ${HOME}/cvat/apps/engine/static/engine/js/3rdparty.patch
Expand Down
Empty file.
176 changes: 176 additions & 0 deletions cvat/apps/dataset_manager/bindings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
from collections import OrderedDict
import os
import os.path as osp

from django.db import transaction

from cvat.apps.annotation.annotation import Annotation
from cvat.apps.engine.annotation import TaskAnnotation
from cvat.apps.engine.models import Task, ShapeType

import datumaro.components.extractor as datumaro
from datumaro.util.image import lazy_image


class CvatImagesDirExtractor(datumaro.Extractor):
    """Datumaro extractor over a directory tree of image files.

    The directory ``url`` is walked recursively. Every regular file whose
    name ends with one of ``_SUPPORTED_FORMATS`` becomes a
    ``datumaro.DatasetItem``; its id is the value returned by
    ``Task.get_image_frame(path)`` and its image is ``lazy_image(path)``.
    Items are stored ordered by id.
    """

    # Lower-case file extensions that are treated as images.
    _SUPPORTED_FORMATS = ['.png', '.jpg']

    def __init__(self, url):
        """Collect the image items found under the directory ``url``."""
        super().__init__()

        items = []
        for dirpath, _, filenames in os.walk(url):
            for name in filenames:
                path = osp.join(dirpath, name)
                if not self._is_image(path):
                    continue
                item_id = Task.get_image_frame(path)
                item = datumaro.DatasetItem(
                    id=item_id, image=lazy_image(path))
                items.append((item.id, item))

        # os.walk() order is arbitrary; sort by id for stable iteration.
        items.sort(key=lambda e: e[0])
        self._items = OrderedDict(items)

        self._subsets = None

    def __iter__(self):
        """Iterate over the items in ascending id order."""
        yield from self._items.values()

    def __len__(self):
        """Return the number of collected items."""
        return len(self._items)

    def subsets(self):
        """Return the subsets of this source (always ``None`` here)."""
        return self._subsets

    def get(self, item_id, subset=None, path=None):
        """Return the item with id ``item_id``.

        Raises:
            KeyError: if ``subset`` or ``path`` is given (neither is
                supported by this source) or if ``item_id`` is unknown.
        """
        if path or subset:
            raise KeyError()
        return self._items[item_id]

    def _is_image(self, path):
        """Tell whether ``path`` is an existing file with an image extension.

        The extension is compared case-insensitively, so e.g. ``.JPG``
        files are recognized as well.
        """
        if not osp.isfile(path):
            return False
        return path.lower().endswith(tuple(self._SUPPORTED_FORMATS))


class CvatTaskExtractor(datumaro.Extractor):
    """Datumaro extractor over the annotations of a CVAT task.

    On construction the task annotations are read through
    ``TaskAnnotation`` / ``Annotation``, grouped by frame, and converted
    to ``datumaro.DatasetItem`` objects (id = frame number) that carry
    Datumaro annotation objects. Items are stored ordered by id.
    """

    def __init__(self, url, db_task, user):
        """Load and convert all annotations of ``db_task``.

        Args:
            url: not used by this class.
            db_task: the task model instance; its ``id`` and ``label_set``
                are read.
            user: passed through to ``TaskAnnotation``.
        """
        # Initialize the base class, as the other extractors of this
        # package do.
        super().__init__()

        self._db_task = db_task
        self._categories = self._load_categories()
        # The label tables do not depend on the frame, so they are built
        # once here instead of being re-queried for every frame.
        self._label_map, self._label_attrs = self._load_label_info()

        cvat_annotations = TaskAnnotation(db_task.id, user)
        with transaction.atomic():
            cvat_annotations.init_from_db()
        cvat_annotations = Annotation(cvat_annotations.ir_data, db_task)

        dm_annotations = []
        for cvat_anno in cvat_annotations.group_by_frame():
            dm_anno = self._read_cvat_anno(cvat_anno)
            dm_item = datumaro.DatasetItem(
                id=cvat_anno.frame, annotations=dm_anno)
            dm_annotations.append((dm_item.id, dm_item))

        dm_annotations.sort(key=lambda e: e[0])
        self._items = OrderedDict(dm_annotations)

        self._subsets = None

    def __iter__(self):
        """Iterate over the items in ascending id order."""
        yield from self._items.values()

    def __len__(self):
        """Return the number of items (annotated frames)."""
        return len(self._items)

    def subsets(self):
        """Return the subsets of this source (always ``None`` here)."""
        return self._subsets

    def get(self, item_id, subset=None, path=None):
        """Return the item with id ``item_id``.

        Raises:
            KeyError: if ``subset`` or ``path`` is given (neither is
                supported by this source) or if ``item_id`` is unknown.
        """
        if path or subset:
            raise KeyError()
        return self._items[item_id]

    def _load_categories(self):
        """Build the Datumaro label categories from the task labels.

        Every task label name is added as a label; every attribute spec
        name of every label is added to the categories' attribute set.
        """
        categories = {}
        label_categories = datumaro.LabelCategories()

        for db_label in self._db_task.label_set.all():
            label_categories.add(db_label.name)
            for db_attr in db_label.attributespec_set.all():
                label_categories.attributes.add(db_attr.name)

        categories[datumaro.AnnotationType.label] = label_categories

        return categories

    def categories(self):
        """Return the categories dict built by ``_load_categories``."""
        return self._categories

    def _load_label_info(self):
        """Build the per-label lookup tables used during conversion.

        Returns:
            A ``(label_map, label_attrs)`` pair: ``label_map`` maps a task
            label name to the first element of ``label_cat.find(name)``,
            ``label_attrs`` maps a task label name to a dict of its
            attribute names and their default values.
        """
        label_cat = self.categories()[datumaro.AnnotationType.label]

        label_map = {}
        label_attrs = {}
        for db_label in self._db_task.label_set.all():
            label_map[db_label.name] = label_cat.find(db_label.name)[0]
            label_attrs[db_label.name] = {
                db_attr.name: db_attr.default_value
                for db_attr in db_label.attributespec_set.all()
            }
        return label_map, label_attrs

    def _merge_attributes(self, label_name, attributes):
        """Return the label's default attributes overridden by ``attributes``."""
        # Copy, so the shared defaults table is never mutated.
        merged = dict(self._label_attrs[label_name])
        for attr in attributes:
            merged[attr.name] = attr.value
        return merged

    def _read_cvat_anno(self, cvat_anno):
        """Convert the annotations of one frame to Datumaro objects.

        Args:
            cvat_anno: a per-frame annotation group; its ``tags`` and
                ``labeled_shapes`` are read.

        Returns:
            A list with a ``LabelObject`` per tag followed by a
            points / polyline / polygon / bbox object per shape.

        Raises:
            KeyError: if an annotation refers to a label that is not in
                the task's label set.
            Exception: if a shape has an unknown type.
        """
        item_anno = []

        for tag_obj in cvat_anno.tags:
            anno_label = self._label_map[tag_obj.label]
            anno_attr = self._merge_attributes(
                tag_obj.label, tag_obj.attributes)

            anno = datumaro.LabelObject(label=anno_label,
                attributes=anno_attr, group=tag_obj.group)
            item_anno.append(anno)

        for shape_obj in cvat_anno.labeled_shapes:
            anno_group = shape_obj.group
            anno_label = self._label_map[shape_obj.label]
            anno_attr = self._merge_attributes(
                shape_obj.label, shape_obj.attributes)

            anno_points = shape_obj.points
            if shape_obj.type == ShapeType.POINTS:
                anno = datumaro.PointsObject(anno_points,
                    label=anno_label, attributes=anno_attr, group=anno_group)
            elif shape_obj.type == ShapeType.POLYLINE:
                anno = datumaro.PolyLineObject(anno_points,
                    label=anno_label, attributes=anno_attr, group=anno_group)
            elif shape_obj.type == ShapeType.POLYGON:
                anno = datumaro.PolygonObject(anno_points,
                    label=anno_label, attributes=anno_attr, group=anno_group)
            elif shape_obj.type == ShapeType.RECTANGLE:
                # The four point values are unpacked as two corners and
                # passed on as (x, y, width, height).
                x0, y0, x1, y1 = anno_points
                anno = datumaro.BboxObject(x0, y0, x1 - x0, y1 - y0,
                    label=anno_label, attributes=anno_attr, group=anno_group)
            else:
                raise Exception("Unknown shape type '%s'" % (shape_obj.type))

            item_anno.append(anno)

        return item_anno
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from collections import OrderedDict
import getpass
import json
import os, os.path as osp
import requests

from datumaro.components.config import (Config,
SchemaBuilder as _SchemaBuilder,
)
import datumaro.components.extractor as datumaro
from datumaro.util.image import lazy_image, load_image

from cvat.utils.cli.core import CLI as CVAT_CLI, CVAT_API_V1


# Schema of the source's configuration: the id of the task and the
# host / port of the server it is fetched from.
CONFIG_SCHEMA = (
    _SchemaBuilder()
    .add('task_id', int)
    .add('server_host', str)
    .add('server_port', int)
    .build()
)

class cvat_rest_api_task_images(datumaro.Extractor):
    """Datumaro extractor for task images fetched through the CVAT CLI.

    The source directory ``url`` must contain ``config.json`` (validated
    against ``CONFIG_SCHEMA``) and ``images_meta.json`` (a dict with an
    ``images`` list whose entries have an ``id``). One item is created per
    entry. Images are loaded lazily: on first access a frame is downloaded
    into ``<url>/images`` unless a cached file already exists there.
    Credentials are requested interactively on the first download.
    """

    def __init__(self, url):
        """Read the config and image list from the directory ``url``."""
        super().__init__()

        # Define the connection state before any I/O, so that __del__()
        # and _connect() always find these attributes.
        self._cvat_cli = None
        self._session = None

        local_dir = url
        self._local_dir = local_dir
        self._cache_dir = osp.join(local_dir, 'images')

        with open(osp.join(url, 'config.json'), 'r') as config_file:
            config = json.load(config_file)
            config = Config(config, schema=CONFIG_SCHEMA)
        self._config = config

        with open(osp.join(url, 'images_meta.json'), 'r') as images_file:
            images_meta = json.load(images_file)
            image_list = images_meta['images']

        items = []
        for entry in image_list:
            item_id = entry['id']
            item = datumaro.DatasetItem(
                id=item_id, image=self._make_image_loader(item_id))
            items.append((item.id, item))

        items.sort(key=lambda e: e[0])
        self._items = OrderedDict(items)

    def __del__(self):
        """Close the HTTP session, if one was opened."""
        # getattr() guards against __init__() having failed early.
        session = getattr(self, '_session', None)
        if session is not None:
            session.close()

    def _image_local_path(self, item_id):
        """Return the cache file path for frame ``item_id`` (an integer)."""
        task_id = self._config.task_id
        return osp.join(self._cache_dir,
            'task_{}_frame_{:06d}.jpg'.format(task_id, item_id))

    def _make_image_loader(self, item_id):
        """Return a ``lazy_image`` wrapper that loads frame ``item_id``."""
        return lazy_image(item_id,
            lambda item_id: self._image_loader(item_id, self))

    def _is_image_cached(self, item_id):
        """Tell whether the cache file for frame ``item_id`` exists."""
        return osp.isfile(self._image_local_path(item_id))

    def _download_image(self, item_id):
        """Download frame ``item_id`` into the cache directory."""
        self._connect()
        os.makedirs(self._cache_dir, exist_ok=True)
        self._cvat_cli.tasks_frame(task_id=self._config.task_id,
            frame_ids=[item_id], outdir=self._cache_dir)

    def _connect(self):
        """Open an authenticated session to the server, once.

        Prompts for the user name and password on the console. Does
        nothing if a session is already open.

        Raises:
            Exception: whatever the prompt or the client setup raised;
                the partially created session is closed first.
        """
        if self._session is not None:
            return

        session = None
        try:
            print("Enter credentials for '%s:%s':" % \
                (self._config.server_host, self._config.server_port))
            username = input('User: ')
            password = getpass.getpass()

            session = requests.Session()
            session.auth = (username, password)

            api = CVAT_API_V1(self._config.server_host,
                self._config.server_port)
            cli = CVAT_CLI(session, api)

            self._session = session
            self._cvat_cli = cli
        except Exception:
            if session is not None:
                session.close()
            # Re-raise: swallowing the error would leave _cvat_cli unset
            # and make the caller fail later with an unrelated
            # AttributeError.
            raise

    @staticmethod
    def _image_loader(item_id, extractor):
        """Load frame ``item_id``, downloading it first if it is not cached."""
        if not extractor._is_image_cached(item_id):
            extractor._download_image(item_id)
        local_path = extractor._image_local_path(item_id)
        return load_image(local_path)

    def __iter__(self):
        """Iterate over the items in ascending id order."""
        yield from self._items.values()

    def __len__(self):
        """Return the number of items."""
        return len(self._items)

    def subsets(self):
        """Return the subsets of this source (always ``None``)."""
        return None

    def get(self, item_id, subset=None, path=None):
        """Return the item with id ``item_id``.

        Raises:
            KeyError: if ``subset`` or ``path`` is given (neither is
                supported by this source) or if ``item_id`` is unknown.
        """
        if path or subset:
            raise KeyError()
        return self._items[item_id]
Loading

0 comments on commit 74f720a

Please sign in to comment.