diff --git a/config/prismjs/dvc-commands.js b/config/prismjs/dvc-commands.js index 4b4e7689ad..6558e26178 100644 --- a/config/prismjs/dvc-commands.js +++ b/config/prismjs/dvc-commands.js @@ -34,6 +34,9 @@ module.exports = [ 'metrics', 'params diff', 'params', + 'plot show', + 'plot diff', + 'plot', 'lock', 'list', 'install', diff --git a/content/docs/command-reference/plot/diff.md b/content/docs/command-reference/plot/diff.md new file mode 100644 index 0000000000..596a2f3f15 --- /dev/null +++ b/content/docs/command-reference/plot/diff.md @@ -0,0 +1,130 @@ +# plot diff + +Show multiple versions of +[continuous metrics](/doc/command-reference/plot#continuous-metrics) by plotting +them in a single image. + +## Synopsis + +```usage +usage: dvc plot diff [-h] [-q | -v] [-t [TEMPLATE]] [-d [DATAFILE]] [-f FILE] + [-s SELECT] [-x X] [-y Y] [--stdout] [--no-csv-header] + [--no-html] [--title TITLE] [--xlab XLAB] [--ylab YLAB] + +positional arguments: + revisions Git revisions to plot from +``` + +## Description + +This command visualizes the difference between continuous metrics among experiments +in the repository history. It requires that Git is being used to version the +metrics files. + +The metrics file needs to be specified through the `-d`/`--datafile` option. Also, a +plot can be customized by [Vega](https://vega.github.io/) templates through +option `--template`. To learn more about the file formats and templates please +see `dvc plot`. + +Run without any revision specified, this command compares metrics currently +present in the workspace (uncommitted changes) with the latest committed +version. A single specified revision shows the difference between the revision +and the version in the workspace. + +In contrast to many commands such as `git diff`, `dvc metrics diff` and +`dvc params diff`, the plot difference shows all the revisions in a single output +and is not limited to two versions. A user can specify as many revisions as +needed.
+ +The files with metrics can be files committed in Git as well as data files under +DVC control. In the case of data files, the file revision corresponds to the Git +revision of the [DVC-file](/doc/user-guide/dvc-file-format) that has this file as +an output. + +## Options + +- `-d [DATAFILE], --datafile [DATAFILE]` - Continuous metrics file to visualize. + +- `-t [TEMPLATE], --template [TEMPLATE]` - File to be injected with data. The + default template is `.dvc/plot/default.json`. See more details in `dvc plot`. + +- `-f FILE, --file FILE` - Name of the generated file. By default, the output + file name is equal to the input filename with additional `.html` suffix or + `.json` suffix for `--no-html` mode. + +- `--no-html` - Do not wrap output vega plot json with HTML. + +- `-s SELECT, --select SELECT` - Select which fields or jsonpath to put into + plot. All the fields will be included by default with DVC generated `index` + field - see `dvc plot`. + +- `-x X` - Field name for x axis. `index` is the default field for X. + +- `-y Y` - Field name for y axis. The default field is the last field found in + the input file: the last column in CSV file or the last field in the JSON + array object (the first object). + +- `--xlab XLAB` - X axis title. The X column name is the default title. + +- `--ylab YLAB` - Y axis title. The Y column name is the default title. + +- `--title TITLE` - Plot title. + +- `-o, --stdout` - Print plot content to stdout. + +- `--no-csv-header` - Provided CSV or TSV datafile does not have a header. + +- `-h`, `--help` - prints the usage/help message, and exit. + +- `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no + problems arise, otherwise 1. + +- `-v`, `--verbose` - displays detailed tracing information.
+ +## Examples + +The difference between an uncommitted version of the file and the last committed +one: + +```dvc +$ dvc plot diff -d logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +A new file `logs.csv.html` was generated. The user can open it in a web browser. + +![](/img/plot_diff_workspace.svg) + +The difference between two specified commits (multiple commits, tags or branches +can be specified): + +```dvc +$ dvc plot diff -d logs.csv HEAD 11c0bf1 +file:///Users/dmitry/src/plot/logs.csv.html +``` + +![](/img/plot_diff.svg) + +The predefined confusion matrix template shows how continuous metrics difference +can be faceted by separate plots: + +```csv +actual,predicted +cat,cat +cat,cat +cat,cat +cat,dog +cat,dinosaur +cat,dinosaur +cat,bird +turtle,dog +turtle,cat +... +``` + +```dvc +$ dvc plot diff -d classes.csv -t confusion +file:///Users/dmitry/src/test/plot_old/classes.csv.html +``` + +![](/img/plot_diff_confusion.svg) diff --git a/content/docs/command-reference/plot/index.md b/content/docs/command-reference/plot/index.md new file mode 100644 index 0000000000..697a9dd4de --- /dev/null +++ b/content/docs/command-reference/plot/index.md @@ -0,0 +1,239 @@ +# plot + +Contains commands to visualize +[continuous metrics](/doc/command-reference/plot#continuous-metrics) in +structured files (JSON, CSV, or TSV): [show](/doc/command-reference/plot/show), +[diff](/doc/command-reference/plot/diff). + +## Synopsis + +```usage +usage: dvc plot [-h] [-q | -v] {show,diff} ... + +positional arguments: + COMMAND + show Generate a plot image file from a continuous metrics file. + diff Plot continuous metrics differences between commits in the DVC + repository, or between the last commit and the workspace. +``` + +## Description + +DVC provides a set of commands to visualize _continuous metrics_ of machine +learning experiments. Usual examples of plots are AUC curves, loss functions, +and confusion matrices.
+ +Continuous metrics represent plots, and should be stored as data series in one +of the supported [file formats](#file-formats). These files are usually created +by users or generated by user modeling or data processing code. + +The plot commands can work with these continuous metrics files that are committed +to a repository history, data files controlled by DVC or files from workspace. +For example, the command `dvc plot diff` generates a plot with two versions of +the metrics: + +```dvc +$ dvc plot diff -d logs.csv +file:///Users/dmitry/src/plot/logs.html +``` + +![](/img/plot_auc.svg) + +### Difference between continuous and scalar metrics + +DVC has two concepts for metrics for representing the results of machine learning +training or data processing: + +1. `dvc metrics` to represent scalar numbers such as AUC, true positive rate and + others. +2. `dvc plot` to visualize continuous metrics such as AUC curve, loss function, + confusion matrices and others. + +In contrast to continuous metrics, scalar metrics should be stored in +hierarchical files such as JSON and YAML, and the `dvc metrics diff` command can +represent the difference between the metrics in different experiments as float +numbers. For example, the `AUC` metric is `0.801807` and was increased by `+0.037826` from +the previous value: + +```dvc +$ dvc metrics diff + Path Metric Value Change +summary.json AUC 0.801807 0.037826 +``` + +### File formats + +Supported file formats for continuous metrics are: JSON, CSV, TSV. DVC expects +to see an array (or multiple arrays) of objects (usually _float numbers_) in the +file. + +In tabular file formats such as CSV and TSV the array is a column. Plot command +can generate visuals for a specified column or a set of columns. Like `AUC` +column: + +``` +epoch, AUC, loss +34, 0.91935, 0.0317345 +35, 0.91913, 0.0317829 +36, 0.92256, 0.0304632 +37, 0.92302, 0.0299015 +``` + +In hierarchical file formats such as JSON an array of JSON-objects is expected.
+Plot command can generate visuals for a specified field name or a set of fields +from the array's object. Like `val_loss` field in the `train` array in this +example: + +``` +{ + "train": [ + {"val_accuracy": 0.9665, "val_loss": 0.10757}, + {"val_accuracy": 0.9764, "val_loss": 0.07324}, + {"val_accuracy": 0.8770, "val_loss": 0.08136}, + {"val_accuracy": 0.8740, "val_loss": 0.09026}, + {"val_accuracy": 0.8795, "val_loss": 0.07640}, + {"val_accuracy": 0.8803, "val_loss": 0.07608}, + {"val_accuracy": 0.8987, "val_loss": 0.08455} + ] +} +``` + +### Plot templates + +DVC generates plots as HTML files that a user can click and open in a web +browser. The HTML files contain plots as [Vega-Lite](https://vega.github.io/) +objects. The files can also be transformed to traditional PNG, JPEG, SVG image +formats using external tools. + +Vega is a declarative, programming language agnostic format of defining plots as +JSON specification. DVC gives users the ability to change the specification and +generate plots in the format that best fits the user's needs. At the same +time, it does not make DVC dependent on user's visualization code or any +programming language or environment which allows DVC to stay programming language +agnostic. + +Plot templates are stored in `.dvc/plot/` directory as json files. A user can +define their own templates or modify the existing ones. The default template is +`.dvc/plot/default.json`. User can change the template by `--template` or `-t` +option of `dvc plot show` or `dvc plot diff` commands and specifying a file +name. + +For templates in the templates directory the path and the json extension are not +required. User can specify only `--template scatter` instead of +`--template .dvc/plot/scatter.json`. Any custom template can be added to the +template directory. + +### Custom templates + +User can define their own template for specific plot types.
Any template file is +a JSON specification with predefined DVC anchors that help DVC to inject user's +data properly. + +All input JSON files of `dvc plot show` and `dvc plot diff` commands are +combined together into a single array for the injection to a template file. + +There are two important additional signals or fields that DVC adds: + +- `rev` - specified revision, tag or branch of input file. This option helps to + distinguish between different revisions of the file in `dvc plot diff` + command. + +- `index` - is an ordering number in the file. In many cases it corresponds to + machine learning training epoch or step number. + +DVC applies the same logic to all input CSV files but first transforms all CSV +data into JSON. DVC uses CSV files columns name from a header for JSON +conversion. + +DVC template anchors: + +- `<DVC_METRIC_DATA>` - Plotting command input data from either CSV or JSON + files is converted to JSON array and injected instead of this anchor. Two + additional signals will be added: `index` and `rev` - revision (see above). + +- `<DVC_METRIC_TITLE>` - A plot title that can be defined by `--title` option. + +- `<DVC_METRIC_Y>` - a field name for Y axis of the plot. It can be defined by + `-y` option of the commands. The default field is the last field found in the + input file: the last column in CSV file or the last field in the JSON array + object. + +- `<DVC_METRIC_X>` - a field name for X axis. It can be defined by `-x` option. + `index` is the default field for X. + +- `<DVC_METRIC_Y_TITLE>` - a displayed field label for Y. + +- `<DVC_METRIC_X_TITLE>` - a displayed field label for X. + +## Options + +- `-h`, `--help` - prints the usage/help message, and exit. + +- `-q`, `--quiet` - do not write anything to standard output. + +- `-v`, `--verbose` - displays detailed tracing information.
+ +## Examples + +Tabular file `logs.csv` visualization: + +``` +epoch,accuracy,loss,val_accuracy,val_loss +0,0.9418667,0.19958884770199656,0.9679,0.10217399864746257 +1,0.9763333,0.07896138601688048,0.9768,0.07310650711813942 +2,0.98375,0.05241111190887168,0.9788,0.06665669009438716 +3,0.98801666,0.03681169906261687,0.9781,0.06697812260198989 +4,0.99111664,0.027362171787042946,0.978,0.07385754839298315 +5,0.9932333,0.02069501801203781,0.9771,0.08009233058886166 +6,0.9945,0.017702101902437668,0.9803,0.07830339228538505 +7,0.9954,0.01396906608727198,0.9802,0.07247738889862157 +``` + +```dvc +$ dvc plot show logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +![](/img/plot_show.svg) + +Difference between the current file and the previous committed one: + +```dvc +$ dvc plot diff -d logs.csv HEAD^ +file:///Users/dmitry/src/plot/logs.csv.html +``` + +![](/img/plot_diff.svg) + +Visualize a specific field: + +```dvc +$ dvc plot show -y loss logs.csv +file:///Users/dmitry/src/plot/logs.html +``` + +![](/img/plot_show_field.svg) + +Confusion matrix template is predefined in DVC (file +`.dvc/plot/confusion_matrix.json`): + +```csv +actual,predicted +cat,cat +cat,cat +cat,cat +cat,dog +cat,dinosaur +cat,dinosaur +cat,bird +turtle,dog +turtle,cat +... +``` + +```dvc +$ dvc plot show classes.csv --template confusion -x actual -y predicted +file:///Users/dmitry/src/plot/classes.csv.html +``` + +![](/img/plot_show_confusion.svg) diff --git a/content/docs/command-reference/plot/show.md b/content/docs/command-reference/plot/show.md new file mode 100644 index 0000000000..3faeab1b33 --- /dev/null +++ b/content/docs/command-reference/plot/show.md @@ -0,0 +1,157 @@ +# plot show + +Generate a plot image from a +[continuous metrics](/doc/command-reference/plot#continuous-metrics) file.
+ +## Synopsis + +```usage +usage: dvc plot show [-h] [-q | -v] [-t [TEMPLATE]] [-f FILE] [-s SELECT] + [-x X] [-y Y] [--stdout] [--no-csv-header] [--no-html] + [--title TITLE] [--xlab XLAB] [--ylab YLAB] + +positional arguments: + datafile Metrics file to visualize +``` + +## Description + +This command provides a quick way to visualize continuous metrics such as loss +functions, AUC curves, confusion matrices etc. Please read `dvc plot` for more +information. + +## Options + +- `-t [TEMPLATE], --template [TEMPLATE]` - File to be injected with data. The + default template is `.dvc/plot/default.json`. See more details in `dvc plot`. + +- `-f FILE, --file FILE` - Name of the generated file. By default, the output + file name is equal to the input filename with additional `.html` suffix or + `.json` suffix for `--no-html` mode. + +- `--no-html` - Do not wrap output vega plot json with HTML. + +- `-s SELECT, --select SELECT` - Select which fields or jsonpath to put into + plot. All the fields will be included by default with DVC generated `index` + field - see `dvc plot`. + +- `-x X` - Field name for x axis. `index` is the default field for X. + +- `-y Y` - Field name for y axis. The default field is the last field found in + the input file: the last column in CSV file or the last field in the JSON + array object (the first object). + +- `--xlab XLAB` - X axis title. The X column name is the default title. + +- `--ylab YLAB` - Y axis title. The Y column name is the default title. + +- `--title TITLE` - Plot title. + +- `-o, --stdout` - Print plot content to stdout. + +- `--no-csv-header` - Provided CSV or TSV datafile does not have a header. + +- `-h`, `--help` - prints the usage/help message, and exit. + +- `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no + problems arise, otherwise 1. + +- `-v`, `--verbose` - displays detailed tracing information.
+ +## Examples + +Tabular file `logs.csv` visualization: + +```csv +epoch,accuracy,loss,val_accuracy,val_loss +0,0.9418667,0.19958884770199656,0.9679,0.10217399864746257 +1,0.9763333,0.07896138601688048,0.9768,0.07310650711813942 +2,0.98375,0.05241111190887168,0.9788,0.06665669009438716 +3,0.98801666,0.03681169906261687,0.9781,0.06697812260198989 +4,0.99111664,0.027362171787042946,0.978,0.07385754839298315 +5,0.9932333,0.02069501801203781,0.9771,0.08009233058886166 +6,0.9945,0.017702101902437668,0.9803,0.07830339228538505 +7,0.9954,0.01396906608727198,0.9802,0.07247738889862157 +``` + +By default, the command plots the last column of the tabular file. Please look +at the default behaviour of `-y` option. + +```dvc +$ dvc plot show logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +![](/img/plot_show.svg) + +Use `-y` option to change column to visualize: + +```dvc +$ dvc plot show -y loss logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +![](/img/plot_show_field.svg) + +In the previous example all the columns (or fields) were included into the +output file. You can select only a specified subset of the columns by `--select` +option which might be important for reducing the output file size. In this case +the default `index` column will be still included. + +```dvc +$ dvc plot show -y loss --select loss logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +A tabular file without header can be plotted with `--no-csv-header` option. A +field can be specified through column number (starting with 0): + +```dvc +$ dvc plot show --no-csv-header --field 2 logs.csv +file:///Users/dmitry/src/plot/logs.csv.html +``` + +In many automation scenarios (like CI/CD for ML), it is convenient to have Vega +specification instead of a whole HTML file because it might be used for +generating another image format like PNG or JPEG or just included to some web +page. `--no-html` option prevents adding HTML header and footer to the file.
+Note, the result file extension changes to JSON: + +``` +$ dvc plot show --no-html logs.csv +file:///Users/dmitry/src/plot/logs.csv.json +``` + +JSON file plotting example: + +```json +{ + "train": [ + { "accuracy": 0.96658, "loss": 0.10757 }, + { "accuracy": 0.97641, "loss": 0.07324 }, + { "accuracy": 0.87707, "loss": 0.08136 }, + { "accuracy": 0.87402, "loss": 0.09026 }, + { "accuracy": 0.8795, "loss": 0.0764 }, + { "accuracy": 0.88038, "loss": 0.07608 }, + { "accuracy": 0.89872, "loss": 0.08455 } + ] +} +``` + +DVC identifies and plots JSON-objects from the first JSON array it was able to +find. + +```dvc +$ dvc plot show train.json +file:///Users/dmitry/src/plot/train.json.html +``` + +![](/img/plot_show.svg) + +The field name can be specified with the same `-y` option. The signal from the +first JSON array with the specified name will be shown: + +```dvc +$ dvc plot show -y accuracy logs.json +file:///Users/dmitry/src/plot/logs.json.html +``` diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index e527b38dec..6115c80e8e 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -314,6 +314,21 @@ } ] }, + { + "label": "plot", + "slug": "plot", + "source": "plot/index.md", + "children": [ + { + "label": "plot show", + "slug": "show" + }, + { + "label": "plot diff", + "slug": "diff" + } + ] + }, { "label": "pull", "slug": "pull" diff --git a/static/img/plot_auc.svg b/static/img/plot_auc.svg new file mode 100644 index 0000000000..e18738ba55 --- /dev/null +++ b/static/img/plot_auc.svg @@ -0,0 +1 @@ +0246810x0.00.20.40.60.81.0AUCHEADworkspacerevlogs.csv \ No newline at end of file diff --git a/static/img/plot_diff.svg b/static/img/plot_diff.svg new file mode 100644 index 0000000000..6576531047 --- /dev/null +++ b/static/img/plot_diff.svg @@ -0,0 +1 @@ +01234567x0.000.020.040.060.080.10val_lossHEAD^workspacerevlogs.csv \ No newline at end of file diff --git a/static/img/plot_diff_confusion.svg b/static/img/plot_diff_confusion.svg
new file mode 100644 index 0000000000..f3853c292e --- /dev/null +++ b/static/img/plot_diff_confusion.svg @@ -0,0 +1 @@ +revbirdcatdinosaurdogturtleactualbirdcatdinosaurdogturtlepredictedbirdcatdinosaurdogturtlepredictedHEADworkspace152Count of Records \ No newline at end of file diff --git a/static/img/plot_diff_workspace.svg b/static/img/plot_diff_workspace.svg new file mode 100644 index 0000000000..b7b1b29375 --- /dev/null +++ b/static/img/plot_diff_workspace.svg @@ -0,0 +1 @@ +01234567x0.000.020.040.060.080.10val_lossHEADworkspacerevlogs.csv \ No newline at end of file diff --git a/static/img/plot_show.svg b/static/img/plot_show.svg new file mode 100644 index 0000000000..eae88eb33e --- /dev/null +++ b/static/img/plot_show.svg @@ -0,0 +1 @@ +01234567x0.000.020.040.060.080.10val_lossworkspacerevlogs.csv \ No newline at end of file diff --git a/static/img/plot_show_confusion.svg b/static/img/plot_show_confusion.svg new file mode 100644 index 0000000000..2e94daf1a5 --- /dev/null +++ b/static/img/plot_show_confusion.svg @@ -0,0 +1 @@ +revbirdcatdinosaurdogturtleactualbirdcatdinosaurdogturtlepredictedworkspace136Count of Records \ No newline at end of file diff --git a/static/img/plot_show_field.svg b/static/img/plot_show_field.svg new file mode 100644 index 0000000000..d4830836a9 --- /dev/null +++ b/static/img/plot_show_field.svg @@ -0,0 +1 @@ +01234567x0.000.050.100.150.20lossworkspacerevlogs.csv \ No newline at end of file