diff --git a/.github/workflows/publish-release.yaml b/.github/workflows/publish-release.yaml index cd63fec..8f64962 100644 --- a/.github/workflows/publish-release.yaml +++ b/.github/workflows/publish-release.yaml @@ -17,7 +17,7 @@ jobs: - name: Install Hugo uses: peaceiris/actions-hugo@v2 with: - hugo-version: v0.96.0 + hugo-version: latest extended: true - name: Install Go diff --git a/.gitignore b/.gitignore index 265d4b6..1708630 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,7 @@ -.tags* +/.tags* /dud -/mockery -/goreleaser -depgraph.png +/depgraph.png +/website # auto-generated hugo files # (e.g. cobra CLI and converted Jupyter notebooks) diff --git a/Makefile b/Makefile index dc65be0..5fdbd89 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ bench: test-short .PHONY: serve-jupyter serve-jupyter: $(GOBIN)/dud - jupyter notebook -y --ip=0.0.0.0 ./hugo/notebooks/ + jupyter notebook -y --ip=0.0.0.0 --notebook-dir ./hugo/notebooks/ # First delete stale supporting files, then convert the notebook to markdown, # and finally clear the notebook outputs. @@ -113,7 +113,7 @@ bench-docs: hugo/content/benchmarks/_index.md hugo/content/%.md: hugo/notebooks/%.md mkdir -p '$(dir $@)' - awk --lint=fatal -f ./hugo/notebooks/fix_md.awk '$<' > '$@' + gawk --lint=fatal -f ./hugo/notebooks/fix_md.awk '$<' > '$@' $(eval supporting_files = $(wildcard $(patsubst %.md,%_files,$<)/*.*)) if test -n "$(supporting_files)"; then cp -v $(supporting_files) $(dir $@); fi @@ -150,8 +150,10 @@ serve-hugo: hugo server \ --disableFastRender \ --bind 0.0.0.0 \ - --baseUrl $(shell hostname -i | xargs)/dud/ + --port 8888 \ + --baseUrl "$(shell hostname -i | xargs)/dud/" # xargs trims whitespace from the hostname +# Port 8888 matches the port exposed by the docker rule above. 
.PHONY: coverage coverage: diff --git a/hugo/content/benchmarks/_index.md b/hugo/content/benchmarks/_index.md index 7bcbfb0..0ed9d05 100644 --- a/hugo/content/benchmarks/_index.md +++ b/hugo/content/benchmarks/_index.md @@ -8,13 +8,13 @@ **RAM**: 16 GB -**Go version**: 1.18.3 +**Go version**: 1.18.4 -**Rclone version**: rclone v1.58.1 +**Rclone version**: rclone v1.59.0 -**Dud version**: v0.3.1-12-g3f98bdd +**Dud version**: v0.4.0 -**DVC version**: 2.11.0 (pip) +**DVC version**: 2.13.0 (pip) DVC non-default configuration: @@ -30,32 +30,32 @@ This dataset consists of four 1 GB files in a single directory. | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 0.018 ± 0.008 | 0.010 | 0.024 | 1.00 | -| `DVC` | 0.349 ± 0.008 | 0.340 | 0.355 | 18.99 ± 7.95 | +| `Dud` | 0.023 ± 0.000 | 0.022 | 0.023 | 1.00 | +| `DVC` | 0.338 ± 0.013 | 0.326 | 0.352 | 14.92 ± 0.64 | ### commit | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 0.665 ± 0.013 | 0.656 | 0.680 | 1.00 | -| `DVC` | 3.669 ± 0.034 | 3.645 | 3.708 | 5.52 ± 0.12 | +| `Dud` | 0.677 ± 0.030 | 0.644 | 0.701 | 1.00 | +| `DVC` | 6.483 ± 0.012 | 6.471 | 6.495 | 9.57 ± 0.42 | ### fetch | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 11.792 ± 0.519 | 11.224 | 12.244 | 1.56 ± 0.16 | -| `DVC` | 7.562 ± 0.703 | 6.865 | 8.270 | 1.00 | +| `Dud` | 11.876 ± 1.790 | 10.703 | 13.936 | 1.53 ± 0.43 | +| `DVC` | 7.764 ± 1.813 | 5.759 | 9.288 | 1.00 | ### push | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 11.577 ± 0.336 | 11.340 | 11.961 | 1.00 | -| `DVC` | 12.794 ± 6.961 | 8.587 | 20.829 | 1.11 ± 0.60 | +| `Dud` | 10.755 ± 0.240 | 10.538 | 11.012 | 1.52 ± 0.30 | +| `DVC` | 7.082 ± 1.393 | 5.987 | 8.650 | 1.00 | ### status | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 0.021 ± 0.002 | 0.019 | 0.023 | 1.00 | -| `DVC` | 0.303 ± 
0.011 | 0.294 | 0.315 | 14.21 ± 1.27 | +| `Dud` | 0.018 ± 0.006 | 0.011 | 0.022 | 1.00 | +| `DVC` | 0.298 ± 0.006 | 0.291 | 0.303 | 16.57 ± 5.21 | ## Many small files This dataset consists of twenty thousand 100 KB files in a single directory. @@ -64,29 +64,29 @@ This dataset consists of twenty thousand 100 KB files in a single directory. | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 1.969 ± 0.025 | 1.941 | 1.988 | 1.00 | -| `DVC` | 11.483 ± 8.241 | 6.644 | 20.998 | 5.83 ± 4.19 | +| `Dud` | 1.262 ± 0.430 | 1.013 | 1.759 | 1.00 | +| `DVC` | 9.021 ± 0.063 | 8.983 | 9.093 | 7.15 ± 2.44 | ### commit | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 1.583 ± 0.261 | 1.423 | 1.885 | 1.00 | -| `DVC` | 46.087 ± 6.321 | 39.050 | 51.285 | 29.11 ± 6.25 | +| `Dud` | 1.668 ± 0.030 | 1.636 | 1.695 | 1.00 | +| `DVC` | 45.934 ± 5.117 | 40.035 | 49.171 | 27.54 ± 3.11 | ### fetch | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 7.227 ± 0.226 | 6.968 | 7.382 | 1.00 | -| `DVC` | 56.176 ± 2.097 | 53.845 | 57.910 | 7.77 ± 0.38 | +| `Dud` | 7.605 ± 0.042 | 7.563 | 7.647 | 1.00 | +| `DVC` | 57.540 ± 2.498 | 56.029 | 60.422 | 7.57 ± 0.33 | ### push | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 5.644 ± 0.051 | 5.586 | 5.683 | 1.00 | -| `DVC` | 44.879 ± 0.184 | 44.745 | 45.089 | 7.95 ± 0.08 | +| `Dud` | 9.896 ± 4.191 | 7.438 | 14.735 | 1.00 | +| `DVC` | 44.301 ± 0.245 | 44.072 | 44.558 | 4.48 ± 1.90 | ### status | Command | Mean [s] | Min [s] | Max [s] | Relative | |:---|---:|---:|---:|---:| -| `Dud` | 0.290 ± 0.041 | 0.266 | 0.337 | 1.00 | -| `DVC` | 1.075 ± 0.018 | 1.062 | 1.095 | 3.70 ± 0.52 | +| `Dud` | 0.283 ± 0.012 | 0.270 | 0.295 | 1.00 | +| `DVC` | 1.575 ± 0.015 | 1.562 | 1.592 | 5.56 ± 0.24 | diff --git a/hugo/content/getting_started.md b/hugo/content/getting_started.md index 3ce5f80..b5717a5 100644 --- 
a/hugo/content/getting_started.md +++ b/hugo/content/getting_started.md @@ -20,7 +20,7 @@ As Dud tells us, `dud init` creates a `.dud` directory and some config files. Du Next, let's download the CIFAR-10 computer vision dataset. - $ curl -C- -sO https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz + $ curl -sSO 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' $ du -h cifar-10-python.tar.gz 163M cifar-10-python.tar.gz @@ -53,7 +53,7 @@ Now we have a stage file, but we need to register it with Dud. We do that with ` $ dud commit committing stage cifar.yaml - cifar-10-python.tar.gz 162.60 MiB / 162.60 MiB 100% ?/s 66ms total + cifar-10-python.tar.gz 162.60 MiB / 162.60 MiB 100% ?/s 65ms total `dud commit` goes through all of our stages (in this case, just `cifar.yaml`) and copies their files/directories to the Dud cache. The cache is a directory that holds all versions of all files and directories owned by Dud. By default, the cache lives at `.dud/cache/`, but its location is configurable (see `.dud/config.yaml`). @@ -61,16 +61,16 @@ To get a better sense of what `dud commit` did, let's look at the directory stru $ tree -an . + ├── .dud + │   ├── .gitignore + │   ├── cache + │   │   └── fe + │   │   └── 3d11c475ae0f6fec91f3cf42f9c69e87dc32ec6b44a83f8b22544666e25eea + │   ├── config.yaml + │   ├── index + │   └── rclone.conf ├── cifar-10-python.tar.gz -> .dud/cache/fe/3d11c475ae0f6fec91f3cf42f9c69e87dc32ec6b44a83f8b22544666e25eea - ├── cifar.yaml - └── .dud - ├── cache - │   └── fe - │   └── 3d11c475ae0f6fec91f3cf42f9c69e87dc32ec6b44a83f8b22544666e25eea - ├── config.yaml - ├── .gitignore - ├── index - └── rclone.conf + └── cifar.yaml 3 directories, 7 files @@ -79,9 +79,9 @@ Our tarball has been replaced with a link to a file in Dud's cache. That cached But how do we make sure we don't corrupt the cached version of the tarball? What happens if we accidentally modify our dataset? 
$ echo 'accidental overwrite' > cifar-10-python.tar.gz - /usr/sbin/sh: line 1: cifar-10-python.tar.gz: Permission denied + /usr/bin/sh: 1: cannot create cifar-10-python.tar.gz: Permission denied -Dud makes it very difficult to accidentally modify committed files. When Dud commits a file, it makes the link to the cache read-only. +Dud makes it difficult to accidentally modify committed files. When Dud commits a file, it makes the link to the cache read-only. The tarball isn't the only thing that's changed. Let's look at our stage file: @@ -101,6 +101,7 @@ Dud recorded the tarball's checksum in the stage file. (It also checksummed the $ dud checkout checking out stage cifar.yaml + cifar-10-python.tar.gz 1 / 1 100% ?/s 0s total $ readlink -v cifar-10-python.tar.gz .dud/cache/fe/3d11c475ae0f6fec91f3cf42f9c69e87dc32ec6b44a83f8b22544666e25eea @@ -158,6 +159,7 @@ This looks as expected. Our tarball is committed, but we haven't extracted it ye $ dud run nothing to do for stage cifar.yaml + running stage extract_cifar.yaml cifar-10-batches-py/ cifar-10-batches-py/data_batch_4 @@ -181,9 +183,10 @@ Congrats on [defusing the bomb](https://xkcd.com/1168/)! Now that we know our pi $ dud commit committing stage cifar.yaml - cifar-10-python.tar.gz up-to-date; skipping commit + cifar-10-python.tar.gz up-to-date; skipping commit + committing stage extract_cifar.yaml - cifar-10-batches-py 177.59 MiB / 177.59 MiB 100% ?/s 32ms total + cifar-10-batches-py 177.59 MiB / 177.59 MiB 100% ?/s 30ms total Notice that Dud detected that the tarball from `cifar.yaml` hasn't changed, so it knew not to waste time committing it again. @@ -202,6 +205,7 @@ Because everything is up-to-date, I bet you can guess what happens if we try re- $ dud run nothing to do for stage cifar.yaml + nothing to do for stage extract_cifar.yaml Because both of our stages are committed and up-to-date, Dud detects that there's no sense in re-extracting the tarball. Excellent! 
@@ -273,20 +277,25 @@ We're now ready to push our data to the remote cache! We do this with one simple $ dud push pushing stage cifar.yaml - Transferred: 162.600Mi / 162.600 MiByte, 100%, 0 Byte/s, ETA - + Gathering files 1 + Transferred: 162.600 MiB / 162.600 MiB, 100%, 0 B/s, ETA - Transferred: 1 / 1, 100% - Elapsed time: 0.4s + Elapsed time: 0.3s + Fixing permissions 1 / 1 + pushing stage extract_cifar.yaml - Transferred: 177.589Mi / 177.589 MiByte, 100%, 0 Byte/s, ETA - + Gathering files 9 + Transferred: 177.589 MiB / 177.589 MiB, 100%, 0 B/s, ETA - Transferred: 9 / 9, 100% Elapsed time: 0.2s + Fixing permissions 9 / 9 `dud push` goes through all of our stages, looks up their committed artifacts (by checksum), and instructs rclone to copy them to the remote cache. We can confirm our artifacts were copied to `/tmp/dud/cache` using rclone as well, which provides the `check` command to compare two directories (or indeed remotes): $ rclone check .dud/cache /tmp/dud/cache - 2022/01/02 16:37:52 NOTICE: Config file "/home/user/.config/rclone/rclone.conf" not found - using defaults - 2022/01/02 16:37:52 NOTICE: Local file system at /tmp/dud/cache: 0 differences found - 2022/01/02 16:37:52 NOTICE: Local file system at /tmp/dud/cache: 10 matching files + 2022/07/18 01:32:43 NOTICE: Config file "/home/user/.config/rclone/rclone.conf" not found - using defaults + 2022/07/18 01:32:45 NOTICE: Local file system at /tmp/dud/cache: 0 differences found + 2022/07/18 01:32:45 NOTICE: Local file system at /tmp/dud/cache: 10 matching files Sure enough, rclone reports that `.dud/cache` and `/tmp/dud/cache` are identical. If it were "real", our collaborators with access to `fake_remote` could now access all of the data we've committed so far! Let's pretend we are one of those collaborators, and we need to fetch the data files from the remote cache. 
We can do that with the aptly-named `fetch` command: @@ -294,21 +303,25 @@ Sure enough, rclone reports that `.dud/cache` and `/tmp/dud/cache` are identical $ dud fetch fetching stage cifar.yaml - Transferred: 162.600Mi / 162.600 MiByte, 100%, 0 Byte/s, ETA - + Transferred: 162.600 MiB / 162.600 MiB, 100%, 0 B/s, ETA - Transferred: 1 / 1, 100% Elapsed time: 0.3s + Fixing permissions 1 / 1 + fetching stage extract_cifar.yaml - Transferred: 974 / 974 Byte, 100%, 0 Byte/s, ETA - + Transferred: 974 B / 974 B, 100%, 0 B/s, ETA - Transferred: 1 / 1, 100% Elapsed time: 0.0s - Transferred: 177.588Mi / 177.588 MiByte, 100%, 0 Byte/s, ETA - + Fixing permissions 1 / 1 + Transferred: 177.588 MiB / 177.588 MiB, 100%, 0 B/s, ETA - Transferred: 8 / 8, 100% Elapsed time: 0.2s + Fixing permissions 8 / 8 $ rclone check .dud/cache /tmp/dud/cache - 2022/01/02 16:37:55 NOTICE: Config file "/home/user/.config/rclone/rclone.conf" not found - using defaults - 2022/01/02 16:37:55 NOTICE: Local file system at /tmp/dud/cache: 0 differences found - 2022/01/02 16:37:55 NOTICE: Local file system at /tmp/dud/cache: 10 matching files + 2022/07/18 01:32:47 NOTICE: Config file "/home/user/.config/rclone/rclone.conf" not found - using defaults + 2022/07/18 01:32:48 NOTICE: Local file system at /tmp/dud/cache: 0 differences found + 2022/07/18 01:32:48 NOTICE: Local file system at /tmp/dud/cache: 10 matching files `dud fetch` is the inverse of `dud push`; it looks up artifacts the same way `push` does (from stage files), but it copies _from_ the remote cache _to_ the local cache. @@ -322,7 +335,7 @@ The steps below gloss over installing and configuring Git. 
If you are new to Git First things first, let's create a Git repository for the project: - $ git init -b main + $ git init Initialized empty Git repository in /home/user/cifar/.git/ Let's take a look at our status to decide what to commit: @@ -390,7 +403,7 @@ Let's tell Git to track everything else: With everything in order, let's commit our code: $ git commit -m 'initial commit' - [main (root-commit) 5329de9] initial commit + [main (root-commit) ca689da] initial commit 7 files changed, 25 insertions(+) create mode 100644 .dud/.gitignore create mode 100644 .dud/config.yaml diff --git a/hugo/content/getting_started_51_0.png b/hugo/content/getting_started_51_0.png index 3355e82..258e63d 100644 Binary files a/hugo/content/getting_started_51_0.png and b/hugo/content/getting_started_51_0.png differ diff --git a/hugo/notebooks/getting_started.ipynb b/hugo/notebooks/getting_started.ipynb index a904dcc..8b28594 100644 --- a/hugo/notebooks/getting_started.ipynb +++ b/hugo/notebooks/getting_started.ipynb @@ -39,7 +39,7 @@ }, "outputs": [], "source": [ - "!rm -rf ~/cifar" + "!rm -rf ~/cifar /tmp/dud/cache" ] }, { @@ -94,17 +94,18 @@ }, "outputs": [], "source": [ + "%%bash\n", "# Bypass downloading if we've cached the tarball.\n", - "!test -f /dud-data/cifar-10-python.tar.gz && cp -v /dud-data/cifar-10-python.tar.gz ." + "test -s ~/dud-data/cifar-10-python.tar.gz || \\\n", + " curl -o ~/dud-data/cifar-10-python.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n", + "cp -v ~/dud-data/cifar-10-python.tar.gz ." ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!curl -C- -sO https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + " $ curl -sSO 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'" ] }, { @@ -226,7 +227,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Dud makes it very difficult to accidentally modify committed files. 
When Dud commits a file, it makes the link to the cache read-only.\n", + "Dud makes it difficult to accidentally modify committed files. When Dud commits a file, it makes the link to the cache read-only.\n", "\n", "The tarball isn't the only thing that's changed. Let's look at our stage file:" ] @@ -694,7 +695,25 @@ "metadata": {}, "outputs": [], "source": [ - "!git init -b main" + "!git init" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide_input", + "hide_output" + ] + }, + "outputs": [], + "source": [ + "# Workaround to set the default branch in Git older than 2.28,\n", + "# which is what Ubuntu 20.04 ships. This can/should\n", + "# be removed when Git is updated.\n", + "# See: https://superuser.com/a/1419674\n", + "!git symbolic-ref HEAD refs/heads/main" ] }, { @@ -850,7 +869,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/integration/Dockerfile b/integration/Dockerfile index 20c25a8..797414f 100644 --- a/integration/Dockerfile +++ b/integration/Dockerfile @@ -4,7 +4,9 @@ RUN apt update && apt install -y software-properties-common \ && add-apt-repository ppa:longsleep/golang-backports \ && apt update \ && apt install -y \ + build-essential \ curl \ + gawk \ git \ golang-go \ graphviz \ @@ -22,11 +24,6 @@ RUN apt update && apt install -y software-properties-common \ # happen to match). # See: https://git-scm.com/docs/git-config#Documentation/git-config.txt-safedirectory -COPY integration/install_hyperfine_deb.sh ./ -RUN ./install_hyperfine_deb.sh - -RUN curl https://rclone.org/install.sh | bash - RUN useradd --no-log-init -m user -G sudo \ && echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers @@ -34,18 +31,22 @@ USER user WORKDIR /home/user -# Create a directory to mount a Docker volume to. 
If we don't create the mount -# point now as the user, Docker will create it with root permissions when it -# creates the container. -RUN mkdir ~/dud-data +RUN curl --fail --location https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash -ENV PATH=$PATH:/home/user/go/bin:/home/user/.local/bin +ENV PATH=$PATH:/home/user/go/bin:/home/user/.local/bin:/home/linuxbrew/.linuxbrew/bin + +RUN brew install hyperfine hugo rclone RUN pip install --no-cache --user dvc notebook \ && dvc config --global core.analytics false \ && dvc config --global core.check_update false \ && dvc config --global cache.type symlink +# Create a directory to mount a Docker volume to. If we don't create the mount +# point now as the user, Docker will create it with root permissions when it +# creates the container. +RUN mkdir ~/dud-data + # Pre-download the Go dependencies for Dud. COPY --chown=user go.mod go.sum ./ diff --git a/integration/install_hyperfine_deb.sh b/integration/install_hyperfine_deb.sh deleted file mode 100755 index 140310f..0000000 --- a/integration/install_hyperfine_deb.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Only considering 64-bit -arch=amd64 -case "$(uname -m)" in - *arm*|*aarch*) - arch=arm64 - ;; -esac - -deb_url=$( - curl -sS https://api.github.com/repos/sharkdp/hyperfine/releases/latest \ - | jq -r '.assets[] | .browser_download_url' \ - | grep -v 'musl' \ - | grep "$arch\.deb$" -) - -echo "using '$deb_url'" - -curl -Lo hyperfine.deb "$deb_url" -dpkg -i hyperfine.deb -rm hyperfine.deb