Skip to content

Commit

Permalink
diff: Don't read full output from git diff-tree in one go
Browse files Browse the repository at this point in the history
  • Loading branch information
craigds committed Feb 4, 2025
1 parent 5b5598a commit 90071e5
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 12 deletions.
31 changes: 19 additions & 12 deletions kart/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import sys
from collections.abc import Iterable
from itertools import chain
from typing import Callable, Optional, Protocol, TYPE_CHECKING, ClassVar, Iterator

import click
Expand All @@ -15,7 +16,7 @@
from kart.key_filters import DatasetKeyFilter, MetaKeyFilter, UserStringKeyFilter
from kart.meta_items import MetaItemFileType, MetaItemVisibility
from kart.raw_diff_delta import RawDiffDelta
from kart.utils import chunk
from kart.utils import chunk, iter_records_from_file


class BaseDatasetMetaClass(type):
Expand Down Expand Up @@ -205,8 +206,7 @@ def get_deltas_from_git_diff_for_subtree(
if reverse:
self_subtree, other_subtree = other_subtree, self_subtree

# TODO: iterate over 0-terminated 'lines' of output rather than using check_output. Efficiently. How?
output = subprocess_util.check_output(
p = subprocess_util.Popen(
[
"git",
"diff-tree",
Expand All @@ -215,19 +215,26 @@ def get_deltas_from_git_diff_for_subtree(
"--name-status",
str(self_subtree.id),
str(other_subtree.id),
]
],
stdout=subprocess_util.PIPE,
)
if not output:

# Split on null bytes and decode each part as utf-8, filtering out empty strings.
pieces = iter_records_from_file(p.stdout, b"\0")
pairs = chunk(pieces, 2)
try:
pair0 = next(pairs)
except StopIteration:
# empty diff
return

pieces = output.split(b"\0")
# diff-tree adds an extra \0 on the end of the output; trim it off
assert len(pieces) % 2 == 1
pieces.pop()

# Split on null bytes and decode each part as utf-8, filtering out empty strings
for status_char, path in chunk(pieces, 2, strict=True):
for pair in chain([pair0], pairs):
if len(pair) == 1:
# diff-tree adds an extra \0 on the end of the output [except if the diff is empty?]
assert pair[0] == b""
assert next(pairs, None) is None
break
status_char, path = pair
status_char = status_char.decode("utf8")
path = path.decode("utf8")
if path_transform:
Expand Down
29 changes: 29 additions & 0 deletions kart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from pathlib import Path
import platform
from typing import BinaryIO, Iterator


def ungenerator(cast_function):
Expand Down Expand Up @@ -42,6 +43,34 @@ def chunk(iterable, size, strict=False):
yield chunk


def iter_records_from_file(
file_: BinaryIO, separator: bytes, chunk_size: int = 4096
) -> Iterator[bytes]:
"""
Split a (binary) file into records, using a separator. Yields records as bytes.
Oddly, there's no easy stdlib way to do this.
See fairly-stale discussion at https://bugs.python.org/issue1152248
"""
partial_record = None
while True:
chunk = file_.read(chunk_size)
if not chunk:
break
pieces = chunk.split(separator)
if partial_record is None:
partial_record = pieces[0]
else:
partial_record += pieces[0]
if len(pieces) > 1:
yield partial_record
for i in range(1, len(pieces) - 1):
yield pieces[i]
partial_record = pieces[-1]
if partial_record is not None:
yield partial_record


def get_num_available_cores():
"""
Returns the number of available CPU cores (best effort)
Expand Down
37 changes: 37 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from io import BytesIO
from kart.utils import iter_records_from_file


def test_iter_records_from_file():
# basic case
f = BytesIO(b"foo\0bar\0baz\0")
assert list(iter_records_from_file(f, b"\0")) == [b"foo", b"bar", b"baz", b""]
# separator is customisable
f = BytesIO(b"foo\0bar\0baz\0")
assert list(iter_records_from_file(f, b"b")) == [b"foo\0", b"ar\0", b"az\0"]
# exhausted file means no more records
assert list(iter_records_from_file(f, b"\0")) == []
# no trailing separator means the final chunk isn't empty
f = BytesIO(b"foo\0bar\0baz")
assert list(iter_records_from_file(f, b"\0")) == [b"foo", b"bar", b"baz"]
# if the chunk size is silly then it still works
f = BytesIO(b"foo\0bar\0baz")
assert list(iter_records_from_file(f, b"\0", chunk_size=1)) == [
b"foo",
b"bar",
b"baz",
]
# if the chunk size matches the position of the separator then it still works
f = BytesIO(b"foo\0bar\0baz")
assert list(iter_records_from_file(f, b"\0", chunk_size=3)) == [
b"foo",
b"bar",
b"baz",
]
# if the chunk size matches the position after the separator then it still works
f = BytesIO(b"foo\0bar\0baz")
assert list(iter_records_from_file(f, b"\0", chunk_size=4)) == [
b"foo",
b"bar",
b"baz",
]

0 comments on commit 90071e5

Please sign in to comment.