From fe6f493ddb06709dcb6b9ed65ffcc175e1760213 Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Tue, 16 May 2023 22:32:22 +0200 Subject: [PATCH] Add CAR files support --- atproto/__init__.py | 2 + atproto/car/__init__.py | 68 ++++++++++++++ atproto/cid/__init__.py | 25 +---- atproto/leb128/__init__.py | 91 +++++++++++++++++++ docs/source/atproto/atproto.car.rst | 7 ++ docs/source/atproto/atproto.leb128.rst | 7 ++ docs/source/atproto/atproto.rst | 2 + .../atproto/atproto.xrpc_client.models.rst | 1 + ...oto.xrpc_client.models.type_conversion.rst | 7 ++ docs/source/car.rst | 7 ++ docs/source/cid.rst | 1 + docs/source/index.rst | 1 + poetry.lock | 19 +++- pyproject.toml | 1 + test.py | 10 +- 15 files changed, 224 insertions(+), 25 deletions(-) create mode 100644 atproto/car/__init__.py create mode 100644 atproto/leb128/__init__.py create mode 100644 docs/source/atproto/atproto.car.rst create mode 100644 docs/source/atproto/atproto.leb128.rst create mode 100644 docs/source/atproto/atproto.xrpc_client.models.type_conversion.rst create mode 100644 docs/source/car.rst diff --git a/atproto/__init__.py b/atproto/__init__.py index 218030bb..77c30383 100644 --- a/atproto/__init__.py +++ b/atproto/__init__.py @@ -1,3 +1,4 @@ +from .car import CAR from .cid import CID from .nsid import NSID from .uri import AtUri @@ -10,6 +11,7 @@ 'Client', 'models', 'NSID', + 'CAR', 'CID', 'AtUri', ] diff --git a/atproto/car/__init__.py b/atproto/car/__init__.py new file mode 100644 index 00000000..0def4d11 --- /dev/null +++ b/atproto/car/__init__.py @@ -0,0 +1,68 @@ +from io import BytesIO +from typing import Dict + +import dag_cbor + +from .. 
class CAR:
    """CAR file.

    The layout read by :meth:`from_bytes` is: an unsigned LEB128 length, a
    DAG-CBOR encoded header of that length containing a ``roots`` list, and
    then zero or more blocks.  Every block is an unsigned LEB128 length
    followed by a CID and the DAG-CBOR encoded node; the length covers both
    the CID and the node.
    """

    # NOTE(review): the parser assumes every block CID occupies exactly this
    # many bytes; CIDs of a different size would be mis-parsed. Confirm that
    # all producers emit 36-byte CIDs.
    _CID_V1_BYTES_LEN = 36

    def __init__(self, root: str, nodes: 'Nodes'):
        # NOTE(review): `root` is whatever dag_cbor yields for the first entry
        # of the header's `roots`; the `str` annotation may be inaccurate.
        self._root = root
        self._nodes = nodes

    @property
    def root(self):
        """Get root."""
        return self._root

    @property
    def nodes(self) -> 'Nodes':
        """Get nodes."""
        return self._nodes

    @staticmethod
    def _read_exact(stream: BytesIO, size: int, what: str) -> bytes:
        """Read exactly `size` bytes from `stream` or raise ValueError.

        A plain ``read`` returns fewer bytes at end of stream and treats a
        negative size as "read everything", so both cases are rejected here.
        """
        if size < 0:
            raise ValueError(f'Invalid CAR file: negative length for {what}')

        chunk = stream.read(size)
        if len(chunk) != size:
            raise ValueError(f'Invalid CAR file: truncated {what} (expected {size} bytes, got {len(chunk)})')

        return chunk

    @classmethod
    def from_bytes(cls, data: bytes) -> 'CAR':
        """Decode CAR file.

        Note:
            You could pass as `data` the response of `client.com.atproto.sync.get_repo`, for example,
            as well as the responses of other methods in the `sync` namespace.

        Example:
            >>> from atproto import CAR, Client
            >>> client = Client()
            >>> client.login('my-handle', 'my-password')
            >>> repo = client.com.atproto.sync.get_repo({'did': client.me.did})
            >>> car_file = CAR.from_bytes(repo)
            >>> print(car_file.root)
            >>> print(car_file.nodes)

        Args:
            data: content of the file.

        Returns:
            :obj:`atproto.CAR`: Parsed CAR file.

        Raises:
            ValueError: If `data` is empty or truncated, the header has no roots,
                or a block is declared shorter than its CID.
        """
        if not data:
            raise ValueError('Invalid CAR file: data is empty')

        repo = BytesIO(data)

        header_len, _ = leb128.u.decode_reader(repo)
        header = dag_cbor.decode(cls._read_exact(repo, header_len, 'header'))

        roots = header.get('roots') if isinstance(header, dict) else None
        if not roots:
            raise ValueError('Invalid CAR file: header does not contain any roots')
        root = roots[0]

        cid_len = cls._CID_V1_BYTES_LEN
        total_len = len(data)

        nodes = {}
        while repo.tell() < total_len:
            block_len, _ = leb128.u.decode_reader(repo)
            if block_len < cid_len:
                # Without this guard the data read below would get a negative
                # size and silently consume the remainder of the stream.
                raise ValueError(f'Invalid CAR file: block length {block_len} is smaller than the CID length')

            cid = CID.decode(cls._read_exact(repo, cid_len, 'block CID'))
            block = dag_cbor.decode(cls._read_exact(repo, block_len - cid_len, 'block data'))

            nodes[cid] = block

        return cls(root=root, nodes=nodes)
""" - - def __init__(self, cid: MCID): - self._cid = cid - - def encode(self) -> str: - """Encodes the CID.""" - return self._cid.encode() - - @classmethod - def decode(cls, cid: Union[str, bytes]) -> 'CID': - """Decodes a CID from str or bytes.""" - return cls(MCID.decode(cid)) - - @property - def version(self) -> int: - """Get CID version.""" - return self._cid.version diff --git a/atproto/leb128/__init__.py b/atproto/leb128/__init__.py new file mode 100644 index 00000000..c65660e3 --- /dev/null +++ b/atproto/leb128/__init__.py @@ -0,0 +1,91 @@ +""" +Original source code: https://github.com/mohanson/leb128 + +https://en.wikipedia.org/wiki/LEB128 + +LEB128 or Little Endian Base 128 is a form of variable-length code +compression used to store an arbitrarily large integer in a small number of +bytes. LEB128 is used in the DWARF debug file format and the WebAssembly +binary encoding for all integer literals. +""" + +import typing + + +class _U: + @staticmethod + def encode(i: int) -> bytearray: + """Encode the int i using unsigned leb128 and return the encoded bytearray.""" + assert i >= 0 + r = [] + while True: + byte = i & 0x7F + i = i >> 7 + if i == 0: + r.append(byte) + return bytearray(r) + r.append(0x80 | byte) + + @staticmethod + def decode(b: bytearray) -> int: + """Decode the unsigned leb128 encoded bytearray""" + r = 0 + for i, e in enumerate(b): + r = r + ((e & 0x7F) << (i * 7)) + return r + + @staticmethod + def decode_reader(r: typing.BinaryIO) -> (int, int): + """ + Decode the unsigned leb128 encoded from a reader, it will return two values, the actual number and the number + of bytes read. 
+ """ + a = bytearray() + while True: + b = ord(r.read(1)) + a.append(b) + if (b & 0x80) == 0: + break + return _U.decode(a), len(a) + + +class _I: + @staticmethod + def encode(i: int) -> bytearray: + """Encode the int i using signed leb128 and return the encoded bytearray.""" + r = [] + while True: + byte = i & 0x7F + i = i >> 7 + if (i == 0 and byte & 0x40 == 0) or (i == -1 and byte & 0x40 != 0): + r.append(byte) + return bytearray(r) + r.append(0x80 | byte) + + @staticmethod + def decode(b: bytearray) -> int: + """Decode the signed leb128 encoded bytearray""" + r = 0 + for i, e in enumerate(b): + r = r + ((e & 0x7F) << (i * 7)) + if e & 0x40 != 0: + r |= -(1 << (i * 7) + 7) + return r + + @staticmethod + def decode_reader(r: typing.BinaryIO) -> (int, int): + """ + Decode the signed leb128 encoded from a reader, it will return two values, the actual number and the number + of bytes read. + """ + a = bytearray() + while True: + b = ord(r.read(1)) + a.append(b) + if (b & 0x80) == 0: + break + return _I.decode(a), len(a) + + +u = _U() +i = _I() diff --git a/docs/source/atproto/atproto.car.rst b/docs/source/atproto/atproto.car.rst new file mode 100644 index 00000000..cdca2c5f --- /dev/null +++ b/docs/source/atproto/atproto.car.rst @@ -0,0 +1,7 @@ +atproto.car +=========== + +.. automodule:: atproto.car + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/atproto/atproto.leb128.rst b/docs/source/atproto/atproto.leb128.rst new file mode 100644 index 00000000..fcb92400 --- /dev/null +++ b/docs/source/atproto/atproto.leb128.rst @@ -0,0 +1,7 @@ +atproto.leb128 +============== + +.. automodule:: atproto.leb128 + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/atproto/atproto.rst b/docs/source/atproto/atproto.rst index 8d3b5762..d7cb6680 100644 --- a/docs/source/atproto/atproto.rst +++ b/docs/source/atproto/atproto.rst @@ -12,9 +12,11 @@ Subpackages .. 
toctree:: :maxdepth: 4 + atproto.car atproto.cid atproto.cli atproto.codegen + atproto.leb128 atproto.lexicon atproto.nsid atproto.uri diff --git a/docs/source/atproto/atproto.xrpc_client.models.rst b/docs/source/atproto/atproto.xrpc_client.models.rst index 27fcb1cb..79ef04c7 100644 --- a/docs/source/atproto/atproto.xrpc_client.models.rst +++ b/docs/source/atproto/atproto.xrpc_client.models.rst @@ -23,4 +23,5 @@ Submodules atproto.xrpc_client.models.base atproto.xrpc_client.models.blob_ref + atproto.xrpc_client.models.type_conversion atproto.xrpc_client.models.utils diff --git a/docs/source/atproto/atproto.xrpc_client.models.type_conversion.rst b/docs/source/atproto/atproto.xrpc_client.models.type_conversion.rst new file mode 100644 index 00000000..0120d02c --- /dev/null +++ b/docs/source/atproto/atproto.xrpc_client.models.type_conversion.rst @@ -0,0 +1,7 @@ +type\_conversion +============================================ + +.. automodule:: atproto.xrpc_client.models.type_conversion + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/car.rst b/docs/source/car.rst new file mode 100644 index 00000000..c560da5e --- /dev/null +++ b/docs/source/car.rst @@ -0,0 +1,7 @@ +CAR +=== + +.. automodule:: atproto.car + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cid.rst b/docs/source/cid.rst index d6862283..66d1b521 100644 --- a/docs/source/cid.rst +++ b/docs/source/cid.rst @@ -5,3 +5,4 @@ CID :members: :undoc-members: :show-inheritance: + :inherited-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 8708ee4b..b3169ced 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,6 +19,7 @@ Documentation nsid cid uri + car exceptions .. 
toctree:: diff --git a/poetry.lock b/poetry.lock index 61a75c7e..f16f9c4d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -290,6 +290,23 @@ files = [ [package.extras] dev = ["black", "coveralls", "mypy", "pre-commit", "pylint", "pytest (>=5)", "pytest-benchmark", "pytest-cov"] +[[package]] +name = "dag-cbor" +version = "0.3.2" +description = "Python implementation of the DAG-CBOR codec." +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dag-cbor-0.3.2.tar.gz", hash = "sha256:d113050d6f5cb6c1a324c911684c5daa305425702585573269bef3000aa9f106"}, + {file = "dag_cbor-0.3.2-py3-none-any.whl", hash = "sha256:c78030379f385b90498711d7613af5bdb4155d5a7623211cea2b20441d5794b4"}, +] + +[package.dependencies] +multiformats = "*" +typing-extensions = "*" +typing-validation = "*" + [[package]] name = "docutils" version = "0.19" @@ -1476,4 +1493,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.7" -content-hash = "4ec051ac82f3b336c2da90c7d40c38e2cac6e3da9706b0950ec16b854e5866e2" +content-hash = "0a6e31171943cf8c1b9d543bbf1dd4d56576e6e8ba97e619e3a92dfc84a77ad0" diff --git a/pyproject.toml b/pyproject.toml index 3d183a1f..f139b864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ httpx = "0.24.0" dacite = "1.8.0" multiformats = "0.2.1" typing-extensions = "4.5.0" +dag-cbor = "0.3.2" [tool.poetry.dev-dependencies] diff --git a/test.py b/test.py index 03b0c28d..29c97157 100644 --- a/test.py +++ b/test.py @@ -2,7 +2,7 @@ import logging import os -from atproto import AsyncClient, AtUri, Client, models +from atproto import CAR, AsyncClient, AtUri, Client, models # logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.INFO) @@ -31,6 +31,14 @@ def sync_main(): client = Client() client.login(os.environ['USERNAME'], os.environ['PASSWORD']) + repo = client.com.atproto.sync.get_repo({'did': client.me.did}) + car_file = 
CAR.from_bytes(repo) + print(car_file.root) + print(car_file.nodes) + + # res = client.com.atproto.repo.get_record(...) # implement by yourself + # also you need to parse "res.value" as profile record using get_or_create_model method + # search_result = client.bsky.actor.search_actors_typeahead({'term': 'marshal'}) # for actor in search_result.actors: # print(actor.handle, actor.displayName)