From 71cb95b34a76499d6416a66d5402f236be7e0ee5 Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Sat, 8 Apr 2023 10:59:51 -0500 Subject: [PATCH 01/20] Fix typo --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d9903d9f11..9eb0097c1d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,7 +17,7 @@ Others * Fix deprecation warning in query tracing (PR 1103) * Remove mutable default values from some tests (PR 1116) * Remove dependency on unittest2 (PYTHON-1289) -* Fix deprecation warnings for asyncio.coroutine annotation in asyncioreactor (PYTTHON-1290) +* Fix deprecation warnings for asyncio.coroutine annotation in asyncioreactor (PYTHON-1290) * Fix typos in source files (PR 1126) * HostFilterPolicyInitTest fix for Python 3.11 (PR 1131) * Fix for DontPrepareOnIgnoredHostsTest (PYTHON-1287) From b0b9527c2054ba5ffa44e407903896eec4bdc13a Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 1 May 2023 10:56:12 -0500 Subject: [PATCH 02/20] PYTHON-1341 Impl of client-side column-level encryption/decryption (#1150) --- cassandra/cluster.py | 18 +- cassandra/obj_parser.pyx | 16 +- cassandra/parsing.pxd | 2 + cassandra/parsing.pyx | 4 +- cassandra/policies.py | 188 +++++++++++++++++- cassandra/protocol.py | 46 +++-- cassandra/query.py | 23 ++- cassandra/row_parser.pyx | 12 +- docs/column_encryption.rst | 92 +++++++++ docs/conf.py | 2 +- docs/index.rst | 4 + requirements.txt | 1 + .../standard/test_custom_protocol_handler.py | 4 +- tests/integration/standard/test_policies.py | 75 ++++++- tests/unit/test_policies.py | 134 ++++++++++++- 15 files changed, 573 insertions(+), 48 deletions(-) create mode 100644 docs/column_encryption.rst diff --git a/cassandra/cluster.py b/cassandra/cluster.py index cd5bac51a5..d9dbfe08a3 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -1034,6 +1034,12 @@ def default_retry_policy(self, policy): or to disable the shardaware port (advanced shardaware) """ + 
column_encryption_policy = None + """ + An instance of :class:`cassandra.policies.ColumnEncryptionPolicy` specifying encryption materials to be + used for columns in this cluster. + """ + metadata_request_timeout = datetime.timedelta(seconds=2) """ Timeout for all queries used by driver it self. @@ -1157,6 +1163,7 @@ def __init__(self, scylla_cloud=None, shard_aware_options=None, metadata_request_timeout=None, + column_encryption_policy=None, ): """ ``executor_threads`` defines the number of threads in a pool for handling asynchronous tasks such as @@ -1234,6 +1241,9 @@ def __init__(self, self.port = port + if column_encryption_policy is not None: + self.column_encryption_policy = column_encryption_policy + self.endpoint_factory = endpoint_factory or DefaultEndPointFactory(port=self.port) self.endpoint_factory.configure(self) @@ -2658,6 +2668,12 @@ def __init__(self, cluster, hosts, keyspace=None): self.encoder = Encoder() + if self.cluster.column_encryption_policy is not None: + try: + self.client_protocol_handler.column_encryption_policy = self.cluster.column_encryption_policy + except AttributeError: + log.info("Unable to set column encryption policy for session") + # create connection pools in parallel self._initial_connect_futures = set() for host in hosts: @@ -3197,7 +3213,7 @@ def prepare(self, query, custom_payload=None, keyspace=None): prepared_keyspace = keyspace if keyspace else None prepared_statement = PreparedStatement.from_message( response.query_id, response.bind_metadata, response.pk_indexes, self.cluster.metadata, query, prepared_keyspace, - self._protocol_version, response.column_metadata, response.result_metadata_id) + self._protocol_version, response.column_metadata, response.result_metadata_id, self.cluster.column_encryption_policy) prepared_statement.custom_payload = future.custom_payload self.cluster.add_prepared(response.query_id, prepared_statement) diff --git a/cassandra/obj_parser.pyx b/cassandra/obj_parser.pyx index 
a0b5316a33..cf43771dd7 100644 --- a/cassandra/obj_parser.pyx +++ b/cassandra/obj_parser.pyx @@ -17,9 +17,12 @@ include "ioutils.pyx" from cassandra import DriverException from cassandra.bytesio cimport BytesIOReader from cassandra.deserializers cimport Deserializer, from_binary +from cassandra.deserializers import find_deserializer from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser from cassandra.tuple cimport tuple_new, tuple_set +from cpython.bytes cimport PyBytes_AsStringAndSize + cdef class ListParser(ColumnParser): """Decode a ResultMessage into a list of tuples (or other objects)""" @@ -58,18 +61,29 @@ cdef class TupleRowParser(RowParser): assert desc.rowsize >= 0 cdef Buffer buf + cdef Buffer newbuf cdef Py_ssize_t i, rowsize = desc.rowsize cdef Deserializer deserializer cdef tuple res = tuple_new(desc.rowsize) + ce_policy = desc.column_encryption_policy for i in range(rowsize): # Read the next few bytes get_buf(reader, &buf) # Deserialize bytes to python object deserializer = desc.deserializers[i] + coldesc = desc.coldescs[i] + uses_ce = ce_policy and ce_policy.contains_column(coldesc) try: - val = from_binary(deserializer, &buf, desc.protocol_version) + if uses_ce: + col_type = ce_policy.column_type(coldesc) + decrypted_bytes = ce_policy.decrypt(coldesc, to_bytes(&buf)) + PyBytes_AsStringAndSize(decrypted_bytes, &newbuf.ptr, &newbuf.size) + deserializer = find_deserializer(ce_policy.column_type(coldesc)) + val = from_binary(deserializer, &newbuf, desc.protocol_version) + else: + val = from_binary(deserializer, &buf, desc.protocol_version) except Exception as e: raise DriverException('Failed decoding result column "%s" of type %s: %s' % (desc.colnames[i], desc.coltypes[i].cql_parameterized_type(), diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd index aa9478cd14..27dc368b07 100644 --- a/cassandra/parsing.pxd +++ b/cassandra/parsing.pxd @@ -18,6 +18,8 @@ from cassandra.deserializers cimport Deserializer cdef class ParseDesc: cdef 
public object colnames cdef public object coltypes + cdef public object column_encryption_policy + cdef public list coldescs cdef Deserializer[::1] deserializers cdef public int protocol_version cdef Py_ssize_t rowsize diff --git a/cassandra/parsing.pyx b/cassandra/parsing.pyx index d2bc0a3abe..954767d227 100644 --- a/cassandra/parsing.pyx +++ b/cassandra/parsing.pyx @@ -19,9 +19,11 @@ Module containing the definitions and declarations (parsing.pxd) for parsers. cdef class ParseDesc: """Description of what structure to parse""" - def __init__(self, colnames, coltypes, deserializers, protocol_version): + def __init__(self, colnames, coltypes, column_encryption_policy, coldescs, deserializers, protocol_version): self.colnames = colnames self.coltypes = coltypes + self.column_encryption_policy = column_encryption_policy + self.coldescs = coldescs self.deserializers = deserializers self.protocol_version = protocol_version self.rowsize = len(colnames) diff --git a/cassandra/policies.py b/cassandra/policies.py index d9d3da7980..7a6fe467fc 100644 --- a/cassandra/policies.py +++ b/cassandra/policies.py @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import random + +from collections import namedtuple +from functools import lru_cache from itertools import islice, cycle, groupby, repeat import logging +import os from random import randint, shuffle from threading import Lock import socket import warnings + +from cryptography.hazmat.primitives import padding +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes + from cassandra import WriteType as WT -from cassandra.connection import UnixSocketEndPoint +from cassandra.cqltypes import _cqltypes # This is done this way because WriteType was originally @@ -572,8 +580,9 @@ def __init__(self, hosts): self._allowed_hosts = tuple(hosts) self._allowed_hosts_resolved = [] for h in self._allowed_hosts: - if isinstance(h, UnixSocketEndPoint): - self._allowed_hosts_resolved.append(h._unix_socket_path) + unix_socket_path = getattr(h, "_unix_socket_path", None) + if unix_socket_path: + self._allowed_hosts_resolved.append(unix_socket_path) else: self._allowed_hosts_resolved.extend([endpoint[4][0] for endpoint in socket.getaddrinfo(h, None, socket.AF_UNSPEC, socket.SOCK_STREAM)]) @@ -608,7 +617,7 @@ class HostFilterPolicy(LoadBalancingPolicy): A :class:`.LoadBalancingPolicy` subclass configured with a child policy, and a single-argument predicate. This policy defers to the child policy for hosts where ``predicate(host)`` is truthy. Hosts for which - ``predicate(host)`` is falsey will be considered :attr:`.IGNORED`, and will + ``predicate(host)`` is falsy will be considered :attr:`.IGNORED`, and will not be used in a query plan. This can be used in the cases where you need a whitelist or blacklist @@ -644,7 +653,7 @@ def __init__(self, child_policy, predicate): :param child_policy: an instantiated :class:`.LoadBalancingPolicy` that this one will defer to. :param predicate: a one-parameter function that takes a :class:`.Host`. 
- If it returns a falsey value, the :class:`.Host` will + If it returns a falsy value, the :class:`.Host` will be :attr:`.IGNORED` and not returned in query plans. """ super(HostFilterPolicy, self).__init__() @@ -680,7 +689,7 @@ def predicate(self): def distance(self, host): """ Checks if ``predicate(host)``, then returns - :attr:`~HostDistance.IGNORED` if falsey, and defers to the child policy + :attr:`~HostDistance.IGNORED` if falsy, and defers to the child policy otherwise. """ if self.predicate(host): @@ -769,7 +778,7 @@ class ReconnectionPolicy(object): def new_schedule(self): """ This should return a finite or infinite iterable of delays (each as a - floating point number of seconds) inbetween each failed reconnection + floating point number of seconds) in-between each failed reconnection attempt. Note that if the iterable is finite, reconnection attempts will cease once the iterable is exhausted. """ @@ -779,12 +788,12 @@ def new_schedule(self): class ConstantReconnectionPolicy(ReconnectionPolicy): """ A :class:`.ReconnectionPolicy` subclass which sleeps for a fixed delay - inbetween each reconnection attempt. + in-between each reconnection attempt. """ def __init__(self, delay, max_attempts=64): """ - `delay` should be a floating point number of seconds to wait inbetween + `delay` should be a floating point number of seconds to wait in-between each attempt. `max_attempts` should be a total number of attempts to be made before @@ -808,7 +817,7 @@ def new_schedule(self): class ExponentialReconnectionPolicy(ReconnectionPolicy): """ A :class:`.ReconnectionPolicy` subclass which exponentially increases - the length of the delay inbetween each reconnection attempt up to + the length of the delay in-between each reconnection attempt up to a set maximum delay. A random amount of jitter (+/- 15%) will be added to the pure exponential @@ -868,7 +877,7 @@ class RetryPolicy(object): timeout and unavailable failures. These are failures reported from the server side. 
Timeouts are configured by `settings in cassandra.yaml `_. - Unavailable failures occur when the coordinator cannot acheive the consistency + Unavailable failures occur when the coordinator cannot achieve the consistency level for a request. For further information see the method descriptions below. @@ -1385,3 +1394,160 @@ def _rethrow(self, *args, **kwargs): on_read_timeout = _rethrow on_write_timeout = _rethrow on_unavailable = _rethrow + + +ColDesc = namedtuple('ColDesc', ['ks', 'table', 'col']) +ColData = namedtuple('ColData', ['key','type']) + +class ColumnEncryptionPolicy(object): + """ + A policy enabling (mostly) transparent encryption and decryption of data before it is + sent to the cluster. + + Key materials and other configurations are specified on a per-column basis. This policy can + then be used by driver structures which are aware of the underlying columns involved in their + work. In practice this includes the following cases: + + * Prepared statements - data for columns specified by the cluster's policy will be transparently + encrypted before they are sent + * Rows returned from any query - data for columns specified by the cluster's policy will be + transparently decrypted before they are returned to the user + + To enable this functionality, create an instance of this class (or more likely a subclass) + before creating a cluster. This policy should then be configured and supplied to the Cluster + at creation time via the :attr:`.Cluster.column_encryption_policy` attribute. + """ + + def encrypt(self, coldesc, obj_bytes): + """ + Encrypt the specified bytes using the cryptography materials for the specified column. + Largely used internally, although this could also be used to encrypt values supplied + to non-prepared statements in a way that is consistent with this policy. 
+ """ + raise NotImplementedError() + + def decrypt(self, coldesc, encrypted_bytes): + """ + Decrypt the specified (encrypted) bytes using the cryptography materials for the + specified column. Used internally; could be used externally as well but there's + not currently an obvious use case. + """ + raise NotImplementedError() + + def add_column(self, coldesc, key): + """ + Provide cryptography materials to be used when encrypted and/or decrypting data + for the specified column. + """ + raise NotImplementedError() + + def contains_column(self, coldesc): + """ + Predicate to determine if a specific column is supported by this policy. + Currently only used internally. + """ + raise NotImplementedError() + + def encode_and_encrypt(self, coldesc, obj): + """ + Helper function to enable use of this policy on simple (i.e. non-prepared) + statements. + """ + raise NotImplementedError() + +AES256_BLOCK_SIZE = 128 +AES256_BLOCK_SIZE_BYTES = int(AES256_BLOCK_SIZE / 8) +AES256_KEY_SIZE = 256 +AES256_KEY_SIZE_BYTES = int(AES256_KEY_SIZE / 8) + +class AES256ColumnEncryptionPolicy(ColumnEncryptionPolicy): + + # CBC uses an IV that's the same size as the block size + # + # TODO: Need to find some way to expose mode options + # (CBC etc.) without leaking classes from the underlying + # impl here + def __init__(self, mode = modes.CBC, iv = os.urandom(AES256_BLOCK_SIZE_BYTES)): + + self.mode = mode + self.iv = iv + + # ColData for a given ColDesc is always preserved. We only create a Cipher + # when there's an actual need to for a given ColDesc + self.coldata = {} + self.ciphers = {} + + def encrypt(self, coldesc, obj_bytes): + + # AES256 has a 128-bit block size so if the input bytes don't align perfectly on + # those blocks we have to pad them. 
There's plenty of room for optimization here: + # + # * Instances of the PKCS7 padder should be managed in a bounded pool + # * It would be nice if we could get a flag from encrypted data to indicate + # whether it was padded or not + # * Might be able to make this happen with a leading block of flags in encrypted data + padder = padding.PKCS7(AES256_BLOCK_SIZE).padder() + padded_bytes = padder.update(obj_bytes) + padder.finalize() + + cipher = self._get_cipher(coldesc) + encryptor = cipher.encryptor() + return encryptor.update(padded_bytes) + encryptor.finalize() + + def decrypt(self, coldesc, encrypted_bytes): + + cipher = self._get_cipher(coldesc) + decryptor = cipher.decryptor() + padded_bytes = decryptor.update(encrypted_bytes) + decryptor.finalize() + + unpadder = padding.PKCS7(AES256_BLOCK_SIZE).unpadder() + return unpadder.update(padded_bytes) + unpadder.finalize() + + def add_column(self, coldesc, key, type): + + if not coldesc: + raise ValueError("ColDesc supplied to add_column cannot be None") + if not key: + raise ValueError("Key supplied to add_column cannot be None") + if not type: + raise ValueError("Type supplied to add_column cannot be None") + if type not in _cqltypes.keys(): + raise ValueError("Type %s is not a supported type".format(type)) + if not len(key) == AES256_KEY_SIZE_BYTES: + raise ValueError("AES256 column encryption policy expects a 256-bit encryption key") + self.coldata[coldesc] = ColData(key, _cqltypes[type]) + + def contains_column(self, coldesc): + return coldesc in self.coldata + + def encode_and_encrypt(self, coldesc, obj): + if not coldesc: + raise ValueError("ColDesc supplied to encode_and_encrypt cannot be None") + if not obj: + raise ValueError("Object supplied to encode_and_encrypt cannot be None") + coldata = self.coldata.get(coldesc) + if not coldata: + raise ValueError("Could not find ColData for ColDesc %s".format(coldesc)) + return self.encrypt(coldesc, coldata.type.serialize(obj, None)) + + def cache_info(self): + 
return AES256ColumnEncryptionPolicy._build_cipher.cache_info() + + def column_type(self, coldesc): + return self.coldata[coldesc].type + + def _get_cipher(self, coldesc): + """ + Access relevant state from this instance necessary to create a Cipher and then get one, + hopefully returning a cached instance if we've already done so (and it hasn't been evicted) + """ + + try: + coldata = self.coldata[coldesc] + return AES256ColumnEncryptionPolicy._build_cipher(coldata.key, self.mode, self.iv) + except KeyError: + raise ValueError("Could not find column {}".format(coldesc)) + + # Explicitly use a class method here to avoid caching self + @lru_cache(maxsize=128) + def _build_cipher(key, mode, iv): + return Cipher(algorithms.AES256(key), mode(iv)) diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 53a4938d0d..29ae404048 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -27,9 +27,6 @@ AlreadyExists, InvalidRequest, Unauthorized, UnsupportedOperation, UserFunctionDescriptor, UserAggregateDescriptor, SchemaTargetType) -from cassandra.marshal import (int32_pack, int32_unpack, uint16_pack, uint16_unpack, - uint8_pack, int8_unpack, uint64_pack, header_pack, - v3_header_pack, uint32_pack, uint32_le_unpack, uint32_le_pack) from cassandra.cqltypes import (AsciiType, BytesType, BooleanType, CounterColumnType, DateType, DecimalType, DoubleType, FloatType, Int32Type, @@ -38,6 +35,10 @@ UTF8Type, VarcharType, UUIDType, UserType, TupleType, lookup_casstype, SimpleDateType, TimeType, ByteType, ShortType, DurationType) +from cassandra.marshal import (int32_pack, int32_unpack, uint16_pack, uint16_unpack, + uint8_pack, int8_unpack, uint64_pack, header_pack, + v3_header_pack, uint32_pack, uint32_le_unpack, uint32_le_pack) +from cassandra.policies import ColDesc from cassandra import WriteType from cassandra.cython_deps import HAVE_CYTHON, HAVE_NUMPY from cassandra import util @@ -733,11 +734,11 @@ class ResultMessage(_MessageType): def __init__(self, kind): 
self.kind = kind - def recv(self, f, protocol_version, user_type_map, result_metadata): + def recv(self, f, protocol_version, user_type_map, result_metadata, column_encryption_policy): if self.kind == RESULT_KIND_VOID: return elif self.kind == RESULT_KIND_ROWS: - self.recv_results_rows(f, protocol_version, user_type_map, result_metadata) + self.recv_results_rows(f, protocol_version, user_type_map, result_metadata, column_encryption_policy) elif self.kind == RESULT_KIND_SET_KEYSPACE: self.new_keyspace = read_string(f) elif self.kind == RESULT_KIND_PREPARED: @@ -748,32 +749,40 @@ def recv(self, f, protocol_version, user_type_map, result_metadata): raise DriverException("Unknown RESULT kind: %d" % self.kind) @classmethod - def recv_body(cls, f, protocol_version, protocol_features, user_type_map, result_metadata): + def recv_body(cls, f, protocol_version, protocol_features, user_type_map, result_metadata, column_encryption_policy): kind = read_int(f) msg = cls(kind) - msg.recv(f, protocol_version, user_type_map, result_metadata) + msg.recv(f, protocol_version, user_type_map, result_metadata, column_encryption_policy) return msg - def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata): + def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata, column_encryption_policy): self.recv_results_metadata(f, user_type_map) column_metadata = self.column_metadata or result_metadata rowcount = read_int(f) rows = [self.recv_row(f, len(column_metadata)) for _ in range(rowcount)] self.column_names = [c[2] for c in column_metadata] self.column_types = [c[3] for c in column_metadata] + col_descs = [ColDesc(md[0], md[1], md[2]) for md in column_metadata] + + def decode_val(val, col_md, col_desc): + uses_ce = column_encryption_policy and column_encryption_policy.contains_column(col_desc) + col_type = column_encryption_policy.column_type(col_desc) if uses_ce else col_md[3] + raw_bytes = column_encryption_policy.decrypt(col_desc, val) if 
uses_ce else val + return col_type.from_binary(raw_bytes, protocol_version) + + def decode_row(row): + return tuple(decode_val(val, col_md, col_desc) for val, col_md, col_desc in zip(row, column_metadata, col_descs)) + try: - self.parsed_rows = [ - tuple(ctype.from_binary(val, protocol_version) - for ctype, val in zip(self.column_types, row)) - for row in rows] + self.parsed_rows = [decode_row(row) for row in rows] except Exception: for row in rows: - for i in range(len(row)): + for val, col_md, col_desc in zip(row, column_metadata, col_descs): try: - self.column_types[i].from_binary(row[i], protocol_version) + decode_val(val, col_md, col_desc) except Exception as e: - raise DriverException('Failed decoding result column "%s" of type %s: %s' % (self.column_names[i], - self.column_types[i].cql_parameterized_type(), + raise DriverException('Failed decoding result column "%s" of type %s: %s' % (col_md[2], + col_md[3].cql_parameterized_type(), str(e))) def recv_results_prepared(self, f, protocol_version, user_type_map): @@ -1109,6 +1118,9 @@ class _ProtocolHandler(object): result decoding implementations. """ + column_encryption_policy = None + """Instance of :class:`cassandra.policies.ColumnEncryptionPolicy` in use by this handler""" + @classmethod def encode_message(cls, msg, stream_id, protocol_version, compressor, allow_beta_protocol_version): """ @@ -1203,7 +1215,7 @@ def decode_message(cls, protocol_version, protocol_features, user_type_map, stre log.warning("Unknown protocol flags set: %02x. 
May cause problems.", flags) msg_class = cls.message_types_by_opcode[opcode] - msg = msg_class.recv_body(body, protocol_version, protocol_features, user_type_map, result_metadata) + msg = msg_class.recv_body(body, protocol_version, protocol_features, user_type_map, result_metadata, cls.column_encryption_policy) msg.stream_id = stream_id msg.trace_id = trace_id msg.custom_payload = custom_payload diff --git a/cassandra/query.py b/cassandra/query.py index f8447bd04c..13813f742c 100644 --- a/cassandra/query.py +++ b/cassandra/query.py @@ -29,6 +29,7 @@ from cassandra.util import unix_time_from_uuid1, maybe_add_timeout_to_query from cassandra.encoder import Encoder import cassandra.encoder +from cassandra.policies import ColDesc from cassandra.protocol import _UNSET_VALUE from cassandra.util import OrderedDict, _sanitize_identifiers @@ -449,12 +450,14 @@ class PreparedStatement(object): query_string = None result_metadata = None result_metadata_id = None + column_encryption_policy = None routing_key_indexes = None _routing_key_index_set = None serial_consistency_level = None # TODO never used? 
def __init__(self, column_metadata, query_id, routing_key_indexes, query, - keyspace, protocol_version, result_metadata, result_metadata_id): + keyspace, protocol_version, result_metadata, result_metadata_id, + column_encryption_policy=None): self.column_metadata = column_metadata self.query_id = query_id self.routing_key_indexes = routing_key_indexes @@ -463,14 +466,17 @@ def __init__(self, column_metadata, query_id, routing_key_indexes, query, self.protocol_version = protocol_version self.result_metadata = result_metadata self.result_metadata_id = result_metadata_id + self.column_encryption_policy = column_encryption_policy self.is_idempotent = False @classmethod def from_message(cls, query_id, column_metadata, pk_indexes, cluster_metadata, query, prepared_keyspace, protocol_version, result_metadata, - result_metadata_id): + result_metadata_id, column_encryption_policy=None): if not column_metadata: - return PreparedStatement(column_metadata, query_id, None, query, prepared_keyspace, protocol_version, result_metadata, result_metadata_id) + return PreparedStatement(column_metadata, query_id, None, + query, prepared_keyspace, protocol_version, result_metadata, + result_metadata_id, column_encryption_policy) if pk_indexes: routing_key_indexes = pk_indexes @@ -496,7 +502,7 @@ def from_message(cls, query_id, column_metadata, pk_indexes, cluster_metadata, return PreparedStatement(column_metadata, query_id, routing_key_indexes, query, prepared_keyspace, protocol_version, result_metadata, - result_metadata_id) + result_metadata_id, column_encryption_policy) def bind(self, values): """ @@ -585,6 +591,7 @@ def bind(self, values): values = () proto_version = self.prepared_statement.protocol_version col_meta = self.prepared_statement.column_metadata + ce_policy = self.prepared_statement.column_encryption_policy # special case for binding dicts if isinstance(values, dict): @@ -631,7 +638,13 @@ def bind(self, values): raise ValueError("Attempt to bind UNSET_VALUE while using 
unsuitable protocol version (%d < 4)" % proto_version) else: try: - self.values.append(col_spec.type.serialize(value, proto_version)) + col_desc = ColDesc(col_spec.keyspace_name, col_spec.table_name, col_spec.name) + uses_ce = ce_policy and ce_policy.contains_column(col_desc) + col_type = ce_policy.column_type(col_desc) if uses_ce else col_spec.type + col_bytes = col_type.serialize(value, proto_version) + if uses_ce: + col_bytes = ce_policy.encrypt(col_desc, col_bytes) + self.values.append(col_bytes) except (TypeError, struct.error) as exc: actual_type = type(value) message = ('Received an argument of invalid type for column "%s". ' diff --git a/cassandra/row_parser.pyx b/cassandra/row_parser.pyx index 3a4b2f4604..88277a4593 100644 --- a/cassandra/row_parser.pyx +++ b/cassandra/row_parser.pyx @@ -13,13 +13,14 @@ # limitations under the License. from cassandra.parsing cimport ParseDesc, ColumnParser +from cassandra.policies import ColDesc from cassandra.obj_parser import TupleRowParser from cassandra.deserializers import make_deserializers include "ioutils.pyx" def make_recv_results_rows(ColumnParser colparser): - def recv_results_rows(self, f, int protocol_version, user_type_map, result_metadata): + def recv_results_rows(self, f, int protocol_version, user_type_map, result_metadata, column_encryption_policy): """ Parse protocol data given as a BytesIO f into a set of columns (e.g. 
list of tuples) This is used as the recv_results_rows method of (Fast)ResultMessage @@ -28,11 +29,12 @@ def make_recv_results_rows(ColumnParser colparser): column_metadata = self.column_metadata or result_metadata - self.column_names = [c[2] for c in column_metadata] - self.column_types = [c[3] for c in column_metadata] + self.column_names = [md[2] for md in column_metadata] + self.column_types = [md[3] for md in column_metadata] - desc = ParseDesc(self.column_names, self.column_types, make_deserializers(self.column_types), - protocol_version) + desc = ParseDesc(self.column_names, self.column_types, column_encryption_policy, + [ColDesc(md[0], md[1], md[2]) for md in column_metadata], + make_deserializers(self.column_types), protocol_version) reader = BytesIOReader(f.read()) try: self.parsed_rows = colparser.parse_rows(reader, desc) diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst new file mode 100644 index 0000000000..4d2a6c2d91 --- /dev/null +++ b/docs/column_encryption.rst @@ -0,0 +1,92 @@ +Column Encryption +================= + +Overview +-------- +Support for client-side encryption of data was added in version 3.27.0 of the Python driver. When using +this feature data will be encrypted on-the-fly according to a specified :class:`~.ColumnEncryptionPolicy` +instance. This policy is also used to decrypt data in returned rows. If a prepared statement is used +this decryption is transparent to the user; retrieved data will be decrypted and converted into the original +type (according to definitions in the encryption policy). Support for simple (i.e. non-prepared) queries is +also available, although in this case values must be manually encrypted and/or decrypted. The +:class:`~.ColumnEncryptionPolicy` instance provides methods to assist with these operations. + +Client-side encryption and decryption should work against all versions of Cassandra and DSE. It does not +utilize any server-side functionality to do its work. 
+ +Configuration +------------- +Client-side encryption is enabled by creating an instance of a subclass of :class:`~.ColumnEncryptionPolicy` +and adding information about columns to be encrypted to it. This policy is then supplied to :class:`~.Cluster` +when it's created. + +.. code-block:: python + import os + + from cassandra.policies import ColDesc, AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES + + key = os.urandom(AES256_KEY_SIZE_BYTES) + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('ks1','table1','column1') + cql_type = "int" + cl_policy.add_column(col_desc, key, cql_type) + cluster = Cluster(column_encryption_policy=cl_policy) + +:class:`~.AES256ColumnEncryptionPolicy` is a subclass of :class:`~.ColumnEncryptionPolicy` which provides +encryption and decryption via AES-256. This class is currently the only available column encryption policy +implementation, although users can certainly implement their own by subclassing :class:`~.ColumnEncryptionPolicy`. + +:class:`~.ColDesc` is a named tuple which uniquely identifies a column in a given keyspace and table. When we +have this tuple, the encryption key and the CQL type contained by this column we can add the column to the policy +using :func:`~.ColumnEncryptionPolicy.add_column`. Once we have added all column definitions to the policy we +pass it along to the cluster. + +The CQL type for the column only has meaning at the client; it is never sent to Cassandra. The encryption key +is also never sent to the server; all the server ever sees are random bytes reflecting the encrypted data. As a +result all columns containing client-side encrypted values should be declared with the CQL type "blob" at the +Cassandra server. + +Usage +----- + +Encryption +^^^^^^^^^^ +Client-side encryption shines most when used with prepared statements. 
A prepared statement is aware of information +about the columns in the query it was built from and we can use this information to transparently encrypt any +supplied parameters. For example, we can create a prepared statement to insert a value into column1 (as defined above) +by executing the following code after creating a :class:`~.Cluster` in the manner described above: + +.. code-block:: python + session = cluster.connect() + prepared = session.prepare("insert into ks1.table1 (column1) values (?)") + session.execute(prepared, (1000,)) + +Our encryption policy will detect that "column1" is an encrypted column and take appropriate action. + +As mentioned above client-side encryption can also be used with simple queries, although such use cases are +certainly not transparent. :class:`~.ColumnEncryptionPolicy` provides a helper named +:func:`~.ColumnEncryptionPolicy.encode_and_encrypt` which will convert an input value into bytes using the +standard serialization methods employed by the driver. The result is then encrypted according to the configuration +of the policy. Using this approach the example above could be implemented along the lines of the following: + +.. code-block:: python + session = cluster.connect() + session.execute("insert into ks1.table1 (column1) values (%s)",(cl_policy.encode_and_encrypt(col_desc, 1000),)) + +Decryption +^^^^^^^^^^ +Decryption of values returned from the server is always transparent. Whether we're executing a simple or prepared +statement encrypted columns will be decrypted automatically and made available via rows just like any other +result. + +Limitations +----------- +:class:`~.AES256ColumnEncryptionPolicy` uses the implementation of AES-256 provided by the +`cryptography `_ module. Any limitations of this module should be considered +when deploying client-side encryption. Note specifically that a Rust compiler is required for modern versions +of the cryptography package, although wheels exist for many common platforms. 
+ +Client-side encryption has been implemented for both the default Cython and pure Python row processing logic. +This functionality has not yet been ported to the NumPy Cython implementation. We have reason to believe the +NumPy processing works reasonably well on Python 3.7 but fails for Python 3.8. We hope to address this discrepancy +in a future release. \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 653625bd96..d2bfa813a7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -73,7 +73,7 @@ # -- Options for not found extension # Template used to render the 404.html generated by this extension. -notfound_template = '404.html' +notfound_template = '404.html' # Prefix added to all the URLs generated in the 404 page. notfound_urls_prefix = '' diff --git a/docs/index.rst b/docs/index.rst index 578c787aa4..eed7d89ea1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,9 @@ Contents :doc:`scylla-cloud-serverless` Connect to ScyllaDB Cloud Serverless +:doc:`column_encryption` + Transparent client-side per-column encryption and decryption + :doc:`CHANGELOG` Log of changes to the driver, organized by version. 
@@ -83,6 +86,7 @@ Contents dates-and-times scylla-cloud scylla-cloud-serverless + column-encryption faq Getting Help diff --git a/requirements.txt b/requirements.txt index 100a12905a..0f84701d5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ +cryptography >= 35.0 geomet>=0.1,<0.3 diff --git a/tests/integration/standard/test_custom_protocol_handler.py b/tests/integration/standard/test_custom_protocol_handler.py index 9f3a52e256..3a3d50ed39 100644 --- a/tests/integration/standard/test_custom_protocol_handler.py +++ b/tests/integration/standard/test_custom_protocol_handler.py @@ -263,7 +263,7 @@ class CustomResultMessageRaw(ResultMessage): my_type_codes[0xc] = UUIDType type_codes = my_type_codes - def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata): + def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata, column_encryption_policy): self.recv_results_metadata(f, user_type_map) column_metadata = self.column_metadata or result_metadata rowcount = read_int(f) @@ -292,7 +292,7 @@ class CustomResultMessageTracked(ResultMessage): type_codes = my_type_codes checked_rev_row_set = set() - def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata): + def recv_results_rows(self, f, protocol_version, user_type_map, result_metadata, column_encryption_policy): self.recv_results_metadata(f, user_type_map) column_metadata = self.column_metadata or result_metadata rowcount = read_int(f) diff --git a/tests/integration/standard/test_policies.py b/tests/integration/standard/test_policies.py index a91505fe24..30b106fb03 100644 --- a/tests/integration/standard/test_policies.py +++ b/tests/integration/standard/test_policies.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from decimal import Decimal +import os +import random import unittest from cassandra.cluster import ExecutionProfile, EXEC_PROFILE_DEFAULT from cassandra.policies import HostFilterPolicy, RoundRobinPolicy, SimpleConvictionPolicy, \ - WhiteListRoundRobinPolicy, ExponentialBackoffRetryPolicy + WhiteListRoundRobinPolicy, ExponentialBackoffRetryPolicy, ColDesc, AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES from cassandra.pool import Host from cassandra.connection import DefaultEndPoint @@ -106,4 +109,72 @@ def test_exponential_retries(self): """ CREATE KEYSPACE preparedtests WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} - """) \ No newline at end of file + """) + +class ColumnEncryptionPolicyTest(unittest.TestCase): + + def _recreate_keyspace(self, session): + session.execute("drop keyspace if exists foo") + session.execute("CREATE KEYSPACE foo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}") + session.execute("CREATE TABLE foo.bar(encrypted blob, unencrypted int, primary key(unencrypted))") + + def test_end_to_end_prepared(self): + + # We only currently perform testing on a single type/expected value pair since CLE functionality is essentially + # independent of the underlying type. We intercept data after it's been encoded when it's going out and before it's + # encoded when coming back; the actual types of the data involved don't impact us. 
+ expected = 12345 + expected_type = "int" + + key = os.urandom(AES256_KEY_SIZE_BYTES) + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('foo','bar','encrypted') + cl_policy.add_column(col_desc, key, expected_type) + + cluster = TestCluster(column_encryption_policy=cl_policy) + session = cluster.connect() + self._recreate_keyspace(session) + + prepared = session.prepare("insert into foo.bar (encrypted, unencrypted) values (?,?)") + session.execute(prepared, (expected,expected)) + + # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted + # values here to confirm that we don't interfere with regular processing of unencrypted vals. + (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + # Confirm the same behaviour from a subsequent prepared statement as well + prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? allow filtering") + (encrypted,unencrypted) = session.execute(prepared, [expected]).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + def test_end_to_end_simple(self): + + expected = 67890 + expected_type = "int" + + key = os.urandom(AES256_KEY_SIZE_BYTES) + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('foo','bar','encrypted') + cl_policy.add_column(col_desc, key, expected_type) + + cluster = TestCluster(column_encryption_policy=cl_policy) + session = cluster.connect() + self._recreate_keyspace(session) + + # Use encode_and_encrypt helper function to populate data + session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)",(cl_policy.encode_and_encrypt(col_desc, expected), expected)) + + # A straight select from the database will now return the decrypted bits.
We select both encrypted and unencrypted + # values here to confirm that we don't interfere with regular processing of unencrypted vals. + (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + # Confirm the same behaviour from a subsequent prepared statement as well + prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? allow filtering") + (encrypted,unencrypted) = session.execute(prepared, [expected]).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) diff --git a/tests/unit/test_policies.py b/tests/unit/test_policies.py index 15bd1ea95b..849a6c5e94 100644 --- a/tests/unit/test_policies.py +++ b/tests/unit/test_policies.py @@ -16,6 +16,7 @@ from itertools import islice, cycle from mock import Mock, patch, call +import os from random import randint import pytest from _thread import LockType @@ -32,9 +33,10 @@ RetryPolicy, WriteType, DowngradingConsistencyRetryPolicy, ConstantReconnectionPolicy, LoadBalancingPolicy, ConvictionPolicy, ReconnectionPolicy, FallthroughRetryPolicy, - IdentityTranslator, EC2MultiRegionTranslator, HostFilterPolicy, ExponentialBackoffRetryPolicy) -from cassandra.pool import Host + IdentityTranslator, EC2MultiRegionTranslator, HostFilterPolicy, ExponentialBackoffRetryPolicy, + ColDesc, AES256ColumnEncryptionPolicy, AES256_BLOCK_SIZE_BYTES, AES256_KEY_SIZE_BYTES) from cassandra.connection import DefaultEndPoint, UnixSocketEndPoint +from cassandra.pool import Host from cassandra.query import Statement @@ -1580,3 +1582,131 @@ def test_create_whitelist(self): self.assertEqual(set(query_plan), {Host(DefaultEndPoint("127.0.0.1"), SimpleConvictionPolicy), Host(DefaultEndPoint("127.0.0.4"), SimpleConvictionPolicy)}) +class AES256ColumnEncryptionPolicyTest(unittest.TestCase): + + def 
_random_block(self): + return os.urandom(AES256_BLOCK_SIZE_BYTES) + + def _random_key(self): + return os.urandom(AES256_KEY_SIZE_BYTES) + + def _test_round_trip(self, bytes): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + policy.add_column(coldesc, self._random_key(), "blob") + encrypted_bytes = policy.encrypt(coldesc, bytes) + self.assertEqual(bytes, policy.decrypt(coldesc, encrypted_bytes)) + + def test_no_padding_necessary(self): + self._test_round_trip(self._random_block()) + + def test_some_padding_required(self): + for byte_size in range(1,AES256_BLOCK_SIZE_BYTES - 1): + bytes = os.urandom(byte_size) + self._test_round_trip(bytes) + for byte_size in range(AES256_BLOCK_SIZE_BYTES + 1,(2 * AES256_BLOCK_SIZE_BYTES) - 1): + bytes = os.urandom(byte_size) + self._test_round_trip(bytes) + + def test_add_column_invalid_key_size_raises(self): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + for key_size in range(1,AES256_KEY_SIZE_BYTES - 1): + with self.assertRaises(ValueError): + policy.add_column(coldesc, os.urandom(key_size), "blob") + for key_size in range(AES256_KEY_SIZE_BYTES + 1,(2 * AES256_KEY_SIZE_BYTES) - 1): + with self.assertRaises(ValueError): + policy.add_column(coldesc, os.urandom(key_size), "blob") + + def test_add_column_null_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + policy.add_column(None, self._random_block(), "blob") + + def test_add_column_null_key_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, None, "blob") + + def test_add_column_null_type_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_block(), None) + + def test_add_column_unknown_type_raises(self): + with 
self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_block(), "foobar") + + def test_encode_and_encrypt_null_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(None, self._random_block()) + + def test_encode_and_encrypt_null_obj_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(coldesc, None) + + def test_encode_and_encrypt_unknown_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(ColDesc('ks2','table2','col2'), self._random_block()) + + def test_contains_column(self): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + policy.add_column(coldesc, self._random_key(), "blob") + self.assertTrue(policy.contains_column(coldesc)) + self.assertFalse(policy.contains_column(ColDesc('ks2','table1','col1'))) + self.assertFalse(policy.contains_column(ColDesc('ks1','table2','col1'))) + self.assertFalse(policy.contains_column(ColDesc('ks1','table1','col2'))) + self.assertFalse(policy.contains_column(ColDesc('ks2','table2','col2'))) + + def test_encrypt_unknown_column(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encrypt(ColDesc('ks2','table2','col2'), self._random_block()) + + def test_decrypt_unknown_column(self): + policy = AES256ColumnEncryptionPolicy() + coldesc = 
ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + encrypted_bytes = policy.encrypt(coldesc, self._random_block()) + with self.assertRaises(ValueError): + policy.decrypt(ColDesc('ks2','table2','col2'), encrypted_bytes) + + def test_cache_info(self): + coldesc1 = ColDesc('ks1','table1','col1') + coldesc2 = ColDesc('ks2','table2','col2') + coldesc3 = ColDesc('ks3','table3','col3') + policy = AES256ColumnEncryptionPolicy() + for coldesc in [coldesc1, coldesc2, coldesc3]: + policy.add_column(coldesc, self._random_key(), "blob") + + # First run for this coldesc should be a miss, everything else should be a cache hit + for _ in range(10): + policy.encrypt(coldesc1, self._random_block()) + cache_info = policy.cache_info() + self.assertEqual(cache_info.hits, 9) + self.assertEqual(cache_info.misses, 1) + self.assertEqual(cache_info.maxsize, 128) + + # Important note: we're measuring the size of the cache of ciphers, NOT stored + # keys. We won't have a cipher here until we actually encrypt something + self.assertEqual(cache_info.currsize, 1) + policy.encrypt(coldesc2, self._random_block()) + self.assertEqual(policy.cache_info().currsize, 2) + policy.encrypt(coldesc3, self._random_block()) + self.assertEqual(policy.cache_info().currsize, 3) From 12f092224df11b9ae390eab9d0037aa1a63f533f Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 1 May 2023 11:01:35 -0500 Subject: [PATCH 03/20] Release 3.27: changelog & version --- CHANGELOG.rst | 8 ++++++++ cassandra/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9eb0097c1d..5599e37dcb 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,11 @@ +3.27.0 +====== +May 1, 2023 + +Features +-------- +* Add support for client-side encryption (PYTHON-1341) + 3.26.0 ====== March 13, 2023 diff --git a/cassandra/__init__.py b/cassandra/__init__.py index 97b79d22bc..301bcbf1c1 100644 --- a/cassandra/__init__.py +++ 
b/cassandra/__init__.py @@ -23,7 +23,7 @@ def emit(self, record): logging.getLogger('cassandra').addHandler(NullHandler()) -__version_info__ = (3, 26, 9) +__version_info__ = (3, 27, 0) __version__ = '.'.join(map(str, __version_info__)) From dd00b5db7e7e3f5cc2bb187e72b2c8e74ab0711b Mon Sep 17 00:00:00 2001 From: Jamie Gillenwater Date: Mon, 1 May 2023 16:45:09 -0400 Subject: [PATCH 04/20] update RH nav order (#1154) * update RH nav order * add line break * add api --- docs/.nav | 2 +- docs/index.rst | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/.nav b/docs/.nav index 37ab5e93df..e57bdd5bcc 100644 --- a/docs/.nav +++ b/docs/.nav @@ -3,7 +3,7 @@ getting_started scylla_specific execution_profiles lwt -object_mapper` +object_mapper performance query_paging security diff --git a/docs/index.rst b/docs/index.rst index eed7d89ea1..248e44b7c6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,9 +59,6 @@ Contents :doc:`column_encryption` Transparent client-side per-column encryption and decryption -:doc:`CHANGELOG` - Log of changes to the driver, organized by version. - :doc:`faq` A collection of Frequently Asked Questions From 52737078c2865612a6c26423044786a5bbb110d2 Mon Sep 17 00:00:00 2001 From: Jamie Gillenwater Date: Mon, 1 May 2023 17:04:53 -0400 Subject: [PATCH 05/20] remove future plans (#1155) --- docs/column_encryption.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst index 4d2a6c2d91..289f9cd62b 100644 --- a/docs/column_encryption.rst +++ b/docs/column_encryption.rst @@ -87,6 +87,5 @@ when deploying client-side encryption. Note specifically that a Rust compiler i of the cryptography package, although wheels exist for many common platforms. Client-side encryption has been implemented for both the default Cython and pure Python row processing logic. -This functionality has not yet been ported to the NumPy Cython implementation. 
We have reason to believe the -NumPy processing works reasonably well on Python 3.7 but fails for Python 3.8. We hope to address this discrepancy -in a future release. \ No newline at end of file +This functionality has not yet been ported to the NumPy Cython implementation. During testing, +the NumPy processing works on Python 3.7 but fails for Python 3.8. \ No newline at end of file From 0d3b5fa44f61d7c1e66261ec4789dda2bf22ad2d Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 1 May 2023 21:18:34 -0500 Subject: [PATCH 06/20] Missed dependency on cryptography in setup.py --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ea3875a371..889450b1d3 100644 --- a/setup.py +++ b/setup.py @@ -414,8 +414,11 @@ def run_setup(extensions): else: sys.stderr.write("Bypassing Cython setup requirement\n") - dependencies = ['geomet>=0.1,<0.3', - 'pyyaml > 5.0'] + dependencies = [ + 'geomet>=0.1,<0.3', + 'pyyaml > 5.0', + 'cryptography>=35.0' + ] _EXTRAS_REQUIRE = { 'graph': ['gremlinpython==3.4.6'] From 070d72aeef179da8327bea80ce72a86e26f428c1 Mon Sep 17 00:00:00 2001 From: Emelia <105240296+emeliawilkinson24@users.noreply.github.com> Date: Tue, 9 May 2023 10:38:02 -0400 Subject: [PATCH 07/20] DOC-2813 (#1145) Added error handling blog reference. --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index c5c4e2b73f..3dca8cc118 100644 --- a/README.rst +++ b/README.rst @@ -69,6 +69,10 @@ Contributing ------------ See `CONTRIBUTING `_. +Error Handling +------------ +While originally written for the Java driver, users may reference the `Cassandra error handling done right blog `_ for resolving error handling scenarios with Apache Cassandra. 
+ Reporting Problems ------------------ Please report any bugs and make any feature requests by clicking the New Issue button in From b9b976e1f389fc1190c3b9bdfec8c614fdc236db Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Fri, 12 May 2023 09:54:04 -0500 Subject: [PATCH 08/20] DOC-3278 Update comment for retry policy (#1158) --- cassandra/policies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cassandra/policies.py b/cassandra/policies.py index 7a6fe467fc..6d60379953 100644 --- a/cassandra/policies.py +++ b/cassandra/policies.py @@ -1027,7 +1027,7 @@ def on_request_error(self, query, consistency, error, retry_num): `retry_num` counts how many times the operation has been retried, so the first time this method is called, `retry_num` will be 0. - The default, it triggers a retry on the next host in the query plan + By default, it triggers a retry on the next host in the query plan with the same consistency level. """ # TODO revisit this for the next major From 3c996d4da60d98b225831413c3b333830d2ad27d Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Fri, 19 May 2023 12:25:32 -0500 Subject: [PATCH 09/20] Fix for rendering of code blocks in CLE documentation (#1159) --- docs/column_encryption.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst index 289f9cd62b..1392972fa6 100644 --- a/docs/column_encryption.rst +++ b/docs/column_encryption.rst @@ -21,6 +21,7 @@ and adding information about columns to be encrypted to it. This policy is then when it's created. .. code-block:: python + import os from cassandra.policies import ColDesc, AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES @@ -57,6 +58,7 @@ supplied parameters. For example, we can create a prepared statement to insert by executing the following code after creating a :class:`~.Cluster` in the manner described above: .. 
code-block:: python + session = cluster.connect() prepared = session.prepare("insert into ks1.table1 (column1) values (?)") session.execute(prepared, (1000,)) @@ -70,6 +72,7 @@ standard serialization methods employed by the driver. The result is then encry of the policy. Using this approach the example above could be implemented along the lines of the following: .. code-block:: python + session = cluster.connect() session.execute("insert into ks1.table1 (column1) values (%s)",(cl_policy.encode_and_encrypt(col_desc, 1000),)) @@ -88,4 +91,4 @@ of the cryptography package, although wheels exist for many common platforms. Client-side encryption has been implemented for both the default Cython and pure Python row processing logic. This functionality has not yet been ported to the NumPy Cython implementation. During testing, -the NumPy processing works on Python 3.7 but fails for Python 3.8. \ No newline at end of file +the NumPy processing works on Python 3.7 but fails for Python 3.8. From b4f4354b2803bdaec31847937dde70e19cd5648f Mon Sep 17 00:00:00 2001 From: Brad Schoening <5796692+bschoening@users.noreply.github.com> Date: Tue, 23 May 2023 17:30:35 -0400 Subject: [PATCH 10/20] remove unnecessary import __future__ (#1156) --- cassandra/cluster.py | 1 - cassandra/connection.py | 1 - cassandra/cqlengine/functions.py | 1 - cassandra/cqltypes.py | 1 - cassandra/protocol.py | 1 - 5 files changed, 5 deletions(-) diff --git a/cassandra/cluster.py b/cassandra/cluster.py index d9dbfe08a3..5a25c2fe34 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -16,7 +16,6 @@ This module houses the main classes you will interact with, :class:`.Cluster` and :class:`.Session`. 
""" -from __future__ import absolute_import import atexit import datetime diff --git a/cassandra/connection.py b/cassandra/connection.py index ebdfe99993..36f853f78e 100644 --- a/cassandra/connection.py +++ b/cassandra/connection.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import # to enable import io from stdlib from collections import defaultdict, deque import errno from functools import wraps, partial, total_ordering diff --git a/cassandra/cqlengine/functions.py b/cassandra/cqlengine/functions.py index 5cb0f673d1..a2495c010d 100644 --- a/cassandra/cqlengine/functions.py +++ b/cassandra/cqlengine/functions.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import division from datetime import datetime from cassandra.cqlengine import UnicodeMixin, ValidationError diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index 2daa1603a4..f4f0c915c6 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -27,7 +27,6 @@ # for example), these classes would be a good place to tack on # .from_cql_literal() and .as_cql_literal() classmethods (or whatever). -from __future__ import absolute_import # to enable import io from stdlib import ast from binascii import unhexlify import calendar diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 29ae404048..a3cfc54ef0 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import # to enable import io from stdlib from collections import namedtuple import logging import socket From 0b054eeae32ca72e44b479c6224c34d740e11c7e Mon Sep 17 00:00:00 2001 From: Lukas Elmer Date: Tue, 23 May 2023 23:34:41 +0200 Subject: [PATCH 11/20] docs: convert print statement to function in docs (#1157) --- cassandra/cluster.py | 4 ++-- cassandra/cqlengine/query.py | 12 ++++++------ cassandra/datastax/graph/fluent/__init__.py | 2 +- cassandra/query.py | 10 +++++----- docs/api/cassandra/cqlengine/models.rst | 4 ++-- docs/cqlengine/connections.rst | 2 +- docs/cqlengine/models.rst | 2 +- docs/execution-profiles.rst | 14 +++++++------- docs/faq.rst | 4 ++-- docs/getting-started.rst | 12 ++++++------ docs/object-mapper.rst | 4 ++-- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cassandra/cluster.py b/cassandra/cluster.py index 5a25c2fe34..966dc9c4f3 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -1014,7 +1014,7 @@ def default_retry_policy(self, policy): cloud = None """ A dict of the cloud configuration. 
Example:: - + { # path to the secure connect bundle 'secure_connect_bundle': '/path/to/secure-connect-dbname.zip', @@ -1542,7 +1542,7 @@ def __init__(self, street, zipcode): # results will include Address instances results = session.execute("SELECT * FROM users") row = results[0] - print row.id, row.location.street, row.location.zipcode + print(row.id, row.location.street, row.location.zipcode) """ if self.protocol_version < 3: diff --git a/cassandra/cqlengine/query.py b/cassandra/cqlengine/query.py index 40134e884e..73f48a5928 100644 --- a/cassandra/cqlengine/query.py +++ b/cassandra/cqlengine/query.py @@ -285,15 +285,15 @@ class ContextQuery(object): with ContextQuery(Automobile, keyspace='test2') as A: A.objects.create(manufacturer='honda', year=2008, model='civic') - print len(A.objects.all()) # 1 result + print(len(A.objects.all())) # 1 result with ContextQuery(Automobile, keyspace='test4') as A: - print len(A.objects.all()) # 0 result + print(len(A.objects.all())) # 0 result # Multiple models with ContextQuery(Automobile, Automobile2, connection='cluster2') as (A, A2): - print len(A.objects.all()) - print len(A2.objects.all()) + print(len(A.objects.all())) + print(len(A2.objects.all())) """ @@ -808,11 +808,11 @@ class Comment(Model): print("Normal") for comment in Comment.objects(photo_id=u): - print comment.comment_id + print(comment.comment_id) print("Reversed") for comment in Comment.objects(photo_id=u).order_by("-comment_id"): - print comment.comment_id + print(comment.comment_id) """ if len(colnames) == 0: clone = copy.deepcopy(self) diff --git a/cassandra/datastax/graph/fluent/__init__.py b/cassandra/datastax/graph/fluent/__init__.py index 44a0d136e0..92f148721e 100644 --- a/cassandra/datastax/graph/fluent/__init__.py +++ b/cassandra/datastax/graph/fluent/__init__.py @@ -257,7 +257,7 @@ def traversal_source(session=None, graph_name=None, execution_profile=EXEC_PROFI session = c.connect() g = DseGraph.traversal_source(session, 'my_graph') - print 
g.V().valueMap().toList() + print(g.V().valueMap().toList()) """ diff --git a/cassandra/query.py b/cassandra/query.py index 13813f742c..f3922849ab 100644 --- a/cassandra/query.py +++ b/cassandra/query.py @@ -75,7 +75,7 @@ def tuple_factory(colnames, rows): >>> session = cluster.connect('mykeyspace') >>> session.row_factory = tuple_factory >>> rows = session.execute("SELECT name, age FROM users LIMIT 1") - >>> print rows[0] + >>> print(rows[0]) ('Bob', 42) .. versionchanged:: 2.0.0 @@ -131,16 +131,16 @@ def named_tuple_factory(colnames, rows): >>> user = rows[0] >>> # you can access field by their name: - >>> print "name: %s, age: %d" % (user.name, user.age) + >>> print("name: %s, age: %d" % (user.name, user.age)) name: Bob, age: 42 >>> # or you can access fields by their position (like a tuple) >>> name, age = user - >>> print "name: %s, age: %d" % (name, age) + >>> print("name: %s, age: %d" % (name, age)) name: Bob, age: 42 >>> name = user[0] >>> age = user[1] - >>> print "name: %s, age: %d" % (name, age) + >>> print("name: %s, age: %d" % (name, age)) name: Bob, age: 42 .. versionchanged:: 2.0.0 @@ -186,7 +186,7 @@ def dict_factory(colnames, rows): >>> session = cluster.connect('mykeyspace') >>> session.row_factory = dict_factory >>> rows = session.execute("SELECT name, age FROM users LIMIT 1") - >>> print rows[0] + >>> print(rows[0]) {u'age': 42, u'name': u'Bob'} .. versionchanged:: 2.0.0 diff --git a/docs/api/cassandra/cqlengine/models.rst b/docs/api/cassandra/cqlengine/models.rst index 60b1471184..44a015a9f4 100644 --- a/docs/api/cassandra/cqlengine/models.rst +++ b/docs/api/cassandra/cqlengine/models.rst @@ -103,7 +103,7 @@ Model TestIfNotExistsModel.if_not_exists().create(id=id, count=9, text='111111111111') except LWTException as e: # handle failure case - print e.existing # dict containing LWT result fields + print(e.existing) # dict containing LWT result fields This method is supported on Cassandra 2.0 or later.
@@ -144,7 +144,7 @@ Model t.iff(count=5).update('other text') except LWTException as e: # handle failure case - print e.existing # existing object + print(e.existing) # existing object .. automethod:: get diff --git a/docs/cqlengine/connections.rst b/docs/cqlengine/connections.rst index 03ade27521..fd44303514 100644 --- a/docs/cqlengine/connections.rst +++ b/docs/cqlengine/connections.rst @@ -99,7 +99,7 @@ You can specify a default connection per model: year = columns.Integer(primary_key=True) model = columns.Text(primary_key=True) - print len(Automobile.objects.all()) # executed on the connection 'cluster2' + print(len(Automobile.objects.all())) # executed on the connection 'cluster2' QuerySet and model instance --------------------------- diff --git a/docs/cqlengine/models.rst b/docs/cqlengine/models.rst index c0ba390119..719513f4a9 100644 --- a/docs/cqlengine/models.rst +++ b/docs/cqlengine/models.rst @@ -201,7 +201,7 @@ are only created, presisted, and queried via table Models. A short example to in users.create(name="Joe", addr=address(street="Easy St.", zipcode=99999)) user = users.objects(name="Joe")[0] - print user.name, user.addr + print(user.name, user.addr) # Joe address(street=u'Easy St.', zipcode=99999) UDTs are modeled by inheriting :class:`~.usertype.UserType`, and setting column type attributes. Types are then used in defining diff --git a/docs/execution-profiles.rst b/docs/execution-profiles.rst index 7be1a85e3f..0965d77f3d 100644 --- a/docs/execution-profiles.rst +++ b/docs/execution-profiles.rst @@ -43,7 +43,7 @@ Default session = cluster.connect() local_query = 'SELECT rpc_address FROM system.local' for _ in cluster.metadata.all_hosts(): - print session.execute(local_query)[0] + print(session.execute(local_query)[0]) .. 
parsed-literal:: @@ -69,7 +69,7 @@ Initializing cluster with profiles profiles = {'node1': node1_profile, 'node2': node2_profile} session = Cluster(execution_profiles=profiles).connect() for _ in cluster.metadata.all_hosts(): - print session.execute(local_query, execution_profile='node1')[0] + print(session.execute(local_query, execution_profile='node1')[0]) .. parsed-literal:: @@ -81,7 +81,7 @@ Initializing cluster with profiles .. code:: python for _ in cluster.metadata.all_hosts(): - print session.execute(local_query, execution_profile='node2')[0] + print(session.execute(local_query, execution_profile='node2')[0]) .. parsed-literal:: @@ -93,7 +93,7 @@ Initializing cluster with profiles .. code:: python for _ in cluster.metadata.all_hosts(): - print session.execute(local_query)[0] + print(session.execute(local_query)[0]) .. parsed-literal:: @@ -123,7 +123,7 @@ New profiles can be added constructing from scratch, or deriving from default: cluster.add_execution_profile(node1_profile, locked_execution) for _ in cluster.metadata.all_hosts(): - print session.execute(local_query, execution_profile=node1_profile)[0] + print(session.execute(local_query, execution_profile=node1_profile)[0]) .. parsed-literal:: @@ -144,8 +144,8 @@ We also have the ability to pass profile instances to be used for execution, but tmp = session.execution_profile_clone_update('node1', request_timeout=100, row_factory=tuple_factory) - print session.execute(local_query, execution_profile=tmp)[0] - print session.execute(local_query, execution_profile='node1')[0] + print(session.execute(local_query, execution_profile=tmp)[0]) + print(session.execute(local_query, execution_profile='node1')[0]) .. 
parsed-literal:: diff --git a/docs/faq.rst b/docs/faq.rst index 56cb648a24..194d5520e8 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -44,7 +44,7 @@ Since tracing is done asynchronously to the request, this method polls until the >>> result = future.result() >>> trace = future.get_query_trace() >>> for e in trace.events: - >>> print e.source_elapsed, e.description + >>> print(e.source_elapsed, e.description) 0:00:00.000077 Parsing select * from system.local 0:00:00.000153 Preparing statement @@ -67,7 +67,7 @@ With prepared statements, the replicas are obtained by ``routing_key``, based on >>> bound = prepared.bind((1,)) >>> replicas = cluster.metadata.get_replicas(bound.keyspace, bound.routing_key) >>> for h in replicas: - >>> print h.address + >>> print(h.address) 127.0.0.1 127.0.0.2 diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 1969b503ba..76685c5fdf 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -119,7 +119,7 @@ way to execute a query is to use :meth:`~.Session.execute()`: rows = session.execute('SELECT name, age, email FROM users') for user_row in rows: - print user_row.name, user_row.age, user_row.email + print(user_row.name, user_row.age, user_row.email) This will transparently pick a Cassandra node to execute the query against and handle any retries that are necessary if the operation fails. @@ -135,19 +135,19 @@ examples are equivalent: rows = session.execute('SELECT name, age, email FROM users') for row in rows: - print row.name, row.age, row.email + print(row.name, row.age, row.email) .. code-block:: python rows = session.execute('SELECT name, age, email FROM users') for (name, age, email) in rows: - print name, age, email + print(name, age, email) .. 
code-block:: python rows = session.execute('SELECT name, age, email FROM users') for row in rows: - print row[0], row[1], row[2] + print(row[0], row[1], row[2]) If you prefer another result format, such as a ``dict`` per row, you can change the :attr:`~.Session.row_factory` attribute. @@ -335,7 +335,7 @@ For example: try: rows = future.result() user = rows[0] - print user.name, user.age + print(user.name, user.age) except ReadTimeout: log.exception("Query timed out:") @@ -352,7 +352,7 @@ This works well for executing many queries concurrently: # wait for them to complete and use the results for future in futures: rows = future.result() - print rows[0].name + print(rows[0].name) Alternatively, instead of calling :meth:`~.ResponseFuture.result()`, you can attach callback and errback functions through the diff --git a/docs/object-mapper.rst b/docs/object-mapper.rst index 421be246ac..5eb78f57b6 100644 --- a/docs/object-mapper.rst +++ b/docs/object-mapper.rst @@ -87,7 +87,7 @@ Getting Started >>> q.count() 4 >>> for instance in q: - >>> print instance.description + >>> print(instance.description) example5 example6 example7 @@ -101,5 +101,5 @@ Getting Started >>> q2.count() 1 >>> for instance in q2: - >>> print instance.description + >>> print(instance.description) example5 From b400697b71b4f51b05e99b0b3e844c64a989a0fe Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Wed, 24 May 2023 12:06:18 -0500 Subject: [PATCH 12/20] Revert "remove unnecessary import __future__ (#1156)" This reverts commit 6894b028ffda01a63fad1deebe0b5300349d1611. 
--- cassandra/cluster.py | 1 + cassandra/connection.py | 1 + cassandra/cqlengine/functions.py | 1 + cassandra/cqltypes.py | 1 + cassandra/protocol.py | 1 + 5 files changed, 5 insertions(+) diff --git a/cassandra/cluster.py b/cassandra/cluster.py index 966dc9c4f3..2c3f458eff 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -16,6 +16,7 @@ This module houses the main classes you will interact with, :class:`.Cluster` and :class:`.Session`. """ +from __future__ import absolute_import import atexit import datetime diff --git a/cassandra/connection.py b/cassandra/connection.py index 36f853f78e..ebdfe99993 100644 --- a/cassandra/connection.py +++ b/cassandra/connection.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import absolute_import # to enable import io from stdlib from collections import defaultdict, deque import errno from functools import wraps, partial, total_ordering diff --git a/cassandra/cqlengine/functions.py b/cassandra/cqlengine/functions.py index a2495c010d..5cb0f673d1 100644 --- a/cassandra/cqlengine/functions.py +++ b/cassandra/cqlengine/functions.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import division from datetime import datetime from cassandra.cqlengine import UnicodeMixin, ValidationError diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index f4f0c915c6..2daa1603a4 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -27,6 +27,7 @@ # for example), these classes would be a good place to tack on # .from_cql_literal() and .as_cql_literal() classmethods (or whatever). 
+from __future__ import absolute_import # to enable import io from stdlib import ast from binascii import unhexlify import calendar diff --git a/cassandra/protocol.py b/cassandra/protocol.py index a3cfc54ef0..29ae404048 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import absolute_import # to enable import io from stdlib from collections import namedtuple import logging import socket From 7e4777293bc3e6503825538d50389fa65a787bb2 Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Thu, 1 Jun 2023 15:34:15 -0500 Subject: [PATCH 13/20] PYTHON-1351 Convert cryptography to an optional dependency (#1164) --- cassandra/column_encryption/_policies.py | 126 +++++++++++++++ cassandra/column_encryption/policies.py | 20 +++ cassandra/policies.py | 107 +------------ docs/column_encryption.rst | 3 +- docs/installation.rst | 2 +- requirements.txt | 1 - setup.py | 9 +- .../column_encryption/test_policies.py | 94 +++++++++++ tests/integration/standard/test_policies.py | 73 +-------- tests/unit/column_encryption/test_policies.py | 149 ++++++++++++++++++ tests/unit/test_policies.py | 133 +--------------- 11 files changed, 400 insertions(+), 317 deletions(-) create mode 100644 cassandra/column_encryption/_policies.py create mode 100644 cassandra/column_encryption/policies.py create mode 100644 tests/integration/standard/column_encryption/test_policies.py create mode 100644 tests/unit/column_encryption/test_policies.py diff --git a/cassandra/column_encryption/_policies.py b/cassandra/column_encryption/_policies.py new file mode 100644 index 0000000000..e049ba2d22 --- /dev/null +++ b/cassandra/column_encryption/_policies.py @@ -0,0 +1,126 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple +from functools import lru_cache + +import logging +import os + +log = logging.getLogger(__name__) + +from cassandra.cqltypes import _cqltypes +from cassandra.policies import ColumnEncryptionPolicy + +from cryptography.hazmat.primitives import padding +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes + +AES256_BLOCK_SIZE = 128 +AES256_BLOCK_SIZE_BYTES = int(AES256_BLOCK_SIZE / 8) +AES256_KEY_SIZE = 256 +AES256_KEY_SIZE_BYTES = int(AES256_KEY_SIZE / 8) + +ColData = namedtuple('ColData', ['key','type']) + +class AES256ColumnEncryptionPolicy(ColumnEncryptionPolicy): + + # CBC uses an IV that's the same size as the block size + # + # TODO: Need to find some way to expose mode options + # (CBC etc.) without leaking classes from the underlying + # impl here + def __init__(self, mode = modes.CBC, iv = os.urandom(AES256_BLOCK_SIZE_BYTES)): + + self.mode = mode + self.iv = iv + + # ColData for a given ColDesc is always preserved. We only create a Cipher + # when there's an actual need to for a given ColDesc + self.coldata = {} + self.ciphers = {} + + def encrypt(self, coldesc, obj_bytes): + + # AES256 has a 128-bit block size so if the input bytes don't align perfectly on + # those blocks we have to pad them. 
There's plenty of room for optimization here: + # + # * Instances of the PKCS7 padder should be managed in a bounded pool + # * It would be nice if we could get a flag from encrypted data to indicate + # whether it was padded or not + # * Might be able to make this happen with a leading block of flags in encrypted data + padder = padding.PKCS7(AES256_BLOCK_SIZE).padder() + padded_bytes = padder.update(obj_bytes) + padder.finalize() + + cipher = self._get_cipher(coldesc) + encryptor = cipher.encryptor() + return encryptor.update(padded_bytes) + encryptor.finalize() + + def decrypt(self, coldesc, encrypted_bytes): + + cipher = self._get_cipher(coldesc) + decryptor = cipher.decryptor() + padded_bytes = decryptor.update(encrypted_bytes) + decryptor.finalize() + + unpadder = padding.PKCS7(AES256_BLOCK_SIZE).unpadder() + return unpadder.update(padded_bytes) + unpadder.finalize() + + def add_column(self, coldesc, key, type): + + if not coldesc: + raise ValueError("ColDesc supplied to add_column cannot be None") + if not key: + raise ValueError("Key supplied to add_column cannot be None") + if not type: + raise ValueError("Type supplied to add_column cannot be None") + if type not in _cqltypes.keys(): + raise ValueError("Type %s is not a supported type".format(type)) + if not len(key) == AES256_KEY_SIZE_BYTES: + raise ValueError("AES256 column encryption policy expects a 256-bit encryption key") + self.coldata[coldesc] = ColData(key, _cqltypes[type]) + + def contains_column(self, coldesc): + return coldesc in self.coldata + + def encode_and_encrypt(self, coldesc, obj): + if not coldesc: + raise ValueError("ColDesc supplied to encode_and_encrypt cannot be None") + if not obj: + raise ValueError("Object supplied to encode_and_encrypt cannot be None") + coldata = self.coldata.get(coldesc) + if not coldata: + raise ValueError("Could not find ColData for ColDesc %s".format(coldesc)) + return self.encrypt(coldesc, coldata.type.serialize(obj, None)) + + def cache_info(self): + 
return AES256ColumnEncryptionPolicy._build_cipher.cache_info() + + def column_type(self, coldesc): + return self.coldata[coldesc].type + + def _get_cipher(self, coldesc): + """ + Access relevant state from this instance necessary to create a Cipher and then get one, + hopefully returning a cached instance if we've already done so (and it hasn't been evicted) + """ + + try: + coldata = self.coldata[coldesc] + return AES256ColumnEncryptionPolicy._build_cipher(coldata.key, self.mode, self.iv) + except KeyError: + raise ValueError("Could not find column {}".format(coldesc)) + + # Explicitly use a class method here to avoid caching self + @lru_cache(maxsize=128) + def _build_cipher(key, mode, iv): + return Cipher(algorithms.AES256(key), mode(iv)) diff --git a/cassandra/column_encryption/policies.py b/cassandra/column_encryption/policies.py new file mode 100644 index 0000000000..770084bd48 --- /dev/null +++ b/cassandra/column_encryption/policies.py @@ -0,0 +1,20 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +try: + import cryptography + from cassandra.column_encryption._policies import * +except ImportError: + # Cryptography is not installed + pass diff --git a/cassandra/policies.py b/cassandra/policies.py index 6d60379953..cb83238e87 100644 --- a/cassandra/policies.py +++ b/cassandra/policies.py @@ -17,17 +17,14 @@ from functools import lru_cache from itertools import islice, cycle, groupby, repeat import logging -import os from random import randint, shuffle from threading import Lock import socket import warnings -from cryptography.hazmat.primitives import padding -from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +log = logging.getLogger(__name__) from cassandra import WriteType as WT -from cassandra.cqltypes import _cqltypes # This is done this way because WriteType was originally @@ -35,12 +32,8 @@ # It may removed in the next mayor. WriteType = WT - from cassandra import ConsistencyLevel, OperationTimedOut -log = logging.getLogger(__name__) - - class HostDistance(object): """ A measure of how "distant" a node is from the client, which @@ -1397,7 +1390,6 @@ def _rethrow(self, *args, **kwargs): ColDesc = namedtuple('ColDesc', ['ks', 'table', 'col']) -ColData = namedtuple('ColData', ['key','type']) class ColumnEncryptionPolicy(object): """ @@ -1454,100 +1446,3 @@ def encode_and_encrypt(self, coldesc, obj): statements. """ raise NotImplementedError() - -AES256_BLOCK_SIZE = 128 -AES256_BLOCK_SIZE_BYTES = int(AES256_BLOCK_SIZE / 8) -AES256_KEY_SIZE = 256 -AES256_KEY_SIZE_BYTES = int(AES256_KEY_SIZE / 8) - -class AES256ColumnEncryptionPolicy(ColumnEncryptionPolicy): - - # CBC uses an IV that's the same size as the block size - # - # TODO: Need to find some way to expose mode options - # (CBC etc.) without leaking classes from the underlying - # impl here - def __init__(self, mode = modes.CBC, iv = os.urandom(AES256_BLOCK_SIZE_BYTES)): - - self.mode = mode - self.iv = iv - - # ColData for a given ColDesc is always preserved. 
We only create a Cipher - # when there's an actual need to for a given ColDesc - self.coldata = {} - self.ciphers = {} - - def encrypt(self, coldesc, obj_bytes): - - # AES256 has a 128-bit block size so if the input bytes don't align perfectly on - # those blocks we have to pad them. There's plenty of room for optimization here: - # - # * Instances of the PKCS7 padder should be managed in a bounded pool - # * It would be nice if we could get a flag from encrypted data to indicate - # whether it was padded or not - # * Might be able to make this happen with a leading block of flags in encrypted data - padder = padding.PKCS7(AES256_BLOCK_SIZE).padder() - padded_bytes = padder.update(obj_bytes) + padder.finalize() - - cipher = self._get_cipher(coldesc) - encryptor = cipher.encryptor() - return encryptor.update(padded_bytes) + encryptor.finalize() - - def decrypt(self, coldesc, encrypted_bytes): - - cipher = self._get_cipher(coldesc) - decryptor = cipher.decryptor() - padded_bytes = decryptor.update(encrypted_bytes) + decryptor.finalize() - - unpadder = padding.PKCS7(AES256_BLOCK_SIZE).unpadder() - return unpadder.update(padded_bytes) + unpadder.finalize() - - def add_column(self, coldesc, key, type): - - if not coldesc: - raise ValueError("ColDesc supplied to add_column cannot be None") - if not key: - raise ValueError("Key supplied to add_column cannot be None") - if not type: - raise ValueError("Type supplied to add_column cannot be None") - if type not in _cqltypes.keys(): - raise ValueError("Type %s is not a supported type".format(type)) - if not len(key) == AES256_KEY_SIZE_BYTES: - raise ValueError("AES256 column encryption policy expects a 256-bit encryption key") - self.coldata[coldesc] = ColData(key, _cqltypes[type]) - - def contains_column(self, coldesc): - return coldesc in self.coldata - - def encode_and_encrypt(self, coldesc, obj): - if not coldesc: - raise ValueError("ColDesc supplied to encode_and_encrypt cannot be None") - if not obj: - raise 
ValueError("Object supplied to encode_and_encrypt cannot be None") - coldata = self.coldata.get(coldesc) - if not coldata: - raise ValueError("Could not find ColData for ColDesc %s".format(coldesc)) - return self.encrypt(coldesc, coldata.type.serialize(obj, None)) - - def cache_info(self): - return AES256ColumnEncryptionPolicy._build_cipher.cache_info() - - def column_type(self, coldesc): - return self.coldata[coldesc].type - - def _get_cipher(self, coldesc): - """ - Access relevant state from this instance necessary to create a Cipher and then get one, - hopefully returning a cached instance if we've already done so (and it hasn't been evicted) - """ - - try: - coldata = self.coldata[coldesc] - return AES256ColumnEncryptionPolicy._build_cipher(coldata.key, self.mode, self.iv) - except KeyError: - raise ValueError("Could not find column {}".format(coldesc)) - - # Explicitly use a class method here to avoid caching self - @lru_cache(maxsize=128) - def _build_cipher(key, mode, iv): - return Cipher(algorithms.AES256(key), mode(iv)) diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst index 1392972fa6..5cfb736c1f 100644 --- a/docs/column_encryption.rst +++ b/docs/column_encryption.rst @@ -24,7 +24,8 @@ when it's created. import os - from cassandra.policies import ColDesc, AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES + from cassandra.policies import ColDesc + from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES key = os.urandom(AES256_KEY_SIZE_BYTES) cl_policy = AES256ColumnEncryptionPolicy() diff --git a/docs/installation.rst b/docs/installation.rst index 4a4db3b172..db70ce4be6 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -26,7 +26,7 @@ To check if the installation was successful, you can run:: python -c 'import cassandra; print cassandra.__version__' -It should print something like "3.22.0". +It should print something like "3.27.0". 
(*Optional*) Compression Support -------------------------------- diff --git a/requirements.txt b/requirements.txt index 0f84701d5a..100a12905a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ -cryptography >= 35.0 geomet>=0.1,<0.3 diff --git a/setup.py b/setup.py index 889450b1d3..e9c9be61dd 100644 --- a/setup.py +++ b/setup.py @@ -416,12 +416,12 @@ def run_setup(extensions): dependencies = [ 'geomet>=0.1,<0.3', - 'pyyaml > 5.0', - 'cryptography>=35.0' + 'pyyaml > 5.0' ] _EXTRAS_REQUIRE = { - 'graph': ['gremlinpython==3.4.6'] + 'graph': ['gremlinpython==3.4.6'], + 'cle': ['cryptography>=35.0'] } setup( @@ -440,7 +440,8 @@ def run_setup(extensions): packages=[ 'cassandra', 'cassandra.io', 'cassandra.cqlengine', 'cassandra.graph', 'cassandra.datastax', 'cassandra.datastax.insights', 'cassandra.datastax.graph', - 'cassandra.datastax.graph.fluent', 'cassandra.datastax.cloud', 'cassandra.scylla' + 'cassandra.datastax.graph.fluent', 'cassandra.datastax.cloud', 'cassandra.scylla', + 'cassandra.column_encryption' ], keywords='cassandra,cql,orm,dse,graph', include_package_data=True, diff --git a/tests/integration/standard/column_encryption/test_policies.py b/tests/integration/standard/column_encryption/test_policies.py new file mode 100644 index 0000000000..87bfde3c31 --- /dev/null +++ b/tests/integration/standard/column_encryption/test_policies.py @@ -0,0 +1,94 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest + +from tests.integration import use_singledc, TestCluster + +from cassandra.policies import ColDesc + +from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, \ + AES256_KEY_SIZE_BYTES + +def setup_module(): + use_singledc() + +class ColumnEncryptionPolicyTest(unittest.TestCase): + + def _recreate_keyspace(self, session): + session.execute("drop keyspace if exists foo") + session.execute("CREATE KEYSPACE foo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}") + session.execute("CREATE TABLE foo.bar(encrypted blob, unencrypted int, primary key(unencrypted))") + + def test_end_to_end_prepared(self): + + # We only currently perform testing on a single type/expected value pair since CLE functionality is essentially + # independent of the underlying type. We intercept data after it's been encoded when it's going out and before it's + # encoded when coming back; the actual types of the data involved don't impact us. + expected = 12345 + expected_type = "int" + + key = os.urandom(AES256_KEY_SIZE_BYTES) + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('foo','bar','encrypted') + cl_policy.add_column(col_desc, key, expected_type) + + cluster = TestCluster(column_encryption_policy=cl_policy) + session = cluster.connect() + self._recreate_keyspace(session) + + prepared = session.prepare("insert into foo.bar (encrypted, unencrypted) values (?,?)") + session.execute(prepared, (expected,expected)) + + # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted + # values here to confirm that we don't interfere with regular processing of unencrypted vals. 
+ (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + # Confirm the same behaviour from a subsequent prepared statement as well + prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? allow filtering") + (encrypted,unencrypted) = session.execute(prepared, [expected]).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + def test_end_to_end_simple(self): + + expected = 67890 + expected_type = "int" + + key = os.urandom(AES256_KEY_SIZE_BYTES) + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('foo','bar','encrypted') + cl_policy.add_column(col_desc, key, expected_type) + + cluster = TestCluster(column_encryption_policy=cl_policy) + session = cluster.connect() + self._recreate_keyspace(session) + + # Use encode_and_encrypt helper function to populate date + session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)",(cl_policy.encode_and_encrypt(col_desc, expected), expected)) + + # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted + # values here to confirm that we don't interfere with regular processing of unencrypted vals. + (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) + + # Confirm the same behaviour from a subsequent prepared statement as well + prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? 
allow filtering") + (encrypted,unencrypted) = session.execute(prepared, [expected]).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) diff --git a/tests/integration/standard/test_policies.py b/tests/integration/standard/test_policies.py index 30b106fb03..bbab35a02a 100644 --- a/tests/integration/standard/test_policies.py +++ b/tests/integration/standard/test_policies.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from decimal import Decimal -import os -import random import unittest from cassandra.cluster import ExecutionProfile, EXEC_PROFILE_DEFAULT from cassandra.policies import HostFilterPolicy, RoundRobinPolicy, SimpleConvictionPolicy, \ - WhiteListRoundRobinPolicy, ExponentialBackoffRetryPolicy, ColDesc, AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES + WhiteListRoundRobinPolicy, ExponentialBackoffRetryPolicy, ColDesc from cassandra.pool import Host from cassandra.connection import DefaultEndPoint @@ -110,71 +107,3 @@ def test_exponential_retries(self): CREATE KEYSPACE preparedtests WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} """) - -class ColumnEncryptionPolicyTest(unittest.TestCase): - - def _recreate_keyspace(self, session): - session.execute("drop keyspace if exists foo") - session.execute("CREATE KEYSPACE foo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}") - session.execute("CREATE TABLE foo.bar(encrypted blob, unencrypted int, primary key(unencrypted))") - - def test_end_to_end_prepared(self): - - # We only currently perform testing on a single type/expected value pair since CLE functionality is essentially - # independent of the underlying type. We intercept data after it's been encoded when it's going out and before it's - # encoded when coming back; the actual types of the data involved don't impact us. 
- expected = 12345 - expected_type = "int" - - key = os.urandom(AES256_KEY_SIZE_BYTES) - cl_policy = AES256ColumnEncryptionPolicy() - col_desc = ColDesc('foo','bar','encrypted') - cl_policy.add_column(col_desc, key, expected_type) - - cluster = TestCluster(column_encryption_policy=cl_policy) - session = cluster.connect() - self._recreate_keyspace(session) - - prepared = session.prepare("insert into foo.bar (encrypted, unencrypted) values (?,?)") - session.execute(prepared, (expected,expected)) - - # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted - # values here to confirm that we don't interfere with regular processing of unencrypted vals. - (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() - self.assertEquals(expected, encrypted) - self.assertEquals(expected, unencrypted) - - # Confirm the same behaviour from a subsequent prepared statement as well - prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? allow filtering") - (encrypted,unencrypted) = session.execute(prepared, [expected]).one() - self.assertEquals(expected, encrypted) - self.assertEquals(expected, unencrypted) - - def test_end_to_end_simple(self): - - expected = 67890 - expected_type = "int" - - key = os.urandom(AES256_KEY_SIZE_BYTES) - cl_policy = AES256ColumnEncryptionPolicy() - col_desc = ColDesc('foo','bar','encrypted') - cl_policy.add_column(col_desc, key, expected_type) - - cluster = TestCluster(column_encryption_policy=cl_policy) - session = cluster.connect() - self._recreate_keyspace(session) - - # Use encode_and_encrypt helper function to populate date - session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)",(cl_policy.encode_and_encrypt(col_desc, expected), expected)) - - # A straight select from the database will now return the decrypted bits. 
We select both encrypted and unencrypted - # values here to confirm that we don't interfere with regular processing of unencrypted vals. - (encrypted,unencrypted) = session.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() - self.assertEquals(expected, encrypted) - self.assertEquals(expected, unencrypted) - - # Confirm the same behaviour from a subsequent prepared statement as well - prepared = session.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? allow filtering") - (encrypted,unencrypted) = session.execute(prepared, [expected]).one() - self.assertEquals(expected, encrypted) - self.assertEquals(expected, unencrypted) diff --git a/tests/unit/column_encryption/test_policies.py b/tests/unit/column_encryption/test_policies.py new file mode 100644 index 0000000000..f6b06a3ade --- /dev/null +++ b/tests/unit/column_encryption/test_policies.py @@ -0,0 +1,149 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest + +from cassandra.policies import ColDesc +from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, \ + AES256_BLOCK_SIZE_BYTES, AES256_KEY_SIZE_BYTES + +class AES256ColumnEncryptionPolicyTest(unittest.TestCase): + + def _random_block(self): + return os.urandom(AES256_BLOCK_SIZE_BYTES) + + def _random_key(self): + return os.urandom(AES256_KEY_SIZE_BYTES) + + def _test_round_trip(self, bytes): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + policy.add_column(coldesc, self._random_key(), "blob") + encrypted_bytes = policy.encrypt(coldesc, bytes) + self.assertEqual(bytes, policy.decrypt(coldesc, encrypted_bytes)) + + def test_no_padding_necessary(self): + self._test_round_trip(self._random_block()) + + def test_some_padding_required(self): + for byte_size in range(1,AES256_BLOCK_SIZE_BYTES - 1): + bytes = os.urandom(byte_size) + self._test_round_trip(bytes) + for byte_size in range(AES256_BLOCK_SIZE_BYTES + 1,(2 * AES256_BLOCK_SIZE_BYTES) - 1): + bytes = os.urandom(byte_size) + self._test_round_trip(bytes) + + def test_add_column_invalid_key_size_raises(self): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + for key_size in range(1,AES256_KEY_SIZE_BYTES - 1): + with self.assertRaises(ValueError): + policy.add_column(coldesc, os.urandom(key_size), "blob") + for key_size in range(AES256_KEY_SIZE_BYTES + 1,(2 * AES256_KEY_SIZE_BYTES) - 1): + with self.assertRaises(ValueError): + policy.add_column(coldesc, os.urandom(key_size), "blob") + + def test_add_column_null_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + policy.add_column(None, self._random_block(), "blob") + + def test_add_column_null_key_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, None, "blob") + + def 
test_add_column_null_type_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_block(), None) + + def test_add_column_unknown_type_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_block(), "foobar") + + def test_encode_and_encrypt_null_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(None, self._random_block()) + + def test_encode_and_encrypt_null_obj_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(coldesc, None) + + def test_encode_and_encrypt_unknown_coldesc_raises(self): + with self.assertRaises(ValueError): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encode_and_encrypt(ColDesc('ks2','table2','col2'), self._random_block()) + + def test_contains_column(self): + coldesc = ColDesc('ks1','table1','col1') + policy = AES256ColumnEncryptionPolicy() + policy.add_column(coldesc, self._random_key(), "blob") + self.assertTrue(policy.contains_column(coldesc)) + self.assertFalse(policy.contains_column(ColDesc('ks2','table1','col1'))) + self.assertFalse(policy.contains_column(ColDesc('ks1','table2','col1'))) + self.assertFalse(policy.contains_column(ColDesc('ks1','table1','col2'))) + self.assertFalse(policy.contains_column(ColDesc('ks2','table2','col2'))) + + def test_encrypt_unknown_column(self): + with self.assertRaises(ValueError): + policy = 
AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + policy.encrypt(ColDesc('ks2','table2','col2'), self._random_block()) + + def test_decrypt_unknown_column(self): + policy = AES256ColumnEncryptionPolicy() + coldesc = ColDesc('ks1','table1','col1') + policy.add_column(coldesc, self._random_key(), "blob") + encrypted_bytes = policy.encrypt(coldesc, self._random_block()) + with self.assertRaises(ValueError): + policy.decrypt(ColDesc('ks2','table2','col2'), encrypted_bytes) + + def test_cache_info(self): + coldesc1 = ColDesc('ks1','table1','col1') + coldesc2 = ColDesc('ks2','table2','col2') + coldesc3 = ColDesc('ks3','table3','col3') + policy = AES256ColumnEncryptionPolicy() + for coldesc in [coldesc1, coldesc2, coldesc3]: + policy.add_column(coldesc, self._random_key(), "blob") + + # First run for this coldesc should be a miss, everything else should be a cache hit + for _ in range(10): + policy.encrypt(coldesc1, self._random_block()) + cache_info = policy.cache_info() + self.assertEqual(cache_info.hits, 9) + self.assertEqual(cache_info.misses, 1) + self.assertEqual(cache_info.maxsize, 128) + + # Important note: we're measuring the size of the cache of ciphers, NOT stored + # keys. 
We won't have a cipher here until we actually encrypt something + self.assertEqual(cache_info.currsize, 1) + policy.encrypt(coldesc2, self._random_block()) + self.assertEqual(policy.cache_info().currsize, 2) + policy.encrypt(coldesc3, self._random_block()) + self.assertEqual(policy.cache_info().currsize, 3) diff --git a/tests/unit/test_policies.py b/tests/unit/test_policies.py index 849a6c5e94..24016ebc8e 100644 --- a/tests/unit/test_policies.py +++ b/tests/unit/test_policies.py @@ -16,7 +16,6 @@ from itertools import islice, cycle from mock import Mock, patch, call -import os from random import randint import pytest from _thread import LockType @@ -33,8 +32,7 @@ RetryPolicy, WriteType, DowngradingConsistencyRetryPolicy, ConstantReconnectionPolicy, LoadBalancingPolicy, ConvictionPolicy, ReconnectionPolicy, FallthroughRetryPolicy, - IdentityTranslator, EC2MultiRegionTranslator, HostFilterPolicy, ExponentialBackoffRetryPolicy, - ColDesc, AES256ColumnEncryptionPolicy, AES256_BLOCK_SIZE_BYTES, AES256_KEY_SIZE_BYTES) + IdentityTranslator, EC2MultiRegionTranslator, HostFilterPolicy, ExponentialBackoffRetryPolicy) from cassandra.connection import DefaultEndPoint, UnixSocketEndPoint from cassandra.pool import Host from cassandra.query import Statement @@ -1581,132 +1579,3 @@ def test_create_whitelist(self): # Only the filtered replicas should be allowed self.assertEqual(set(query_plan), {Host(DefaultEndPoint("127.0.0.1"), SimpleConvictionPolicy), Host(DefaultEndPoint("127.0.0.4"), SimpleConvictionPolicy)}) - -class AES256ColumnEncryptionPolicyTest(unittest.TestCase): - - def _random_block(self): - return os.urandom(AES256_BLOCK_SIZE_BYTES) - - def _random_key(self): - return os.urandom(AES256_KEY_SIZE_BYTES) - - def _test_round_trip(self, bytes): - coldesc = ColDesc('ks1','table1','col1') - policy = AES256ColumnEncryptionPolicy() - policy.add_column(coldesc, self._random_key(), "blob") - encrypted_bytes = policy.encrypt(coldesc, bytes) - self.assertEqual(bytes, 
policy.decrypt(coldesc, encrypted_bytes)) - - def test_no_padding_necessary(self): - self._test_round_trip(self._random_block()) - - def test_some_padding_required(self): - for byte_size in range(1,AES256_BLOCK_SIZE_BYTES - 1): - bytes = os.urandom(byte_size) - self._test_round_trip(bytes) - for byte_size in range(AES256_BLOCK_SIZE_BYTES + 1,(2 * AES256_BLOCK_SIZE_BYTES) - 1): - bytes = os.urandom(byte_size) - self._test_round_trip(bytes) - - def test_add_column_invalid_key_size_raises(self): - coldesc = ColDesc('ks1','table1','col1') - policy = AES256ColumnEncryptionPolicy() - for key_size in range(1,AES256_KEY_SIZE_BYTES - 1): - with self.assertRaises(ValueError): - policy.add_column(coldesc, os.urandom(key_size), "blob") - for key_size in range(AES256_KEY_SIZE_BYTES + 1,(2 * AES256_KEY_SIZE_BYTES) - 1): - with self.assertRaises(ValueError): - policy.add_column(coldesc, os.urandom(key_size), "blob") - - def test_add_column_null_coldesc_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - policy.add_column(None, self._random_block(), "blob") - - def test_add_column_null_key_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, None, "blob") - - def test_add_column_null_type_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_block(), None) - - def test_add_column_unknown_type_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_block(), "foobar") - - def test_encode_and_encrypt_null_coldesc_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, 
self._random_key(), "blob") - policy.encode_and_encrypt(None, self._random_block()) - - def test_encode_and_encrypt_null_obj_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_key(), "blob") - policy.encode_and_encrypt(coldesc, None) - - def test_encode_and_encrypt_unknown_coldesc_raises(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_key(), "blob") - policy.encode_and_encrypt(ColDesc('ks2','table2','col2'), self._random_block()) - - def test_contains_column(self): - coldesc = ColDesc('ks1','table1','col1') - policy = AES256ColumnEncryptionPolicy() - policy.add_column(coldesc, self._random_key(), "blob") - self.assertTrue(policy.contains_column(coldesc)) - self.assertFalse(policy.contains_column(ColDesc('ks2','table1','col1'))) - self.assertFalse(policy.contains_column(ColDesc('ks1','table2','col1'))) - self.assertFalse(policy.contains_column(ColDesc('ks1','table1','col2'))) - self.assertFalse(policy.contains_column(ColDesc('ks2','table2','col2'))) - - def test_encrypt_unknown_column(self): - with self.assertRaises(ValueError): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_key(), "blob") - policy.encrypt(ColDesc('ks2','table2','col2'), self._random_block()) - - def test_decrypt_unknown_column(self): - policy = AES256ColumnEncryptionPolicy() - coldesc = ColDesc('ks1','table1','col1') - policy.add_column(coldesc, self._random_key(), "blob") - encrypted_bytes = policy.encrypt(coldesc, self._random_block()) - with self.assertRaises(ValueError): - policy.decrypt(ColDesc('ks2','table2','col2'), encrypted_bytes) - - def test_cache_info(self): - coldesc1 = ColDesc('ks1','table1','col1') - coldesc2 = ColDesc('ks2','table2','col2') - coldesc3 = 
ColDesc('ks3','table3','col3') - policy = AES256ColumnEncryptionPolicy() - for coldesc in [coldesc1, coldesc2, coldesc3]: - policy.add_column(coldesc, self._random_key(), "blob") - - # First run for this coldesc should be a miss, everything else should be a cache hit - for _ in range(10): - policy.encrypt(coldesc1, self._random_block()) - cache_info = policy.cache_info() - self.assertEqual(cache_info.hits, 9) - self.assertEqual(cache_info.misses, 1) - self.assertEqual(cache_info.maxsize, 128) - - # Important note: we're measuring the size of the cache of ciphers, NOT stored - # keys. We won't have a cipher here until we actually encrypt something - self.assertEqual(cache_info.currsize, 1) - policy.encrypt(coldesc2, self._random_block()) - self.assertEqual(policy.cache_info().currsize, 2) - policy.encrypt(coldesc3, self._random_block()) - self.assertEqual(policy.cache_info().currsize, 3) From a1c6bf758ad2e6189c83b3d5121a966ed140ccb4 Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Fri, 2 Jun 2023 10:52:01 -0500 Subject: [PATCH 14/20] PYTHON-1350 Store IV along with encrypted text when using column-level encryption (#1160) --- cassandra/column_encryption/_policies.py | 45 +++++++----- .../column_encryption/test_policies.py | 71 +++++++++++++++---- tests/unit/column_encryption/test_policies.py | 20 ++++++ 3 files changed, 105 insertions(+), 31 deletions(-) diff --git a/cassandra/column_encryption/_policies.py b/cassandra/column_encryption/_policies.py index e049ba2d22..ef8097bfbd 100644 --- a/cassandra/column_encryption/_policies.py +++ b/cassandra/column_encryption/_policies.py @@ -35,15 +35,27 @@ class AES256ColumnEncryptionPolicy(ColumnEncryptionPolicy): - # CBC uses an IV that's the same size as the block size - # - # TODO: Need to find some way to expose mode options - # (CBC etc.) 
without leaking classes from the underlying - # impl here - def __init__(self, mode = modes.CBC, iv = os.urandom(AES256_BLOCK_SIZE_BYTES)): - - self.mode = mode + # Fix block cipher mode for now. IV size is a function of block cipher used + # so fixing this avoids (possibly unnecessary) validation logic here. + mode = modes.CBC + + # "iv" param here expects a bytearray that's the same size as the block + # size for AES-256 (128 bits or 16 bytes). If none is provided a new one + # will be randomly generated, but in this case the IV should be recorded and + # preserved or else you will not be able to decrypt any data encrypted by this + # policy. + def __init__(self, iv=None): + + # CBC uses an IV that's the same size as the block size + # + # Avoid defining IV with a default arg in order to stay away from + # any issues around the caching of default args self.iv = iv + if self.iv: + if not len(self.iv) == AES256_BLOCK_SIZE_BYTES: + raise ValueError("This policy uses AES-256 with CBC mode and therefore expects a 128-bit initialization vector") + else: + self.iv = os.urandom(AES256_BLOCK_SIZE_BYTES) # ColData for a given ColDesc is always preserved. 
We only create a Cipher # when there's an actual need to for a given ColDesc @@ -64,11 +76,13 @@ def encrypt(self, coldesc, obj_bytes): cipher = self._get_cipher(coldesc) encryptor = cipher.encryptor() - return encryptor.update(padded_bytes) + encryptor.finalize() + return self.iv + encryptor.update(padded_bytes) + encryptor.finalize() - def decrypt(self, coldesc, encrypted_bytes): + def decrypt(self, coldesc, bytes): - cipher = self._get_cipher(coldesc) + iv = bytes[:AES256_BLOCK_SIZE_BYTES] + encrypted_bytes = bytes[AES256_BLOCK_SIZE_BYTES:] + cipher = self._get_cipher(coldesc, iv=iv) decryptor = cipher.decryptor() padded_bytes = decryptor.update(encrypted_bytes) + decryptor.finalize() @@ -108,19 +122,18 @@ def cache_info(self): def column_type(self, coldesc): return self.coldata[coldesc].type - def _get_cipher(self, coldesc): + def _get_cipher(self, coldesc, iv=None): """ Access relevant state from this instance necessary to create a Cipher and then get one, hopefully returning a cached instance if we've already done so (and it hasn't been evicted) """ - try: coldata = self.coldata[coldesc] - return AES256ColumnEncryptionPolicy._build_cipher(coldata.key, self.mode, self.iv) + return AES256ColumnEncryptionPolicy._build_cipher(coldata.key, iv or self.iv) except KeyError: raise ValueError("Could not find column {}".format(coldesc)) # Explicitly use a class method here to avoid caching self @lru_cache(maxsize=128) - def _build_cipher(key, mode, iv): - return Cipher(algorithms.AES256(key), mode(iv)) + def _build_cipher(key, iv): + return Cipher(algorithms.AES256(key), AES256ColumnEncryptionPolicy.mode(iv)) diff --git a/tests/integration/standard/column_encryption/test_policies.py b/tests/integration/standard/column_encryption/test_policies.py index 87bfde3c31..bb84c0352c 100644 --- a/tests/integration/standard/column_encryption/test_policies.py +++ b/tests/integration/standard/column_encryption/test_policies.py @@ -20,7 +20,7 @@ from cassandra.policies import ColDesc 
from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, \ - AES256_KEY_SIZE_BYTES + AES256_KEY_SIZE_BYTES, AES256_BLOCK_SIZE_BYTES def setup_module(): use_singledc() @@ -32,25 +32,28 @@ def _recreate_keyspace(self, session): session.execute("CREATE KEYSPACE foo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}") session.execute("CREATE TABLE foo.bar(encrypted blob, unencrypted int, primary key(unencrypted))") + def _create_policy(self, key, iv = None): + cl_policy = AES256ColumnEncryptionPolicy() + col_desc = ColDesc('foo','bar','encrypted') + cl_policy.add_column(col_desc, key, "int") + return (col_desc, cl_policy) + def test_end_to_end_prepared(self): # We only currently perform testing on a single type/expected value pair since CLE functionality is essentially # independent of the underlying type. We intercept data after it's been encoded when it's going out and before it's # encoded when coming back; the actual types of the data involved don't impact us. - expected = 12345 - expected_type = "int" + expected = 0 key = os.urandom(AES256_KEY_SIZE_BYTES) - cl_policy = AES256ColumnEncryptionPolicy() - col_desc = ColDesc('foo','bar','encrypted') - cl_policy.add_column(col_desc, key, expected_type) - + (_, cl_policy) = self._create_policy(key) cluster = TestCluster(column_encryption_policy=cl_policy) session = cluster.connect() self._recreate_keyspace(session) prepared = session.prepare("insert into foo.bar (encrypted, unencrypted) values (?,?)") - session.execute(prepared, (expected,expected)) + for i in range(100): + session.execute(prepared, (i, i)) # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted # values here to confirm that we don't interfere with regular processing of unencrypted vals. 
@@ -66,20 +69,19 @@ def test_end_to_end_prepared(self): def test_end_to_end_simple(self): - expected = 67890 - expected_type = "int" + expected = 1 key = os.urandom(AES256_KEY_SIZE_BYTES) - cl_policy = AES256ColumnEncryptionPolicy() - col_desc = ColDesc('foo','bar','encrypted') - cl_policy.add_column(col_desc, key, expected_type) - + (col_desc, cl_policy) = self._create_policy(key) cluster = TestCluster(column_encryption_policy=cl_policy) session = cluster.connect() self._recreate_keyspace(session) # Use encode_and_encrypt helper function to populate date - session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)",(cl_policy.encode_and_encrypt(col_desc, expected), expected)) + for i in range(1,100): + self.assertIsNotNone(i) + encrypted = cl_policy.encode_and_encrypt(col_desc, i) + session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)", (encrypted, i)) # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted # values here to confirm that we don't interfere with regular processing of unencrypted vals. @@ -92,3 +94,42 @@ def test_end_to_end_simple(self): (encrypted,unencrypted) = session.execute(prepared, [expected]).one() self.assertEquals(expected, encrypted) self.assertEquals(expected, unencrypted) + + def test_end_to_end_different_cle_contexts(self): + + expected = 2 + + key = os.urandom(AES256_KEY_SIZE_BYTES) + + # Simulate the creation of two AES256 policies at two different times. Python caches + # default param args at function definition time so a single value will be used any time + # the default val is used. Upshot is that within the same test we'll always have the same + # IV if we rely on the default args, so manually introduce some variation here to simulate + # what actually happens if you have two distinct sessions created at two different times. 
+ iv1 = os.urandom(AES256_BLOCK_SIZE_BYTES) + (col_desc1, cl_policy1) = self._create_policy(key, iv=iv1) + cluster1 = TestCluster(column_encryption_policy=cl_policy1) + session1 = cluster1.connect() + self._recreate_keyspace(session1) + + # Use encode_and_encrypt helper function to populate date + for i in range(1,100): + self.assertIsNotNone(i) + encrypted = cl_policy1.encode_and_encrypt(col_desc1, i) + session1.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)", (encrypted, i)) + session1.shutdown() + cluster1.shutdown() + + # Explicitly clear the class-level cache here; we're trying to simulate a second connection from a completely new process and + # that would entail not re-using any cached ciphers + AES256ColumnEncryptionPolicy._build_cipher.cache_clear() + cache_info = cl_policy1.cache_info() + self.assertEqual(cache_info.currsize, 0) + + iv2 = os.urandom(AES256_BLOCK_SIZE_BYTES) + (_, cl_policy2) = self._create_policy(key, iv=iv2) + cluster2 = TestCluster(column_encryption_policy=cl_policy2) + session2 = cluster2.connect() + (encrypted,unencrypted) = session2.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(expected, encrypted) + self.assertEquals(expected, unencrypted) diff --git a/tests/unit/column_encryption/test_policies.py b/tests/unit/column_encryption/test_policies.py index f6b06a3ade..38136c69d4 100644 --- a/tests/unit/column_encryption/test_policies.py +++ b/tests/unit/column_encryption/test_policies.py @@ -55,6 +55,23 @@ def test_add_column_invalid_key_size_raises(self): with self.assertRaises(ValueError): policy.add_column(coldesc, os.urandom(key_size), "blob") + def test_add_column_invalid_iv_size_raises(self): + def test_iv_size(iv_size): + policy = AES256ColumnEncryptionPolicy(iv = os.urandom(iv_size)) + policy.add_column(coldesc, os.urandom(AES256_KEY_SIZE_BYTES), "blob") + policy.encrypt(coldesc, os.urandom(128)) + + coldesc = 
ColDesc('ks1','table1','col1') + for iv_size in range(1,AES256_BLOCK_SIZE_BYTES - 1): + with self.assertRaises(ValueError): + test_iv_size(iv_size) + for iv_size in range(AES256_BLOCK_SIZE_BYTES + 1,(2 * AES256_BLOCK_SIZE_BYTES) - 1): + with self.assertRaises(ValueError): + test_iv_size(iv_size) + + # Finally, confirm that the expected IV size has no issue + test_iv_size(AES256_BLOCK_SIZE_BYTES) + def test_add_column_null_coldesc_raises(self): with self.assertRaises(ValueError): policy = AES256ColumnEncryptionPolicy() @@ -125,6 +142,9 @@ def test_decrypt_unknown_column(self): policy.decrypt(ColDesc('ks2','table2','col2'), encrypted_bytes) def test_cache_info(self): + # Exclude any interference from tests above + AES256ColumnEncryptionPolicy._build_cipher.cache_clear() + coldesc1 = ColDesc('ks1','table1','col1') coldesc2 = ColDesc('ks2','table2','col2') coldesc3 = ColDesc('ks3','table3','col3') From 912449213ee08709492eb249848d8a7098e435cd Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Fri, 2 Jun 2023 14:57:09 -0500 Subject: [PATCH 15/20] PYTHON-1356 Create session-specific protocol handlers to contain session-specific CLE policies (#1165) --- cassandra/cluster.py | 15 +++++--- .../column_encryption/test_policies.py | 37 ++++++++++++++++++- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/cassandra/cluster.py b/cassandra/cluster.py index 2c3f458eff..ee96203aa0 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -2668,12 +2668,6 @@ def __init__(self, cluster, hosts, keyspace=None): self.encoder = Encoder() - if self.cluster.column_encryption_policy is not None: - try: - self.client_protocol_handler.column_encryption_policy = self.cluster.column_encryption_policy - except AttributeError: - log.info("Unable to set column encryption policy for session") - # create connection pools in parallel self._initial_connect_futures = set() for host in hosts: @@ -2694,6 +2688,15 @@ def __init__(self, cluster, hosts, keyspace=None): self.session_id = 
uuid.uuid4() self._graph_paging_available = self._check_graph_paging_available() + if self.cluster.column_encryption_policy is not None: + try: + self.client_protocol_handler = type( + str(self.session_id) + "-ProtocolHandler", + (ProtocolHandler,), + {"column_encryption_policy": self.cluster.column_encryption_policy}) + except AttributeError: + log.info("Unable to set column encryption policy for session") + if self.cluster.monitor_reporting_enabled: cc_host = self.cluster.get_control_connection_host() valid_insights_version = (cc_host and version_supports_insights(cc_host.dse_version)) diff --git a/tests/integration/standard/column_encryption/test_policies.py b/tests/integration/standard/column_encryption/test_policies.py index bb84c0352c..dea6b6d39e 100644 --- a/tests/integration/standard/column_encryption/test_policies.py +++ b/tests/integration/standard/column_encryption/test_policies.py @@ -95,7 +95,11 @@ def test_end_to_end_simple(self): self.assertEquals(expected, encrypted) self.assertEquals(expected, unencrypted) - def test_end_to_end_different_cle_contexts(self): + def test_end_to_end_different_cle_contexts_different_ivs(self): + """ + Test to validate PYTHON-1350. We should be able to decode the data from two different contexts (with two different IVs) + since the IV used to decrypt the data is actually now stored with the data. + """ expected = 2 @@ -133,3 +137,34 @@ def test_end_to_end_different_cle_contexts(self): (encrypted,unencrypted) = session2.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() self.assertEquals(expected, encrypted) self.assertEquals(expected, unencrypted) + + def test_end_to_end_different_cle_contexts_different_policies(self): + """ + Test to validate PYTHON-1356. Class variables used to pass CLE policy down to protocol handler shouldn't persist. 
+ """ + + expected = 3 + + key = os.urandom(AES256_KEY_SIZE_BYTES) + (col_desc, cl_policy) = self._create_policy(key) + cluster = TestCluster(column_encryption_policy=cl_policy) + session = cluster.connect() + self._recreate_keyspace(session) + + # Use encode_and_encrypt helper function to populate date + session.execute("insert into foo.bar (encrypted, unencrypted) values (%s,%s)",(cl_policy.encode_and_encrypt(col_desc, expected), expected)) + + # We now open a new session _without_ the CLE policy specified. We should _not_ be able to read decrypted bits from this session. + cluster2 = TestCluster() + session2 = cluster2.connect() + + # A straight select from the database will now return the decrypted bits. We select both encrypted and unencrypted + # values here to confirm that we don't interfere with regular processing of unencrypted vals. + (encrypted,unencrypted) = session2.execute("select encrypted, unencrypted from foo.bar where unencrypted = %s allow filtering", (expected,)).one() + self.assertEquals(cl_policy.encode_and_encrypt(col_desc, expected), encrypted) + self.assertEquals(expected, unencrypted) + + # Confirm the same behaviour from a subsequent prepared statement as well + prepared = session2.prepare("select encrypted, unencrypted from foo.bar where unencrypted = ? 
allow filtering") + (encrypted,unencrypted) = session2.execute(prepared, [expected]).one() + self.assertEquals(cl_policy.encode_and_encrypt(col_desc, expected), encrypted) From 3a9ac29db32ec6d9b6a973242076bbd2a4ea2098 Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 5 Jun 2023 09:23:25 -0500 Subject: [PATCH 16/20] CONN-38 Notes for 3.27.0 on PYTHON-1350 (#1166) --- docs/column_encryption.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst index 5cfb736c1f..e18b9286ed 100644 --- a/docs/column_encryption.rst +++ b/docs/column_encryption.rst @@ -14,6 +14,20 @@ also available, although in this case values must be manually encrypted and/or d Client-side encryption and decryption should work against all versions of Cassandra and DSE. It does not utilize any server-side functionality to do its work. +WARNING: Consider upgrading to 3.28.0 or later +------------------------------------------------ +There is a significant issue with the column encryption functionality in Python driver 3.27.0. +To be able to decrypt your data, you must preserve the cipher initialization vector (IV) used by +the :class:`~.AES256ColumnEncryptionPolicy` when your data was written. +To decrypt your data, you must supply this IV when creating a policy to read this data. +If you do not supply this IV in the policy to read this data, you will **NOT BE ABLE TO DECRYPT YOUR DATA**. +See +`PYTHON-1350 `_ for more detail. + +DataStax recommends upgrading to Python driver 3.28.0 or later to avoid this issue. 3.28.0 or later manages the IV automatically. +Because of this change in functionality, any encrypted data written in 3.27.0 will **NOT** be readable by 3.28.0 or later. +After upgrading to Python driver 3.28.0 or later, it is critical that you re-encrypt your data with the new driver version. 
+ Configuration ------------- Client-side encryption is enabled by creating an instance of a subclass of :class:`~.ColumnEncryptionPolicy` From 6b46906927a8518a0a4f02625c9f49b99321248b Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 5 Jun 2023 09:36:50 -0500 Subject: [PATCH 17/20] PYTHON-1352 Add vector type, codec + support for parsing CQL type (#1161) --- cassandra/__init__.py | 2 +- cassandra/cqltypes.py | 36 +++++++++++++++++++++++++++++++++--- tests/unit/test_types.py | 22 +++++++++++++++++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/cassandra/__init__.py b/cassandra/__init__.py index 301bcbf1c1..2a39f08a4c 100644 --- a/cassandra/__init__.py +++ b/cassandra/__init__.py @@ -23,7 +23,7 @@ def emit(self, record): logging.getLogger('cassandra').addHandler(NullHandler()) -__version_info__ = (3, 27, 0) +__version_info__ = (3, 28, 0b1) __version__ = '.'.join(map(str, __version_info__)) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index 2daa1603a4..8529897958 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -226,13 +226,15 @@ def parse_casstype_args(typestring): else: names.append(None) - ctype = lookup_casstype_simple(tok) + try: + ctype = int(tok) + except ValueError: + ctype = lookup_casstype_simple(tok) types.append(ctype) # return the first (outer) type, which will have all parameters applied return args[0][0][0] - def lookup_casstype(casstype): """ Given a Cassandra type as a string (possibly including parameters), hand @@ -286,7 +288,7 @@ class _CassandraType(object, metaclass=CassandraTypeType): """ def __repr__(self): - return '<%s( %r )>' % (self.cql_parameterized_type(), self.val) + return '<%s>' % (self.cql_parameterized_type()) @classmethod def from_binary(cls, byts, protocol_version): @@ -1402,3 +1404,31 @@ def serialize(cls, v, protocol_version): buf.write(int8_pack(cls._encode_precision(bound.precision))) return buf.getvalue() + +class VectorType(_CassandraType): + typename = 
'org.apache.cassandra.db.marshal.VectorType' + vector_size = 0 + subtype = None + + @classmethod + def apply_parameters(cls, params, names): + assert len(params) == 2 + subtype = lookup_casstype(params[0]) + vsize = params[1] + return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), {'vector_size': vsize, 'subtype': subtype}) + + @classmethod + def deserialize(cls, byts, protocol_version): + indexes = (4 * x for x in range(0, cls.vector_size)) + return [cls.subtype.deserialize(byts[idx:idx + 4], protocol_version) for idx in indexes] + + @classmethod + def serialize(cls, v, protocol_version): + buf = io.BytesIO() + for item in v: + buf.write(cls.subtype.serialize(item, protocol_version)) + return buf.getvalue() + + @classmethod + def cql_parameterized_type(cls): + return "%s<%s, %s>" % (cls.typename, cls.subtype.typename, cls.vector_size) diff --git a/tests/unit/test_types.py b/tests/unit/test_types.py index b77c9dcdb4..a06bbd452d 100644 --- a/tests/unit/test_types.py +++ b/tests/unit/test_types.py @@ -25,7 +25,8 @@ EmptyValue, LongType, SetType, UTF8Type, cql_typename, int8_pack, int64_pack, lookup_casstype, lookup_casstype_simple, parse_casstype_args, - int32_pack, Int32Type, ListType, MapType + int32_pack, Int32Type, ListType, MapType, VectorType, + FloatType ) from cassandra.encoder import cql_quote from cassandra.pool import Host @@ -188,6 +189,12 @@ class BarType(FooType): self.assertEqual(UTF8Type, ctype.subtypes[2]) self.assertEqual([b'city', None, b'zip'], ctype.names) + def test_parse_casstype_vector(self): + ctype = parse_casstype_args("org.apache.cassandra.db.marshal.VectorType(org.apache.cassandra.db.marshal.FloatType, 3)") + self.assertTrue(issubclass(ctype, VectorType)) + self.assertEqual(3, ctype.vector_size) + self.assertEqual(FloatType, ctype.subtype) + def test_empty_value(self): self.assertEqual(str(EmptyValue()), 'EMPTY') @@ -301,6 +308,19 @@ def test_cql_quote(self): self.assertEqual(cql_quote('test'), "'test'") 
self.assertEqual(cql_quote(0), '0') + def test_vector_round_trip(self): + base = [3.4, 2.9, 41.6, 12.0] + ctype = parse_casstype_args("org.apache.cassandra.db.marshal.VectorType(org.apache.cassandra.db.marshal.FloatType, 4)") + base_bytes = ctype.serialize(base, 0) + self.assertEqual(16, len(base_bytes)) + result = ctype.deserialize(base_bytes, 0) + self.assertEqual(len(base), len(result)) + for idx in range(0,len(base)): + self.assertAlmostEqual(base[idx], result[idx], places=5) + + def test_vector_cql_parameterized_type(self): + ctype = parse_casstype_args("org.apache.cassandra.db.marshal.VectorType(org.apache.cassandra.db.marshal.FloatType, 4)") + self.assertEqual(ctype.cql_parameterized_type(), "org.apache.cassandra.db.marshal.VectorType<float, 4>") ZERO = datetime.timedelta(0) From b2af730f03cbdbb8bd3d29a491ec5703d7ee03a1 Mon Sep 17 00:00:00 2001 From: Bret McGuire Date: Mon, 5 Jun 2023 10:08:34 -0500 Subject: [PATCH 18/20] Release 3.28.0: changelog & version --- CHANGELOG.rst | 22 ++++++++++++++++++++++ cassandra/__init__.py | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5599e37dcb..c8a4fd3e90 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,25 @@ +3.28.0 +====== +June 5, 2023 + +Features +-------- +* Add support for vector type (PYTHON-1352) +* Cryptography module is now an optional dependency (PYTHON-1351) + +Bug Fixes +--------- +* Store IV along with encrypted text when using column-level encryption (PYTHON-1350) +* Create session-specific protocol handlers to contain session-specific CLE policies (PYTHON-1356) + +Others +------ +* Use Cython for smoke builds (PYTHON-1343) +* Don't fail when inserting UDTs with prepared queries with some missing fields (PR 1151) +* Convert print statement to function in docs (PR 1157) +* Update comment for retry policy (DOC-3278) +* Added error handling blog reference (DOC-2813) + 3.27.0 ====== May 1, 2023 diff --git a/cassandra/__init__.py b/cassandra/__init__.py
index 2a39f08a4c..89fee0a9ef 100644 --- a/cassandra/__init__.py +++ b/cassandra/__init__.py @@ -23,7 +23,7 @@ def emit(self, record): logging.getLogger('cassandra').addHandler(NullHandler()) -__version_info__ = (3, 28, 0b1) +__version_info__ = (3, 28, 0) __version__ = '.'.join(map(str, __version_info__)) From 1c5edd784def77e75d18dc123f10401aef326c3a Mon Sep 17 00:00:00 2001 From: Dmitry Kropachev Date: Fri, 9 Aug 2024 10:01:38 -0400 Subject: [PATCH 19/20] Add .eggs to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 80739b7453..26af6e1dcb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.swo *.so *.egg +*.eggs *.egg-info *.attr .tox From 569271fac654a1993ee126f8ca16469bb28ea618 Mon Sep 17 00:00:00 2001 From: Dmitry Kropachev Date: Tue, 13 Aug 2024 06:17:04 -0400 Subject: [PATCH 20/20] Temporarily disable AES256ColumnEncryptionPolicy There were some serious issues in policy implementation in `3.27.0`. We want to inspect the feature before making it available.
--- cassandra/cluster.py | 2 + docs/column_encryption.rst | 109 ------------------ docs/index.rst | 4 - .../column_encryption/test_policies.py | 1 + tests/unit/column_encryption/test_policies.py | 1 + 5 files changed, 4 insertions(+), 113 deletions(-) delete mode 100644 docs/column_encryption.rst diff --git a/cassandra/cluster.py b/cassandra/cluster.py index ee96203aa0..bbad441118 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -2696,6 +2696,8 @@ def __init__(self, cluster, hosts, keyspace=None): {"column_encryption_policy": self.cluster.column_encryption_policy}) except AttributeError: log.info("Unable to set column encryption policy for session") + raise Exception( + "column_encryption_policy is temporary disabled, until https://github.com/scylladb/python-driver/issues/365 is sorted out") if self.cluster.monitor_reporting_enabled: cc_host = self.cluster.get_control_connection_host() diff --git a/docs/column_encryption.rst b/docs/column_encryption.rst deleted file mode 100644 index e18b9286ed..0000000000 --- a/docs/column_encryption.rst +++ /dev/null @@ -1,109 +0,0 @@ -Column Encryption -================= - -Overview --------- -Support for client-side encryption of data was added in version 3.27.0 of the Python driver. When using -this feature data will be encrypted on-the-fly according to a specified :class:`~.ColumnEncryptionPolicy` -instance. This policy is also used to decrypt data in returned rows. If a prepared statement is used -this decryption is transparent to the user; retrieved data will be decrypted and converted into the original -type (according to definitions in the encryption policy). Support for simple (i.e. non-prepared) queries is -also available, although in this case values must be manually encrypted and/or decrypted. The -:class:`~.ColumnEncryptionPolicy` instance provides methods to assist with these operations. - -Client-side encryption and decryption should work against all versions of Cassandra and DSE. 
It does not -utilize any server-side functionality to do its work. - -WARNING: Consider upgrading to 3.28.0 or later ------------------------------------------------- -There is a significant issue with the column encryption functionality in Python driver 3.27.0. -To be able to decrypt your data, you must preserve the cipher initialization vector (IV) used by -the :class:`~.AES256ColumnEncryptionPolicy` when your data was written. -To decrypt your data, you must supply this IV when creating a policy to read this data. -If you do not supply this IV in the policy to read this data, you will **NOT BE ABLE TO DECRYPT YOUR DATA**. -See -`PYTHON-1350 <https://datastax-oss.atlassian.net/browse/PYTHON-1350>`_ for more detail. - -DataStax recommends upgrading to Python driver 3.28.0 or later to avoid this issue. 3.28.0 or later manages the IV automatically. -Because of this change in functionality, any encrypted data written in 3.27.0 will **NOT** be readable by 3.28.0 or later. -After upgrading to Python driver 3.28.0 or later, it is critical that you re-encrypt your data with the new driver version. - -Configuration -------------- -Client-side encryption is enabled by creating an instance of a subclass of :class:`~.ColumnEncryptionPolicy` -and adding information about columns to be encrypted to it. This policy is then supplied to :class:`~.Cluster` -when it's created. - -.. code-block:: python - - import os - - from cassandra.policies import ColDesc - from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, AES256_KEY_SIZE_BYTES - - key = os.urandom(AES256_KEY_SIZE_BYTES) - cl_policy = AES256ColumnEncryptionPolicy() - col_desc = ColDesc('ks1','table1','column1') - cql_type = "int" - cl_policy.add_column(col_desc, key, cql_type) - cluster = Cluster(column_encryption_policy=cl_policy) - -:class:`~.AES256ColumnEncryptionPolicy` is a subclass of :class:`~.ColumnEncryptionPolicy` which provides -encryption and decryption via AES-256.
This class is currently the only available column encryption policy -implementation, although users can certainly implement their own by subclassing :class:`~.ColumnEncryptionPolicy`. - -:class:`~.ColDesc` is a named tuple which uniquely identifies a column in a given keyspace and table. When we -have this tuple, the encryption key and the CQL type contained by this column we can add the column to the policy -using :func:`~.ColumnEncryptionPolicy.add_column`. Once we have added all column definitions to the policy we -pass it along to the cluster. - -The CQL type for the column only has meaning at the client; it is never sent to Cassandra. The encryption key -is also never sent to the server; all the server ever sees are random bytes reflecting the encrypted data. As a -result all columns containing client-side encrypted values should be declared with the CQL type "blob" at the -Cassandra server. - -Usage ------ - -Encryption -^^^^^^^^^^ -Client-side encryption shines most when used with prepared statements. A prepared statement is aware of information -about the columns in the query it was built from and we can use this information to transparently encrypt any -supplied parameters. For example, we can create a prepared statement to insert a value into column1 (as defined above) -by executing the following code after creating a :class:`~.Cluster` in the manner described above: - -.. code-block:: python - - session = cluster.connect() - prepared = session.prepare("insert into ks1.table1 (column1) values (?)") - session.execute(prepared, (1000,)) - -Our encryption policy will detect that "column1" is an encrypted column and take appropriate action. - -As mentioned above client-side encryption can also be used with simple queries, although such use cases are -certainly not transparent. 
:class:`~.ColumnEncryptionPolicy` provides a helper named -:func:`~.ColumnEncryptionPolicy.encode_and_encrypt` which will convert an input value into bytes using the -standard serialization methods employed by the driver. The result is then encrypted according to the configuration -of the policy. Using this approach the example above could be implemented along the lines of the following: - -.. code-block:: python - - session = cluster.connect() - session.execute("insert into ks1.table1 (column1) values (%s)",(cl_policy.encode_and_encrypt(col_desc, 1000),)) - -Decryption -^^^^^^^^^^ -Decryption of values returned from the server is always transparent. Whether we're executing a simple or prepared -statement encrypted columns will be decrypted automatically and made available via rows just like any other -result. - -Limitations ------------ -:class:`~.AES256ColumnEncryptionPolicy` uses the implementation of AES-256 provided by the -`cryptography <https://cryptography.io/en/latest/>`_ module. Any limitations of this module should be considered -when deploying client-side encryption. Note specifically that a Rust compiler is required for modern versions -of the cryptography package, although wheels exist for many common platforms. - -Client-side encryption has been implemented for both the default Cython and pure Python row processing logic. -This functionality has not yet been ported to the NumPy Cython implementation. During testing, -the NumPy processing works on Python 3.7 but fails for Python 3.8.
diff --git a/docs/index.rst b/docs/index.rst index 248e44b7c6..ccbd8437ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,9 +56,6 @@ Contents :doc:`scylla-cloud-serverless` Connect to ScyllaDB Cloud Serverless -:doc:`column_encryption` - Transparent client-side per-column encryption and decryption - :doc:`faq` A collection of Frequently Asked Questions @@ -83,7 +80,6 @@ Contents dates-and-times scylla-cloud scylla-cloud-serverless - column-encryption faq Getting Help diff --git a/tests/integration/standard/column_encryption/test_policies.py b/tests/integration/standard/column_encryption/test_policies.py index dea6b6d39e..8af27ab87a 100644 --- a/tests/integration/standard/column_encryption/test_policies.py +++ b/tests/integration/standard/column_encryption/test_policies.py @@ -25,6 +25,7 @@ def setup_module(): use_singledc() +@unittest.skip("Skip until https://github.com/scylladb/python-driver/issues/365 is sorted out") class ColumnEncryptionPolicyTest(unittest.TestCase): def _recreate_keyspace(self, session): diff --git a/tests/unit/column_encryption/test_policies.py b/tests/unit/column_encryption/test_policies.py index 38136c69d4..27e7c62ce7 100644 --- a/tests/unit/column_encryption/test_policies.py +++ b/tests/unit/column_encryption/test_policies.py @@ -19,6 +19,7 @@ from cassandra.column_encryption.policies import AES256ColumnEncryptionPolicy, \ AES256_BLOCK_SIZE_BYTES, AES256_KEY_SIZE_BYTES +@unittest.skip("Skip until https://github.com/scylladb/python-driver/issues/365 is sorted out") class AES256ColumnEncryptionPolicyTest(unittest.TestCase): def _random_block(self):