From 42adaa1efabed13a4b672a66fc6110a1eb82db8a Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 6 Jul 2022 05:23:36 -0600 Subject: [PATCH 01/25] Format control checkpoint --- clickhouse_connect/datatypes/base.py | 13 +- clickhouse_connect/datatypes/registry.py | 26 +++- clickhouse_connect/driver/native.py | 134 ++++++++++--------- clickhouse_connect/driver/transform.py | 57 ++++++++ tests/unit_tests/test_driver/test_formats.py | 11 ++ 5 files changed, 173 insertions(+), 68 deletions(-) create mode 100644 clickhouse_connect/driver/transform.py create mode 100644 tests/unit_tests/test_driver/test_formats.py diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index d6130e06..484de49e 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -1,4 +1,5 @@ import array +import threading from abc import abstractmethod, ABC from math import log from typing import NamedTuple, Dict, Type, Any, Sequence, MutableSequence, Optional, Union, Tuple @@ -15,7 +16,6 @@ class TypeDef(NamedTuple): wrappers: tuple = () keys: tuple = () values: tuple = () - format: str = None @property def arg_str(self): @@ -26,7 +26,7 @@ class ClickHouseType(ABC): """ Base class for all ClickHouseType objects. 
""" - __slots__ = 'nullable', 'low_card', 'wrappers', 'format', 'type_def', '__dict__' + __slots__ = 'nullable', 'low_card', 'wrappers', 'type_def', '__dict__' _ch_name = None _name_suffix = '' np_type = 'O' @@ -42,6 +42,13 @@ def __init_subclass__(cls, registered: bool = True): def build(cls: Type['ClickHouseType'], type_def: TypeDef): return cls(type_def) + @classmethod + def read_format(cls): + try: + return threading.local.ch_read_format(cls) + except AttributeError: + return 'native' + def __init__(self, type_def: TypeDef): """ Base class constructor that sets Nullable and LowCardinality wrappers and currently assigns the row_binary conversion @@ -51,8 +58,6 @@ def __init__(self, type_def: TypeDef): self.type_def = type_def self.wrappers = type_def.wrappers self.low_card = 'LowCardinality' in self.wrappers - if type_def.format: - self.format = type_def.format self.nullable = 'Nullable' in self.wrappers if self.nullable: self.from_row_binary = self._nullable_from_row_binary diff --git a/clickhouse_connect/datatypes/registry.py b/clickhouse_connect/datatypes/registry.py index a3b27bc4..3a11ffb3 100644 --- a/clickhouse_connect/datatypes/registry.py +++ b/clickhouse_connect/datatypes/registry.py @@ -1,8 +1,9 @@ import logging +import re -from typing import Tuple, Dict +from typing import Tuple, Dict, Sequence, Type, Optional from clickhouse_connect.datatypes.base import TypeDef, ClickHouseType, type_map -from clickhouse_connect.driver.exceptions import InternalError +from clickhouse_connect.driver.exceptions import InternalError, ProgrammingError from clickhouse_connect.driver.parser import parse_enum, parse_callable, parse_columns logger = logging.getLogger(__name__) @@ -57,3 +58,24 @@ def get_from_name(name: str) -> ClickHouseType: raise InternalError(err_str) from None type_cache[name] = ch_type return ch_type + + +def matching_types(fmt_map: Optional[Dict[str, str]]) -> Dict[Type[ClickHouseType], str]: + if not fmt_map: + return {} + matches = {} + for 
pattern, fmt in fmt_map.items(): + if '*' in pattern: + re_pattern = re.compile(pattern.replace('*', '.*'), re.IGNORECASE) + for type_name, ch_type in type_map.items(): + if re_pattern.match(type_name): + matches[ch_type] = fmt + else: + try: + matches[type_map[pattern]] = fmt + except KeyError: + pass + if not matches: + raise ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats') + return matches + diff --git a/clickhouse_connect/driver/native.py b/clickhouse_connect/driver/native.py index b166cd31..78ff5380 100644 --- a/clickhouse_connect/driver/native.py +++ b/clickhouse_connect/driver/native.py @@ -1,71 +1,81 @@ -from typing import Any, Sequence +import threading +from typing import Any, Sequence, Dict, Union from clickhouse_connect.datatypes import registry from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.common import read_leb128, read_leb128_str, write_leb128 from clickhouse_connect.driver.query import DataResult +from clickhouse_connect.driver.transform import DataTransform -# pylint: disable=too-many-locals -def parse_response(source: Sequence, use_none: bool = True) -> DataResult: - """ - Decodes the ClickHouse byte byte buffer response into rows of native Python data - :param source: A byte buffer or similar source - :param use_none: Use None values for ClickHouse NULLs (otherwise use zero/empty values) - :return: DataResult -- data matrix, column names, column types - """ - if not isinstance(source, memoryview): - source = memoryview(source) - loc = 0 - names = [] - col_types = [] - result = [] - total_size = len(source) - block = 0 - while loc < total_size: - result_block = [] - num_cols, loc = read_leb128(source, loc) - num_rows, loc = read_leb128(source, loc) - for col_num in range(num_cols): - name, loc = read_leb128_str(source, loc) - if block == 0: - names.append(name) - type_name, loc = read_leb128_str(source, loc) - if block == 0: - col_type = registry.get_from_name(type_name) 
- col_types.append(col_type) - else: - col_type = col_types[col_num] - column, loc = col_type.read_native_column(source, loc, num_rows, use_none=use_none) - result_block.append(column) - block += 1 - result.extend(list(zip(*result_block))) - return DataResult(result, tuple(names), tuple(col_types)) +class NativeTransform(DataTransform) + # pylint: disable=too-many-locals + def parse_response(self, source: Sequence, type_formats: Dict[str, str], + column_formats:Dict[str, Union[str, Dict[str, str]]]) -> DataResult: + """ + Decodes the ClickHouse byte byte buffer response into rows of native Python data + :param source: A byte buffer or similar source + :param column_formats: Use None values for ClickHouse NULLs (otherwise use zero/empty values) + :return: DataResult -- data matrix, column names, column types + """ + threading.local.ch_read_format = self.base_format.read_format + if not isinstance(source, memoryview): + source = memoryview(source) + loc = 0 + names = [] + col_types = [] + result = [] + total_size = len(source) + block = 0 + while loc < total_size: + result_block = [] + num_cols, loc = read_leb128(source, loc) + num_rows, loc = read_leb128(source, loc) + for col_num in range(num_cols): + name, loc = read_leb128_str(source, loc) + if block == 0: + names.append(name) + type_name, loc = read_leb128_str(source, loc) + if block == 0: + col_type = registry.get_from_name(type_name) + col_types.append(col_type) + else: + col_type = col_types[col_num] + col_fmt = column_formats.get(name, None) + if col_fmt: + if isinstance() + else: + self.base_format.read_overrides + column, loc = col_type.read_native_column(source, loc, num_rows) + result_block.append(column) + block += 1 + result.extend(list(zip(*result_block))) + return DataResult(result, tuple(names), tuple(col_types)) -def build_insert(data: Sequence[Sequence[Any]], *, column_names: Sequence[str], - column_type_names: Sequence[str] = None, - column_types: Sequence[ClickHouseType] = None, - 
column_oriented: bool = False): - """ - Encoding a dataset of Python sequences into native binary format - :param data: Matrix of rows and columns of data - :param column_names: Column names of the data to insert - :param column_type_names: Column type names of the data - :param column_types: Column types used to encode data in ClickHouse native format - :param column_oriented: If true the dataset does not need to be "pivoted" - :return: bytearray containing the dataset in ClickHouse native insert format - """ - if not column_types: - column_types = [registry.get_from_name(name) for name in column_type_names] - output = bytearray() - columns = data if column_oriented else tuple(zip(*data)) - write_leb128(len(columns), output) - write_leb128(len(columns[0]), output) - for col_name, col_type, column in zip(column_names, column_types, columns): - write_leb128(len(col_name), output) - output += col_name.encode() - write_leb128(len(col_type.name), output) - output += col_type.name.encode() - col_type.write_native_column(column, output) - return output + def build_insert(data: Sequence[Sequence[Any]], *, column_names: Sequence[str], + column_type_names: Sequence[str] = None, + column_types: Sequence[ClickHouseType] = None, + column_oriented: bool = False): + """ + Encoding a dataset of Python sequences into native binary format + :param data: Matrix of rows and columns of data + :param column_names: Column names of the data to insert + :param column_type_names: Column type names of the data + :param column_types: Column types used to encode data in ClickHouse native format + :param column_oriented: If true the dataset does not need to be "pivoted" + :return: bytearray containing the dataset in ClickHouse native insert format + """ + if not column_types: + column_types = [registry.get_from_name(name) for name in column_type_names] + output = bytearray() + columns = data if column_oriented else tuple(zip(*data)) + write_leb128(len(columns), output) + 
write_leb128(len(columns[0]), output) + for col_name, col_type, column in zip(column_names, column_types, columns): + write_leb128(len(col_name), output) + output += col_name.encode() + write_leb128(len(col_type.name), output) + output += col_type.name.encode() + col_type.write_native_column(column, output) + return output diff --git a/clickhouse_connect/driver/transform.py b/clickhouse_connect/driver/transform.py new file mode 100644 index 00000000..338a170c --- /dev/null +++ b/clickhouse_connect/driver/transform.py @@ -0,0 +1,57 @@ +from abc import ABC, abstractmethod +from typing import Sequence, Dict, Union, Type + +from clickhouse_connect.datatypes.base import ClickHouseType +from clickhouse_connect.datatypes.registry import matching_types +from clickhouse_connect.driver.query import DataResult + + +class FormatControl: + + def __init__(self, + default_formats: Dict[str, str] = None, + read_formats: Dict[str, str] = None, + write_formats: Dict[str, str] = None): + default_formats = matching_types(default_formats) + self.read_formats = default_formats.copy() + self.read_formats.update(matching_types(read_formats)) + self.write_formats = default_formats.copy() + self.write_formats.update(matching_types(write_formats)) + self.read_overrides = {} + self.write_overrides = {} + + def set_read_overrides(self, read_overrides: Dict[str, str]) -> None: + self.read_overrides = matching_types(read_overrides) + + def set_writes_overrides(self, write_overrides: Dict[str, str]) -> None: + self.write_overrides = matching_types(write_overrides) + + def read_format(self, ch_type: Type[ClickHouseType]) -> str: + return self.read_overrides.get(ch_type, self.read_formats.get(ch_type, 'native')) + + def write_format(self, ch_type: Type[ClickHouseType]) -> str: + return self.write_overrides.get(ch_type, self.write_formats.get(ch_type, 'native')) + + def clear_read_overrides(self): + self.read_overrides = {} + + def clear_write_override(self): + self.write_overrides = {} + + + +class 
QueryFormatter: + def __init__(self, + type_formats: Dict[str, str] = None, + column_formats: Dict[str, str] = None, + sub_column_formats: Dict[str, Dict[str, str]] = None): + pass + + +class DataTransform(ABC): + + def __init__(self, fmt_ctl: FormatControl): + self.base_format = fmt_ctl + + def parse_response(self, source: Sequence, type_formats: Dict[column_formats:Dict[str, Union[str, Dict[str, str]]]) -> DataResult: + pass diff --git a/tests/unit_tests/test_driver/test_formats.py b/tests/unit_tests/test_driver/test_formats.py new file mode 100644 index 00000000..1082326b --- /dev/null +++ b/tests/unit_tests/test_driver/test_formats.py @@ -0,0 +1,11 @@ +from clickhouse_connect.datatypes.network import IPv6 +from clickhouse_connect.datatypes.numeric import Int32 +from clickhouse_connect.datatypes.string import FixedString +from clickhouse_connect.driver.transform import FormatControl + + +def test_format_control(): + fmt_ctl = FormatControl(default_formats={'Int32': 'string'}, read_formats={'IP*': 'string'}) + assert fmt_ctl.read_format(IPv6) == 'string' + assert fmt_ctl.write_format(Int32) == 'string' + assert fmt_ctl.read_format(FixedString) == 'native' From 417cd96a5c521860ed8bd1913133190dd0094f76 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 7 Jul 2022 14:50:26 -0600 Subject: [PATCH 02/25] Checkpoint on data format handling --- clickhouse_connect/datatypes/base.py | 21 +++-- clickhouse_connect/datatypes/format.py | 66 ++++++++++++++ clickhouse_connect/datatypes/registry.py | 24 +---- clickhouse_connect/driver/httpclient.py | 14 ++- clickhouse_connect/driver/native.py | 36 ++------ clickhouse_connect/driver/rowbinary.py | 87 +++++++++--------- clickhouse_connect/driver/transform.py | 88 ++++++++++--------- tests/unit_tests/test_driver/test_formats.py | 13 +-- .../test_driver/test_native_fuzz.py | 12 +-- .../test_driver/test_native_read.py | 7 +- .../test_driver/test_native_write.py | 5 +- 11 files changed, 202 insertions(+), 171 deletions(-) create mode 
100644 clickhouse_connect/datatypes/format.py diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index 18e9b957..794c17a3 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -8,6 +8,9 @@ write_uint64, low_card_version from clickhouse_connect.driver.exceptions import NotSupportedError +ch_read_formats = {} +ch_write_formats = {} + class TypeDef(NamedTuple): """ @@ -44,10 +47,13 @@ def build(cls: Type['ClickHouseType'], type_def: TypeDef): @classmethod def read_format(cls): - try: - return threading.local.ch_read_format(cls) - except AttributeError: - return 'native' + overrides = getattr(threading.local, 'ch_column_overrides', None) + if overrides and cls in overrides: + return overrides[cls] + overrides = getattr(threading.local, 'ch_query_overrides)', None) + if overrides and cls in overrides: + return overrides[cls] + return ch_read_formats.get(cls, 'native') def __init__(self, type_def: TypeDef): """ @@ -120,8 +126,8 @@ def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none=T :param source: Native protocol binary read buffer :param loc: Moving location for the read buffer :param num_rows: Number of rows expected in the column - :param use_none: Use the Python None type for ClickHouse nulls. Otherwise use the empty or zero type. Allows support for - pandas data frames that do not support None + :param use_none: Use the Python None type for ClickHouse nulls. Otherwise use the empty or zero type. 
+ Allows support for pandas data frames that do not support None :return: The decoded column plust the updated location pointer """ if self.low_card: @@ -143,7 +149,8 @@ def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none=T # delegate binary operations to their elements # pylint: disable=no-self-use - def _read_native_binary(self, _source: Sequence, _loc: int, _num_rows: int) -> Tuple[Union[Sequence, MutableSequence], int]: + def _read_native_binary(self, _source: Sequence, _loc: int, _num_rows: int) \ + -> Tuple[Union[Sequence, MutableSequence], int]: """ Lowest level read method for ClickHouseType native data columns :param _source: Native protocol binary read buffer diff --git a/clickhouse_connect/datatypes/format.py b/clickhouse_connect/datatypes/format.py new file mode 100644 index 00000000..c27856f5 --- /dev/null +++ b/clickhouse_connect/datatypes/format.py @@ -0,0 +1,66 @@ +import re + +from typing import Dict, Type, Sequence + +from clickhouse_connect.datatypes.base import ClickHouseType, type_map, ch_read_formats, ch_write_formats +from clickhouse_connect.driver import ProgrammingError + + +def set_default_formats(*args, **kwargs): + fmt_map = format_map(_convert_arguments(*args, **kwargs)) + ch_read_formats.update(fmt_map) + ch_write_formats.update(fmt_map) + + +def clear_all_formats(): + ch_read_formats.clear() + ch_write_formats.clear() + + +def clear_default_format(pattern: str): + for ch_type in _matching_types(pattern): + ch_read_formats.pop(ch_type, None) + ch_write_formats.pop(ch_type, None) + + +def clear_write_format(pattern: str): + for ch_type in _matching_types(pattern): + ch_write_formats.pop(ch_type, None) + + +def clear_read_format(pattern: str): + for ch_type in _matching_types(pattern): + ch_read_formats.pop(ch_type, None) + + +def format_map(fmt_map: Dict[str, str]) -> Dict[Type[ClickHouseType], str]: + if not fmt_map: + return {} + final_map = {} + for pattern, fmt in fmt_map.items(): + matches = 
_matching_types(pattern) + if not matches: + raise ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats') + for ch_type in matches: + final_map[ch_type] = fmt + return final_map + + +def _convert_arguments(*args, **kwargs) -> Dict[str, str]: + fmt_map = {} + try: + for x in range(0, len(args), 2): + fmt_map[args[x]] = args[x + 1] + except (IndexError, TypeError, ValueError): + raise ProgrammingError('Invalid type/format arguments for format method') + fmt_map.update(kwargs) + return fmt_map + + +def _matching_types(pattern: str) -> Sequence[Type[ClickHouseType]]: + if '*' in pattern: + re_pattern = re.compile(pattern.replace('*', '.*'), re.IGNORECASE) + return [ch_type for type_name, ch_type in type_map.items() if re_pattern.match(type_name)] + if pattern in type_map: + return [type_map[pattern]] + return [] diff --git a/clickhouse_connect/datatypes/registry.py b/clickhouse_connect/datatypes/registry.py index 3a11ffb3..af897884 100644 --- a/clickhouse_connect/datatypes/registry.py +++ b/clickhouse_connect/datatypes/registry.py @@ -1,7 +1,6 @@ import logging -import re -from typing import Tuple, Dict, Sequence, Type, Optional +from typing import Tuple, Dict, Type, Optional from clickhouse_connect.datatypes.base import TypeDef, ClickHouseType, type_map from clickhouse_connect.driver.exceptions import InternalError, ProgrammingError from clickhouse_connect.driver.parser import parse_enum, parse_callable, parse_columns @@ -58,24 +57,3 @@ def get_from_name(name: str) -> ClickHouseType: raise InternalError(err_str) from None type_cache[name] = ch_type return ch_type - - -def matching_types(fmt_map: Optional[Dict[str, str]]) -> Dict[Type[ClickHouseType], str]: - if not fmt_map: - return {} - matches = {} - for pattern, fmt in fmt_map.items(): - if '*' in pattern: - re_pattern = re.compile(pattern.replace('*', '.*'), re.IGNORECASE) - for type_name, ch_type in type_map.items(): - if re_pattern.match(type_name): - matches[ch_type] = fmt - else: - 
try: - matches[type_map[pattern]] = fmt - except KeyError: - pass - if not matches: - raise ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats') - return matches - diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index 14224d4f..d7aaec41 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -9,13 +9,13 @@ from requests.exceptions import RequestException from clickhouse_connect.datatypes import registry -from clickhouse_connect.driver import native -from clickhouse_connect.driver import rowbinary from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.client import Client from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError, ProgrammingError from clickhouse_connect.driver.httpadapter import KeepAliveAdapter +from clickhouse_connect.driver.native import NativeTransform from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value +from clickhouse_connect.driver.rowbinary import RowBinaryTransform logger = logging.getLogger(__name__) columns_only_re = re.compile(r'LIMIT 0\s*$', re.IGNORECASE) @@ -110,15 +110,13 @@ def __init__(self, if data_format == 'native': self.read_format = self.write_format = 'Native' - self.build_insert = native.build_insert - self.parse_response = native.parse_response self.column_inserts = True + self.transform = NativeTransform() elif data_format in ('row_binary', 'rb'): self.read_format = 'RowBinaryWithNamesAndTypes' self.write_format = 'RowBinary' - self.build_insert = rowbinary.build_insert - self.parse_response = rowbinary.parse_response self.column_inserts = False + self.transform = RowBinaryTransform() self.session = session self.connect_timeout = connect_timeout self.read_timeout = send_receive_timeout @@ -169,7 +167,7 @@ def query(self, query: str, data_result = DataResult([], tuple(names), tuple(types)) else: response = 
self._raw_request(self._format_query(final_query), params, headers, retries=2) - data_result = self.parse_response(response.content, use_none) + data_result = self.transform.parse_response(response.content, use_none=use_none) summary = {} if 'X-ClickHouse-Summary' in response.headers: try: @@ -193,7 +191,7 @@ def data_insert(self, params = {'query': f"INSERT INTO {table} ({', '.join(column_names)}) FORMAT {self.write_format}", 'database': self.database} params.update(self._validate_settings(settings, True)) - insert_block = self.build_insert(data, column_types=column_types, column_names=column_names, + insert_block = self.transform.build_insert(data, column_types=column_types, column_names=column_names, column_oriented=column_oriented) response = self._raw_request(insert_block, params, headers) logger.debug('Insert response code: %d, content: %s', response.status_code, response.content) diff --git a/clickhouse_connect/driver/native.py b/clickhouse_connect/driver/native.py index 78ff5380..3c738350 100644 --- a/clickhouse_connect/driver/native.py +++ b/clickhouse_connect/driver/native.py @@ -1,24 +1,15 @@ -import threading -from typing import Any, Sequence, Dict, Union +from typing import Any, Sequence from clickhouse_connect.datatypes import registry from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.common import read_leb128, read_leb128_str, write_leb128 from clickhouse_connect.driver.query import DataResult -from clickhouse_connect.driver.transform import DataTransform +from clickhouse_connect.driver.transform import DataTransform, QueryContext -class NativeTransform(DataTransform) +class NativeTransform(DataTransform): # pylint: disable=too-many-locals - def parse_response(self, source: Sequence, type_formats: Dict[str, str], - column_formats:Dict[str, Union[str, Dict[str, str]]]) -> DataResult: - """ - Decodes the ClickHouse byte byte buffer response into rows of native Python data - :param source: A byte buffer or 
similar source - :param column_formats: Use None values for ClickHouse NULLs (otherwise use zero/empty values) - :return: DataResult -- data matrix, column names, column types - """ - threading.local.ch_read_format = self.base_format.read_format + def _transform_response(self, source: Sequence, context: QueryContext) -> DataResult: if not isinstance(source, memoryview): source = memoryview(source) loc = 0 @@ -41,31 +32,16 @@ def parse_response(self, source: Sequence, type_formats: Dict[str, str], col_types.append(col_type) else: col_type = col_types[col_num] - col_fmt = column_formats.get(name, None) - if col_fmt: - if isinstance() - else: - self.base_format.read_overrides - column, loc = col_type.read_native_column(source, loc, num_rows) + column, loc = col_type.read_native_column(source, loc, num_rows, use_none=context.use_none) result_block.append(column) block += 1 result.extend(list(zip(*result_block))) return DataResult(result, tuple(names), tuple(col_types)) - - def build_insert(data: Sequence[Sequence[Any]], *, column_names: Sequence[str], + def build_insert(self, data: Sequence[Sequence[Any]], *, column_names: Sequence[str], column_type_names: Sequence[str] = None, column_types: Sequence[ClickHouseType] = None, column_oriented: bool = False): - """ - Encoding a dataset of Python sequences into native binary format - :param data: Matrix of rows and columns of data - :param column_names: Column names of the data to insert - :param column_type_names: Column type names of the data - :param column_types: Column types used to encode data in ClickHouse native format - :param column_oriented: If true the dataset does not need to be "pivoted" - :return: bytearray containing the dataset in ClickHouse native insert format - """ if not column_types: column_types = [registry.get_from_name(name) for name in column_type_names] output = bytearray() diff --git a/clickhouse_connect/driver/rowbinary.py b/clickhouse_connect/driver/rowbinary.py index 701a8b9f..ef0cc65d 100644 
--- a/clickhouse_connect/driver/rowbinary.py +++ b/clickhouse_connect/driver/rowbinary.py @@ -1,60 +1,55 @@ import logging -from typing import Any, Sequence, Union +from typing import Any, Sequence from clickhouse_connect.datatypes import registry from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.common import read_leb128, read_leb128_str from clickhouse_connect.driver.exceptions import InterfaceError from clickhouse_connect.driver.query import DataResult +from clickhouse_connect.driver.transform import DataTransform, QueryContext logger = logging.getLogger(__name__) -def parse_response(source: Union[bytes, bytearray, memoryview], _use_none: bool = True) -> DataResult: - """ - Decodes the ClickHouse rowbinary format byte buffer response into rows of native Python data - :param source: A byte buffer or similar source - :param _use_none: Use None values for ClickHouse NULLs -- Using defaults is not supported by rowbinary, - so conversion of nulls for certain outputs (pandas/numpy arrays) will fail - :return: DataResult -- data matrix, column names, column types - """ - if not isinstance(source, memoryview): - source = memoryview(source) - response_size = len(source) - loc = 0 - num_columns, loc = read_leb128(source, loc) - names = [] - for _ in range(num_columns): - name, loc = read_leb128_str(source, loc) - names.append(name) - col_types = [] - for _ in range(num_columns): - col_type, loc = read_leb128_str(source, loc) - try: - col_types.append(registry.get_from_name(col_type)) - except KeyError: - raise InterfaceError(f'Unknown ClickHouse type returned for type {col_type}') from None - convs = tuple(t.from_row_binary for t in col_types) - result = [] - while loc < response_size: - row = [] - for conv in convs: - v, loc = conv(source, loc) - row.append(v) - result.append(row) - return DataResult(result, tuple(names), tuple(col_types)) +class RowBinaryTransform(DataTransform): + def _transform_response(self, source: Sequence, 
context: QueryContext) -> DataResult: + if not isinstance(source, memoryview): + source = memoryview(source) + response_size = len(source) + loc = 0 + num_columns, loc = read_leb128(source, loc) + names = [] + for _ in range(num_columns): + name, loc = read_leb128_str(source, loc) + names.append(name) + col_types = [] + for _ in range(num_columns): + col_type, loc = read_leb128_str(source, loc) + try: + col_types.append(registry.get_from_name(col_type)) + except KeyError: + raise InterfaceError(f'Unknown ClickHouse type returned for type {col_type}') from None + convs = tuple(t.from_row_binary for t in col_types) + result = [] + while loc < response_size: + row = [] + for conv in convs: + v, loc = conv(source, loc) + row.append(v) + result.append(row) + return DataResult(result, tuple(names), tuple(col_types)) -def build_insert(data: Sequence[Sequence[Any]], *, column_type_names: Sequence[str] = None, - column_types: Sequence[ClickHouseType] = None, column_oriented: bool = False, **_): - if not column_types: - column_types = [registry.get_from_name(name) for name in column_type_names] - convs = tuple(t.to_row_binary for t in column_types) - if column_oriented: - data = tuple(zip(*data)) - output = bytearray() - for row in data: - for (value, conv) in zip(row, convs): - conv(value, output) - return output + def build_insert(self, data: Sequence[Sequence[Any]], *, column_type_names: Sequence[str] = None, + column_types: Sequence[ClickHouseType] = None, column_oriented: bool = False, **_): + if not column_types: + column_types = [registry.get_from_name(name) for name in column_type_names] + convs = tuple(t.to_row_binary for t in column_types) + if column_oriented: + data = tuple(zip(*data)) + output = bytearray() + for row in data: + for (value, conv) in zip(row, convs): + conv(value, output) + return output diff --git a/clickhouse_connect/driver/transform.py b/clickhouse_connect/driver/transform.py index 338a170c..85cf79ae 100644 --- 
a/clickhouse_connect/driver/transform.py +++ b/clickhouse_connect/driver/transform.py @@ -1,57 +1,59 @@ +import threading from abc import ABC, abstractmethod -from typing import Sequence, Dict, Union, Type +from typing import Sequence, Dict, Union, Any, Optional from clickhouse_connect.datatypes.base import ClickHouseType -from clickhouse_connect.datatypes.registry import matching_types +from clickhouse_connect.datatypes.format import format_map from clickhouse_connect.driver.query import DataResult -class FormatControl: +class QueryContext: + def __init__(self, use_none: bool, type_formats: Optional[Dict[str, str]], + column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]]): + self.query_overrides = format_map(type_formats) + self.use_none = use_none - def __init__(self, - default_formats: Dict[str, str] = None, - read_formats: Dict[str, str] = None, - write_formats: Dict[str, str] = None): - default_formats = matching_types(default_formats) - self.read_formats = default_formats.copy() - self.read_formats.update(matching_types(read_formats)) - self.write_formats = default_formats.copy() - self.write_formats.update(matching_types(write_formats)) - self.read_overrides = {} - self.write_overrides = {} + def __enter__(self): + if self.query_overrides: + threading.local.ch_query_overrides = self.query_overrides + return self - def set_read_overrides(self, read_overrides: Dict[str, str]) -> None: - self.read_overrides = matching_types(read_overrides) - - def set_writes_overrides(self, write_overrides: Dict[str, str]) -> None: - self.write_overrides = matching_types(write_overrides) - - def read_format(self, ch_type: Type[ClickHouseType]) -> str: - return self.read_overrides.get(ch_type, self.read_formats.get(ch_type, 'native')) - - def write_format(self, ch_type: Type[ClickHouseType]) -> str: - return self.write_overrides.get(ch_type, self.write_formats.get(ch_type, 'native')) - - def clear_read_overrides(self): - self.read_overrides = {} - - def 
clear_write_override(self): - self.write_overrides = {} - - - -class QueryFormatter: - def __init__(self, - type_formats: Dict[str, str] = None, - column_formats: Dict[str, str] = None, - sub_column_formats: Dict[str, Dict[str, str]] = None): - pass + def __exit__(self, exc_type, exc_val, exc_tb): + if self.query_overrides: + del threading.local.ch_query_overrides class DataTransform(ABC): - def __init__(self, fmt_ctl: FormatControl): - self.base_format = fmt_ctl + def parse_response(self, source: Sequence, type_formats: Dict[str, str] = None, use_none: bool = True, + column_formats: Dict[str, Union[str, Dict[str, str]]] = None) -> DataResult: + """ + Decodes the ClickHouse byte buffer response into rows of native Python data + :param source: A byte buffer or similar source + :param use_none: Use None python value for ClickHouse nulls (otherwise use type "zero value") + :param type_formats: Dictionary of ClickHouse type names/patterns and response formats + :param column_formats: Use None values for ClickHouse NULLs (otherwise use zero/empty values) + :return: DataResult -- data matrix, column names, column types + """ + with QueryContext(use_none, type_formats, column_formats) as query_context: + return self._transform_response(source, query_context) + + @abstractmethod + def build_insert(self, data: Sequence[Sequence[Any]], *, column_names: Sequence[str], + column_type_names: Sequence[str] = None, + column_types: Sequence[ClickHouseType] = None, + column_oriented: bool = False): + """ + Encodes a dataset of Python sequences into a ClickHouse format + :param data: Matrix of rows and columns of data + :param column_names: Column names of the data to insert + :param column_type_names: Column type names of the data + :param column_types: Column types used to encode data in ClickHouse native format + :param column_oriented: If true the dataset does not need to be "pivoted" + :return: bytearray containing the dataset in the appropriate format + """ + pass - def 
parse_response(self, source: Sequence, type_formats: Dict[column_formats:Dict[str, Union[str, Dict[str, str]]]) -> DataResult: + @abstractmethod + def _transform_response(self, source: Sequence, context: QueryContext) -> DataResult: pass diff --git a/tests/unit_tests/test_driver/test_formats.py b/tests/unit_tests/test_driver/test_formats.py index 1082326b..a03c8156 100644 --- a/tests/unit_tests/test_driver/test_formats.py +++ b/tests/unit_tests/test_driver/test_formats.py @@ -1,11 +1,12 @@ +from clickhouse_connect.datatypes.format import clear_all_formats, set_default_formats from clickhouse_connect.datatypes.network import IPv6 from clickhouse_connect.datatypes.numeric import Int32 from clickhouse_connect.datatypes.string import FixedString -from clickhouse_connect.driver.transform import FormatControl -def test_format_control(): - fmt_ctl = FormatControl(default_formats={'Int32': 'string'}, read_formats={'IP*': 'string'}) - assert fmt_ctl.read_format(IPv6) == 'string' - assert fmt_ctl.write_format(Int32) == 'string' - assert fmt_ctl.read_format(FixedString) == 'native' +def test_default_formats(): + clear_all_formats() + set_default_formats('Int32', 'string', 'IP*', 'string') + assert IPv6.read_format() == 'string' + assert Int32.read_format() == 'string' + assert FixedString.read_format() == 'native' diff --git a/tests/unit_tests/test_driver/test_native_fuzz.py b/tests/unit_tests/test_driver/test_native_fuzz.py index 6670db30..157f9019 100644 --- a/tests/unit_tests/test_driver/test_native_fuzz.py +++ b/tests/unit_tests/test_driver/test_native_fuzz.py @@ -1,13 +1,15 @@ import random from clickhouse_connect.datatypes.registry import get_from_name -from clickhouse_connect.driver.native import build_insert, parse_response +from clickhouse_connect.driver.native import NativeTransform from tests.helpers import random_columns, random_data TEST_RUNS = 200 TEST_COLUMNS = 12 MAX_DATA_ROWS = 100 +transform = NativeTransform() + # pylint: disable=duplicate-code def 
test_native_round_trips(): @@ -18,8 +20,8 @@ def test_native_round_trips(): col_names = ('row_id',) + col_names col_types = (get_from_name('UInt32'),) + col_types assert len(data) == data_rows - output = build_insert(data, column_names=col_names, column_types=col_types) - data_result = parse_response(output) + output = transform.build_insert(data, column_names=col_names, column_types=col_types) + data_result = transform.parse_response(output) assert data_result.column_names == col_names assert data_result.column_types == col_types dataset = data_result.result @@ -34,8 +36,8 @@ def test_native_small(): data = random_data(col_types, 2) col_names = ('row_id',) + col_names col_types = (get_from_name('UInt32'),) + col_types - output = build_insert(data, column_names=col_names, column_types=col_types) - data_result = parse_response(output) + output = transform.build_insert(data, column_names=col_names, column_types=col_types) + data_result = transform.parse_response(output) assert data_result.column_names == col_names assert data_result.column_types == col_types assert data_result.result == data diff --git a/tests/unit_tests/test_driver/test_native_read.py b/tests/unit_tests/test_driver/test_native_read.py index 720dec45..3a243a4d 100644 --- a/tests/unit_tests/test_driver/test_native_read.py +++ b/tests/unit_tests/test_driver/test_native_read.py @@ -2,7 +2,7 @@ from uuid import UUID from clickhouse_connect.datatypes import registry -from clickhouse_connect.driver.native import parse_response +from clickhouse_connect.driver.native import NativeTransform from tests.helpers import to_bytes from tests.unit_tests.test_driver.binary import NESTED_BINARY @@ -54,6 +54,9 @@ """ +parse_response = NativeTransform().parse_response + + def check_result(result, expected, row_num=0, col_num=0): result_set = result[0] row = result_set[row_num] @@ -93,5 +96,5 @@ def test_ip(): def test_nested(): - result = parse_response(to_bytes(NESTED_BINARY)) + result = parse_response 
(to_bytes(NESTED_BINARY)) check_result(result, [{'str1': 'one', 'int32': 5}, {'str1': 'two', 'int32': 55}], 2, 0) diff --git a/tests/unit_tests/test_driver/test_native_write.py b/tests/unit_tests/test_driver/test_native_write.py index 2242a3d9..3ee25684 100644 --- a/tests/unit_tests/test_driver/test_native_write.py +++ b/tests/unit_tests/test_driver/test_native_write.py @@ -1,5 +1,5 @@ from clickhouse_connect.datatypes.registry import get_from_name -from clickhouse_connect.driver.native import build_insert +from clickhouse_connect.driver.native import NativeTransform from tests.helpers import to_bytes from tests.unit_tests.test_driver.binary import NESTED_BINARY @@ -29,6 +29,9 @@ """ +build_insert = NativeTransform().build_insert + + def test_low_card_null(): data = [['three']] names = ['value'] From b882a66fb41549f45aee327ee17031807f224720 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 7 Jul 2022 14:59:17 -0600 Subject: [PATCH 03/25] Lint fixes --- clickhouse_connect/datatypes/format.py | 6 +++--- clickhouse_connect/datatypes/registry.py | 2 +- clickhouse_connect/driver/transform.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/clickhouse_connect/datatypes/format.py b/clickhouse_connect/datatypes/format.py index c27856f5..5b198e2c 100644 --- a/clickhouse_connect/datatypes/format.py +++ b/clickhouse_connect/datatypes/format.py @@ -3,7 +3,7 @@ from typing import Dict, Type, Sequence from clickhouse_connect.datatypes.base import ClickHouseType, type_map, ch_read_formats, ch_write_formats -from clickhouse_connect.driver import ProgrammingError +from clickhouse_connect.driver.exceptions import ProgrammingError def set_default_formats(*args, **kwargs): @@ -51,8 +51,8 @@ def _convert_arguments(*args, **kwargs) -> Dict[str, str]: try: for x in range(0, len(args), 2): fmt_map[args[x]] = args[x + 1] - except (IndexError, TypeError, ValueError): - raise ProgrammingError('Invalid type/format arguments for format method') + except (IndexError, 
TypeError, ValueError) as ex: + raise ProgrammingError('Invalid type/format arguments for format method') from ex fmt_map.update(kwargs) return fmt_map diff --git a/clickhouse_connect/datatypes/registry.py b/clickhouse_connect/datatypes/registry.py index af897884..1ab80e03 100644 --- a/clickhouse_connect/datatypes/registry.py +++ b/clickhouse_connect/datatypes/registry.py @@ -1,6 +1,6 @@ import logging -from typing import Tuple, Dict, Type, Optional +from typing import Tuple, Dict from clickhouse_connect.datatypes.base import TypeDef, ClickHouseType, type_map from clickhouse_connect.driver.exceptions import InternalError, ProgrammingError from clickhouse_connect.driver.parser import parse_enum, parse_callable, parse_columns diff --git a/clickhouse_connect/driver/transform.py b/clickhouse_connect/driver/transform.py index 85cf79ae..16f2250a 100644 --- a/clickhouse_connect/driver/transform.py +++ b/clickhouse_connect/driver/transform.py @@ -9,7 +9,7 @@ class QueryContext: def __init__(self, use_none: bool, type_formats: Optional[Dict[str, str]], - column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]]): + _column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]]): self.query_overrides = format_map(type_formats) self.use_none = use_none @@ -52,7 +52,6 @@ def build_insert(self, data: Sequence[Sequence[Any]], *, column_names: Sequence[ :param column_oriented: If true the dataset does not need to be "pivoted" :return: bytearray containing the dataset in the appropriate format """ - pass @abstractmethod def _transform_response(self, source: Sequence, context: QueryContext) -> DataResult: From a5387db94ca66ba54e81b1f68d8111436aab7951 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 7 Jul 2022 15:02:44 -0600 Subject: [PATCH 04/25] Lint fixes --- clickhouse_connect/datatypes/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clickhouse_connect/datatypes/registry.py b/clickhouse_connect/datatypes/registry.py index 
1ab80e03..a3b27bc4 100644 --- a/clickhouse_connect/datatypes/registry.py +++ b/clickhouse_connect/datatypes/registry.py @@ -2,7 +2,7 @@ from typing import Tuple, Dict from clickhouse_connect.datatypes.base import TypeDef, ClickHouseType, type_map -from clickhouse_connect.driver.exceptions import InternalError, ProgrammingError +from clickhouse_connect.driver.exceptions import InternalError from clickhouse_connect.driver.parser import parse_enum, parse_callable, parse_columns logger = logging.getLogger(__name__) From 6a498092f3bac1415f3656b3ade8845137bbabdf Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Sun, 10 Jul 2022 16:06:08 -0600 Subject: [PATCH 05/25] Format control checkpoint --- clickhouse_connect/cc_superset/datatypes.py | 10 ++-- clickhouse_connect/datatypes/__init__.py | 46 ---------------- clickhouse_connect/datatypes/base.py | 13 +++-- clickhouse_connect/datatypes/network.py | 54 +++++++++---------- clickhouse_connect/datatypes/numeric.py | 21 +++----- clickhouse_connect/datatypes/special.py | 7 +-- clickhouse_connect/datatypes/string.py | 16 +++--- tests/conftest.py | 8 +++ tests/integration_tests/docker-compose.yml | 2 +- tests/integration_tests/test_native_fuzz.py | 2 + .../test_sqlalchemy/test_ddl.py | 4 +- tests/unit_tests/test_driver/test_formats.py | 3 +- 12 files changed, 70 insertions(+), 116 deletions(-) diff --git a/clickhouse_connect/cc_superset/datatypes.py b/clickhouse_connect/cc_superset/datatypes.py index b2e7e3de..82b38727 100644 --- a/clickhouse_connect/cc_superset/datatypes.py +++ b/clickhouse_connect/cc_superset/datatypes.py @@ -2,7 +2,7 @@ from superset.utils.core import GenericDataType from clickhouse_connect.cc_sqlalchemy.datatypes.base import sqla_type_map -from clickhouse_connect.datatypes import fixed_string_format, uint64_format, ip_format, uuid_format +from clickhouse_connect.datatypes.format import set_default_formats type_mapping = ( (r'^(FLOAT|DECIMAL|INT|UINT)', GenericDataType.NUMERIC), @@ -16,10 +16,10 @@ def 
configure_types(): Monkey patch the Superset generic_type onto the clickhouse type, also set defaults for certain type formatting to be better compatible with superset """ - fixed_string_format('string', 'utf8') - uint64_format('signed') - ip_format('string') - uuid_format('string') + set_default_formats(FixedString='string', + IPv4='string', + UInt64='signed', + UUID='string') compiled = [(re.compile(pattern, re.IGNORECASE), gen_type) for pattern, gen_type in type_mapping] for name, sqla_type in sqla_type_map.items(): for pattern, gen_type in compiled: diff --git a/clickhouse_connect/datatypes/__init__.py b/clickhouse_connect/datatypes/__init__.py index b7896ba5..b9006515 100644 --- a/clickhouse_connect/datatypes/__init__.py +++ b/clickhouse_connect/datatypes/__init__.py @@ -8,7 +8,6 @@ import clickhouse_connect.datatypes.temporal import clickhouse_connect.datatypes.registry -from clickhouse_connect.driver.exceptions import ProgrammingError logger = logging.getLogger(__name__) @@ -22,48 +21,3 @@ except ImportError: logger.warning('Unable to connect optimized C driver functions, falling back to pure Python', exc_info=True) - -def fixed_string_format(fmt: str, encoding: str = 'utf8'): - if fmt == 'string': - dt_string.FixedString.format = 'string' - dt_string.FixedString.encoding = encoding - elif fmt == 'bytes': - dt_string.FixedString.format = 'bytes' - dt_string.FixedString.encoding = 'utf8' - else: - raise ProgrammingError(f'Unrecognized fixed string default format {fmt}') - - -def big_int_format(fmt: str): - if fmt in ('string', 'int'): - dt_numeric.BigInt.format = fmt - else: - raise ProgrammingError(f'Unrecognized Big Integer default format {fmt}') - - -def uint64_format(fmt: str): - if fmt == 'unsigned': - dt_numeric.UInt64.format = 'unsigned' - dt_numeric.UInt64._array_type = 'Q' - dt_numeric.UInt64.np_format = 'u8' - elif fmt == 'signed': - dt_numeric.UInt64.format = 'signed' - dt_numeric.UInt64._array_type = 'q' - dt_numeric.UInt64.np_format = 'i8' - 
else: - raise ProgrammingError(f'Unrecognized UInt64 default format {fmt}') - - -def uuid_format(fmt: str): - if fmt in ('uuid', 'string'): - dt_special.UUID.format = fmt - else: - raise ProgrammingError(f'Unrecognized UUID default format {fmt}') - - -def ip_format(fmt: str): - if fmt in ('string', 'ip'): - dt_network.IPv4.format = fmt - dt_network.IPv6.format = fmt - else: - raise ProgrammingError(f'Unrecognized IPv4/IPv6 default format {fmt}') diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index 794c17a3..9a57d396 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -1,5 +1,7 @@ import array import threading +import logging + from abc import abstractmethod, ABC from math import log from typing import NamedTuple, Dict, Type, Any, Sequence, MutableSequence, Optional, Union, Tuple @@ -8,6 +10,7 @@ write_uint64, low_card_version from clickhouse_connect.driver.exceptions import NotSupportedError +logger = logging.getLogger(__name__) ch_read_formats = {} ch_write_formats = {} @@ -97,14 +100,15 @@ def write_native_prefix(self, dest: MutableSequence): def read_native_prefix(self, source: Sequence, loc: int): """ - Read the low cardinality version. Like the write, this has to happen immediately for container classes + Read the low cardinality version. 
Like the write method, this has to happen immediately for container classes :param source: The native protocol binary read buffer :param loc: Moving location pointer for the read buffer :return: updated read pointer """ if self.low_card: v, loc = read_uint64(source, loc) - assert v == low_card_version + if v != low_card_version: + logger.warning(f'Unexpected low cardinality version {v} reading type {self.name}') return loc def read_native_column(self, source: Sequence, loc: int, num_rows: int, **kwargs) -> Tuple[Sequence, int]: @@ -296,7 +300,7 @@ def __init_subclass__(cls, registered: bool = True): super().__init_subclass__(registered) if cls._array_type in ('i', 'I') and int_size == 2: cls._array_type = 'L' if cls._array_type.isupper() else 'l' - if cls._array_type: + if isinstance(cls._array_type, str) and cls._array_type: cls._struct_type = '<' + cls._array_type def _read_native_binary(self, source: Sequence, loc: int, num_rows: int): @@ -318,7 +322,8 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M class UnsupportedType(ClickHouseType, ABC, registered=False): """ - Base class for ClickHouse types that can't be serialized/deserialized into Python types. Mostly useful just for DDL statements + Base class for ClickHouse types that can't be serialized/deserialized into Python types. 
+ Mostly useful just for DDL statements """ def __init__(self, type_def: TypeDef): super().__init__(type_def) diff --git a/clickhouse_connect/datatypes/network.py b/clickhouse_connect/datatypes/network.py index 72ff31c8..b485e133 100644 --- a/clickhouse_connect/datatypes/network.py +++ b/clickhouse_connect/datatypes/network.py @@ -2,30 +2,29 @@ from ipaddress import IPv4Address, IPv6Address from typing import Union, MutableSequence, Sequence -from clickhouse_connect.datatypes.base import ArrayType, ClickHouseType, TypeDef +from clickhouse_connect.datatypes.base import ArrayType, ClickHouseType from clickhouse_connect.driver.common import write_array, array_column -from clickhouse_connect.driver.exceptions import ProgrammingError IPV4_V6_MASK = b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff' V6_NULL = bytes(b'\x00' * 16) +V4_NULL = IPv4Address(0) # pylint: disable=protected-access class IPv4(ArrayType): _array_type = 'I' - python_null = IPv4Address(0) - format = 'ip' - - def __init__(self, type_def: TypeDef): - super().__init__(type_def) - if self.format == 'string': - self.python_type = str - self.np_type = 'U' - elif self.format == 'ip': - self.python_type = IPv4Address - self.np_type = 'O' - else: - raise ProgrammingError('Unrecognized output format for IP4 type') + + @property + def python_type(self): + return str if self.read_format() == 'string' else IPv4Address + + @property + def np_type(self): + return 'U' if self.read_format() == 'string' else 'O' + + @property + def python_null(self): + return '' if self.read_format() == 'string' else V4_NULL def _from_row_binary(self, source: bytes, loc: int): ipv4 = IPv4Address.__new__(IPv4Address) @@ -41,7 +40,7 @@ def _to_row_binary(self, value: [int, IPv4Address, str], dest: bytearray): dest += value.to_bytes(4, 'little') def _read_native_binary(self, source: Sequence, loc: int, num_rows: int): - if self.format == 'string': + if self.read_format() == 'string': return self._from_native_str(source, loc, num_rows) 
return self._from_native_ip(source, loc, num_rows) @@ -76,19 +75,14 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M # pylint: disable=protected-access class IPv6(ClickHouseType): - python_null = IPv6Address(0) - format = 'ip' - - def __init__(self, type_def: TypeDef): - super().__init__(type_def) - if self.format == 'string': - self.python_type = str - self.np_type = 'U' - elif self.format == 'ip': - self.python_type = IPv6Address - self.np_type = 'O' - else: - raise ProgrammingError('Unrecognized output format for IP6 type') + + @property + def python_type(self): + return str if self.read_format() == 'string' else IPv6Address + + @property + def python_null(self): + return '' if self.read_format() == 'string' else V6_NULL def _from_row_binary(self, source: Sequence, loc: int): end = loc + 16 @@ -116,7 +110,7 @@ def _to_row_binary(self, value: Union[str, IPv4Address, IPv6Address, bytes, byte dest += value def _read_native_binary(self, source: Sequence, loc: int, num_rows: int): - if self.format == 'string': + if self.read_format() == 'string': return self._read_native_str(source, loc, num_rows) return self._read_native_ip(source, loc, num_rows) diff --git a/clickhouse_connect/datatypes/numeric.py b/clickhouse_connect/datatypes/numeric.py index 73de49d4..1e65edf2 100644 --- a/clickhouse_connect/datatypes/numeric.py +++ b/clickhouse_connect/datatypes/numeric.py @@ -88,18 +88,14 @@ def _to_row_binary(self, value: int, dest: MutableSequence): class UInt64(ArrayType): - _array_type = 'Q' - np_type = 'u8' - format = 'unsigned' - def __init__(self, type_def: TypeDef): - super().__init__(type_def) - if self.format == 'unsigned': - self._array_type = 'Q' - self.np_type = 'u8' - else: - self._array_type = 'q' - self.np_type = 'i8' + @property + def _array_type(self): + return 'Q' if self.read_format() == 'unsigned' else 'q' + + @property + def np_type(self): + return 'u8' if self.read_format() == 'unsigned' else 'q' def 
_from_row_binary(self, source: Sequence, loc: int): return suf(' Date: Sun, 10 Jul 2022 16:37:49 -0600 Subject: [PATCH 06/25] Fix lint --- clickhouse_connect/datatypes/base.py | 12 +++++++++++- clickhouse_connect/datatypes/string.py | 11 ++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index 9a57d396..3b8fbfd3 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -58,6 +58,16 @@ def read_format(cls): return overrides[cls] return ch_read_formats.get(cls, 'native') + @classmethod + def write_format(cls): + overrides = getattr(threading.local, 'ch_column_overrides', None) + if overrides and cls in overrides: + return overrides[cls] + overrides = getattr(threading.local, 'ch_write_overrides)', None) + if overrides and cls in overrides: + return overrides[cls] + return ch_write_formats.get(cls, 'native') + def __init__(self, type_def: TypeDef): """ Base class constructor that sets Nullable and LowCardinality wrappers and currently assigns the row_binary conversion @@ -108,7 +118,7 @@ def read_native_prefix(self, source: Sequence, loc: int): if self.low_card: v, loc = read_uint64(source, loc) if v != low_card_version: - logger.warning(f'Unexpected low cardinality version {v} reading type {self.name}') + logger.warning(f'Unexpected low cardinality version %d reading type %s', v, self.name) return loc def read_native_column(self, source: Sequence, loc: int, num_rows: int, **kwargs) -> Tuple[Sequence, int]: diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py index c6ec1151..3ad823bf 100644 --- a/clickhouse_connect/datatypes/string.py +++ b/clickhouse_connect/datatypes/string.py @@ -88,22 +88,23 @@ def __init__(self, type_def: TypeDef): pass self._name_suffix = type_def.arg_str self._empty_bytes = bytes(b'\x00' * self._byte_size) + self.to_row_binary = self._to_rb_internal @property def 
python_null(self): return self._empty_bytes if self.read_format() == 'bytes' else '' - @property - def _to_row_binary(self): - return self._to_row_binary_bytes if self.read_format() == 'bytes' else self._to_row_binary_str - def _from_row_binary(self, source: Sequence, loc: int): return bytes(source[loc:loc + self._byte_size]), loc + self._byte_size @staticmethod - def _to_row_binary_bytes(value: Sequence, dest: MutableSequence): + def _to_row_binary(value: Sequence, dest: MutableSequence): dest += value + @property + def _to_rb_internal(self): + return self._to_row_binary_str if self.write_format() == 'string' else self._to_row_binary + def _to_row_binary_str(self, value, dest: bytearray): value = str.encode(value, self.encoding) dest += value From 3a5ae3b6db0f0563fb3a9e3b93defb986ae62648 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Sun, 10 Jul 2022 16:44:39 -0600 Subject: [PATCH 07/25] Fix lint --- clickhouse_connect/datatypes/__init__.py | 1 - clickhouse_connect/datatypes/base.py | 2 +- clickhouse_connect/datatypes/string.py | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/clickhouse_connect/datatypes/__init__.py b/clickhouse_connect/datatypes/__init__.py index b9006515..941a2a5d 100644 --- a/clickhouse_connect/datatypes/__init__.py +++ b/clickhouse_connect/datatypes/__init__.py @@ -20,4 +20,3 @@ dt_string.FixedString._read_native_bytes = creaders.read_fixed_string_bytes except ImportError: logger.warning('Unable to connect optimized C driver functions, falling back to pure Python', exc_info=True) - diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index 3b8fbfd3..bdc95d86 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -118,7 +118,7 @@ def read_native_prefix(self, source: Sequence, loc: int): if self.low_card: v, loc = read_uint64(source, loc) if v != low_card_version: - logger.warning(f'Unexpected low cardinality version %d reading type %s', v, 
self.name) + logger.warning('Unexpected low cardinality version %d reading type %s', v, self.name) return loc def read_native_column(self, source: Sequence, loc: int, num_rows: int, **kwargs) -> Tuple[Sequence, int]: diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py index 3ad823bf..aab098aa 100644 --- a/clickhouse_connect/datatypes/string.py +++ b/clickhouse_connect/datatypes/string.py @@ -97,8 +97,7 @@ def python_null(self): def _from_row_binary(self, source: Sequence, loc: int): return bytes(source[loc:loc + self._byte_size]), loc + self._byte_size - @staticmethod - def _to_row_binary(value: Sequence, dest: MutableSequence): + def _to_row_binary(self, value: Sequence, dest: MutableSequence): dest += value @property From a7ecf1d287ecd7057548f8f524980b583852dceb Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 25 Jul 2022 16:00:11 -0600 Subject: [PATCH 08/25] Format checkpoint --- clickhouse_connect/datatypes/base.py | 37 ++++++++++++++------ clickhouse_connect/datatypes/format.py | 37 +++++++++++++++----- clickhouse_connect/datatypes/network.py | 2 ++ clickhouse_connect/datatypes/numeric.py | 4 ++- clickhouse_connect/datatypes/special.py | 4 ++- clickhouse_connect/datatypes/string.py | 18 ++-------- tests/unit_tests/test_driver/test_formats.py | 8 ++++- 7 files changed, 72 insertions(+), 38 deletions(-) diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index bdc95d86..ef925da9 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -35,7 +35,10 @@ class ClickHouseType(ABC): __slots__ = 'nullable', 'low_card', 'wrappers', 'type_def', '__dict__' _ch_name = None _name_suffix = '' - np_type = 'O' + _encoding = 'utf8' + np_type = 'O' # Default to Numpy Object type + valid_formats = 'native' + python_null = 0 python_type = None @@ -49,24 +52,22 @@ def build(cls: Type['ClickHouseType'], type_def: TypeDef): return cls(type_def) @classmethod - def 
read_format(cls): + def _active_format(cls, fmt_map: Dict[Type['ClickHouseType'], str]): overrides = getattr(threading.local, 'ch_column_overrides', None) if overrides and cls in overrides: return overrides[cls] overrides = getattr(threading.local, 'ch_query_overrides)', None) if overrides and cls in overrides: return overrides[cls] - return ch_read_formats.get(cls, 'native') + return fmt_map.get(cls, 'native') + + @classmethod + def read_format(cls): + return cls._active_format(ch_read_formats) @classmethod def write_format(cls): - overrides = getattr(threading.local, 'ch_column_overrides', None) - if overrides and cls in overrides: - return overrides[cls] - overrides = getattr(threading.local, 'ch_write_overrides)', None) - if overrides and cls in overrides: - return overrides[cls] - return ch_write_formats.get(cls, 'native') + return cls._active_format(ch_write_formats) def __init__(self, type_def: TypeDef): """ @@ -98,6 +99,16 @@ def name(self): name = f'{wrapper}({name})' return name + @property + def encoding(self): + override = getattr(threading.local, 'ch_column_encoding', None) + if override: + return override + override = getattr(threading.local, 'ch_query_encoding', None) + if override: + return override + return self._encoding + def write_native_prefix(self, dest: MutableSequence): """ This is something of a hack, as the only "prefix" currently used is for the LowCardinality version. 
Because of the @@ -304,6 +315,7 @@ class ArrayType(ClickHouseType, ABC, registered=False): _signed = True _array_type = None _struct_type = None + valid_formats = 'string', 'native' python_type = int def __init_subclass__(cls, registered: bool = True): @@ -314,7 +326,10 @@ def __init_subclass__(cls, registered: bool = True): cls._struct_type = '<' + cls._array_type def _read_native_binary(self, source: Sequence, loc: int, num_rows: int): - return array_column(self._array_type, source, loc, num_rows) + column, loc = array_column(self._array_type, source, loc, num_rows) + if self.read_format() == 'string': + column = [str(x) for x in column] + return column, loc def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: MutableSequence): if column and self.nullable: diff --git a/clickhouse_connect/datatypes/format.py b/clickhouse_connect/datatypes/format.py index 5b198e2c..b643e80b 100644 --- a/clickhouse_connect/datatypes/format.py +++ b/clickhouse_connect/datatypes/format.py @@ -6,6 +6,10 @@ from clickhouse_connect.driver.exceptions import ProgrammingError +def set_encoding(encoding: str): + ClickHouseType._encoding = encoding + + def set_default_formats(*args, **kwargs): fmt_map = format_map(_convert_arguments(*args, **kwargs)) ch_read_formats.update(fmt_map) @@ -23,11 +27,21 @@ def clear_default_format(pattern: str): ch_write_formats.pop(ch_type, None) +def set_write_format(pattern: str, fmt: str): + for ch_type in _matching_types(pattern): + ch_write_formats[ch_type] = fmt + + def clear_write_format(pattern: str): for ch_type in _matching_types(pattern): ch_write_formats.pop(ch_type, None) +def set_read_format(pattern: str, fmt: str): + for ch_type in _matching_types(pattern): + ch_read_formats[ch_type] = fmt + + def clear_read_format(pattern: str): for ch_type in _matching_types(pattern): ch_read_formats.pop(ch_type, None) @@ -38,10 +52,7 @@ def format_map(fmt_map: Dict[str, str]) -> Dict[Type[ClickHouseType], str]: return {} final_map = {} 
for pattern, fmt in fmt_map.items():
-        matches = _matching_types(pattern)
-        if not matches:
-            raise ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats')
-        for ch_type in matches:
+        for ch_type in _matching_types(pattern, fmt):
             final_map[ch_type] = fmt
     return final_map
 
@@ -57,10 +68,18 @@ def _convert_arguments(*args, **kwargs) -> Dict[str, str]:
     return fmt_map
 
 
-def _matching_types(pattern: str) -> Sequence[Type[ClickHouseType]]:
+def _matching_types(pattern: str, fmt: str = None) -> Sequence[Type[ClickHouseType]]:
     if '*' in pattern:
         re_pattern = re.compile(pattern.replace('*', '.*'), re.IGNORECASE)
-        return [ch_type for type_name, ch_type in type_map.items() if re_pattern.match(type_name)]
-    if pattern in type_map:
-        return [type_map[pattern]]
-    return []
+        matches = [ch_type for type_name, ch_type in type_map.items() if re_pattern.match(type_name)]
+    elif pattern in type_map:
+        matches = [type_map[pattern]]
+    else:
+        matches = []
+    if not matches:
+        raise ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats')
+    if fmt:
+        invalid = [ch_type.__name__ for ch_type in matches if fmt not in ch_type.valid_formats]
+        if invalid:
+            raise ProgrammingError(f"{fmt} is not a valid format for ClickHouse types {','.join(invalid)}.")
+    return matches
diff --git a/clickhouse_connect/datatypes/network.py b/clickhouse_connect/datatypes/network.py
index b485e133..503e2211 100644
--- a/clickhouse_connect/datatypes/network.py
+++ b/clickhouse_connect/datatypes/network.py
@@ -13,6 +13,7 @@
 # pylint: disable=protected-access
 class IPv4(ArrayType):
     _array_type = 'I'
+    valid_formats = 'string', 'native'
 
     @property
     def python_type(self):
@@ -75,6 +76,7 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M
 
 # pylint: disable=protected-access
 class IPv6(ClickHouseType):
+    valid_formats = 'string', 'native'
 
     @property
     def python_type(self):
diff --git a/clickhouse_connect/datatypes/numeric.py 
b/clickhouse_connect/datatypes/numeric.py index 1e65edf2..7e4750d6 100644 --- a/clickhouse_connect/datatypes/numeric.py +++ b/clickhouse_connect/datatypes/numeric.py @@ -88,6 +88,7 @@ def _to_row_binary(self, value: int, dest: MutableSequence): class UInt64(ArrayType): + valid_formats = 'signed', 'native' @property def _array_type(self): @@ -108,6 +109,7 @@ def _to_row_binary(self, value: int, dest: MutableSequence): class BigInt(ClickHouseType, registered=False): _signed = True _byte_size = 0 + valid_formats = 'string', 'native' def _read_native_binary(self, source: Sequence, loc: int, num_rows: int): signed = self._signed @@ -133,7 +135,7 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M signed = self._signed empty = bytes(b'\x00' * sz) ext = dest.extend - if isinstance(first, str): + if isinstance(first, str) or self.write_format() == 'string': if self.nullable: for x in column: if x: diff --git a/clickhouse_connect/datatypes/special.py b/clickhouse_connect/datatypes/special.py index d50b31cc..a9bfb924 100644 --- a/clickhouse_connect/datatypes/special.py +++ b/clickhouse_connect/datatypes/special.py @@ -9,6 +9,8 @@ class UUID(ClickHouseType): + valid_formats = 'string', 'native' + @property def python_null(self): return PYUUID(int=0) if self.read_format() == 'uuid' else '' @@ -68,7 +70,7 @@ def _read_native_str(source: Sequence, loc: int, num_rows: int): def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: MutableSequence): first = self._first_value(column) empty = empty_uuid_b - if isinstance(first, str): + if isinstance(first, str) or self.write_format() == 'string': for v in column: if v: x = int(v, 16) diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py index aab098aa..45c0ef78 100644 --- a/clickhouse_connect/datatypes/string.py +++ b/clickhouse_connect/datatypes/string.py @@ -5,16 +5,8 @@ class String(ClickHouseType): - encoding = 'utf8' python_null = '' - 
def __init__(self, type_def: TypeDef): - super().__init__(type_def) - try: - self.encoding = type_def.values[0] - except IndexError: - pass - def _from_row_binary(self, source, loc): length, loc = read_leb128(source, loc) return str(source[loc:loc + length], self.encoding), loc + length @@ -77,22 +69,18 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M class FixedString(ClickHouseType): - encoding = 'utf8' + valid_formats = 'string', 'native' def __init__(self, type_def: TypeDef): super().__init__(type_def) self._byte_size = type_def.values[0] - try: - self.encoding = type_def.values[1] - except IndexError: - pass self._name_suffix = type_def.arg_str self._empty_bytes = bytes(b'\x00' * self._byte_size) self.to_row_binary = self._to_rb_internal @property def python_null(self): - return self._empty_bytes if self.read_format() == 'bytes' else '' + return self._empty_bytes if self.read_format() == 'native' else '' def _from_row_binary(self, source: Sequence, loc: int): return bytes(source[loc:loc + self._byte_size]), loc + self._byte_size @@ -140,7 +128,7 @@ def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: M str_enc = str.encode enc = self.encoding first = self._first_value(column) - if isinstance(first, str): + if isinstance(first, str) or self.write_format() == 'string': if self.nullable: for x in column: if x is None: diff --git a/tests/unit_tests/test_driver/test_formats.py b/tests/unit_tests/test_driver/test_formats.py index 3ddf9979..eee76926 100644 --- a/tests/unit_tests/test_driver/test_formats.py +++ b/tests/unit_tests/test_driver/test_formats.py @@ -1,4 +1,4 @@ -from clickhouse_connect.datatypes.format import set_default_formats +from clickhouse_connect.datatypes.format import set_default_formats, set_write_format from clickhouse_connect.datatypes.network import IPv6 from clickhouse_connect.datatypes.numeric import Int32 from clickhouse_connect.datatypes.string import FixedString @@ -9,3 +9,9 @@ 
def test_default_formats(): assert IPv6.read_format() == 'string' assert Int32.read_format() == 'string' assert FixedString.read_format() == 'native' + + +def test_fixed_str_format(): + set_write_format('FixedString', 'string') + + From f2a429953f977cf141172aeaa49f1df2bbf63a55 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 25 Jul 2022 19:54:44 -0600 Subject: [PATCH 09/25] Fix named tuples --- clickhouse_connect/datatypes/container.py | 3 +- clickhouse_connect/datatypes/format.py | 4 +- clickhouse_connect/datatypes/registry.py | 3 ++ clickhouse_connect/driver/client.py | 6 +-- clickhouse_connect/driver/parser.py | 66 +++++++++++------------ clickhouse_connect/driver/query.py | 5 +- tests/integration_tests/conftest.py | 3 ++ tests/integration_tests/test_formats.py | 16 ++++-- tests/unit_tests/test_chtypes.py | 5 ++ 9 files changed, 64 insertions(+), 47 deletions(-) diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index 6b492015..382eacdc 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -84,11 +84,12 @@ def write_native_data(self, column: Sequence, dest: MutableSequence): class Tuple(ClickHouseType): - _slots = 'element_types', 'from_rb_funcs', 'to_rb_funcs' + _slots = 'element_names', 'element_types', 'from_rb_funcs', 'to_rb_funcs' python_type = tuple def __init__(self, type_def: TypeDef): super().__init__(type_def) + self.element_names = type_def.keys self.element_types = [get_from_name(name) for name in type_def.values] self.from_rb_funcs = tuple((t.from_row_binary for t in self.element_types)) self.to_rb_funcs = tuple((t.to_row_binary for t in self.element_types)) diff --git a/clickhouse_connect/datatypes/format.py b/clickhouse_connect/datatypes/format.py index b643e80b..9cbab1f9 100644 --- a/clickhouse_connect/datatypes/format.py +++ b/clickhouse_connect/datatypes/format.py @@ -6,8 +6,8 @@ from clickhouse_connect.driver.exceptions import 
ProgrammingError -def set_encoding(encoding: str): - ClickHouseType._encoding = encoding +def default_encoding(encoding: str): + ClickHouseType._encoding = encoding # pylint: disable=protected-access def set_default_formats(*args, **kwargs): diff --git a/clickhouse_connect/datatypes/registry.py b/clickhouse_connect/datatypes/registry.py index a3b27bc4..47da6e05 100644 --- a/clickhouse_connect/datatypes/registry.py +++ b/clickhouse_connect/datatypes/registry.py @@ -32,6 +32,9 @@ def parse_name(name: str) -> Tuple[str, str, TypeDef]: elif base.startswith('Nested'): keys, values = parse_columns(base[6:]) base = 'Nested' + elif base.startswith('Tuple'): + keys, values = parse_columns(base[5:]) + base = 'Tuple' else: try: base, values, _ = parse_callable(base) diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 0c48ae82..86109144 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -131,17 +131,17 @@ def query_arrow(self, settings: Optional[Dict[str, Any]] = None, use_strings: bool = True): """ - Query method using the ClickHouse ArrowStream format to return a PyArrow result + Query method using the ClickHouse Arrow format to return a PyArrow table :param query: Query statement/format string :param parameters: Optional dictionary used to format the query :param settings: Optional dictionary of ClickHouse settings (key/string values) :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary) - :return: Tuple of the PyArrow schema and a single record batch + :return: PyArrow.Table """ arrow_settings = {} if not settings else settings.copy() if 'output_format_arrow_string_as_string' not in arrow_settings: arrow_settings['output_format_arrow_string_as_string'] = '1' if use_strings else '0' - return to_arrow(self.raw_query(query, parameters, arrow_settings, 'ArrowStream')) + return to_arrow(self.raw_query(query, parameters, arrow_settings, 'Arrow')) 
@abstractmethod def command(self, diff --git a/clickhouse_connect/driver/parser.py b/clickhouse_connect/driver/parser.py index b74d31dd..76ce66b1 100644 --- a/clickhouse_connect/driver/parser.py +++ b/clickhouse_connect/driver/parser.py @@ -124,44 +124,42 @@ def parse_columns(expr: str): names = [] columns = [] pos = 1 - in_column = False + named = False level = 0 - name = [] - column = '' + label = '' in_str = False while True: char = expr[pos] pos += 1 - if in_column: - if in_str: - column += '' - if "'" == char: - in_str = False - elif char == '\\' and expr[pos] == "'" and expr[pos:pos + 4] != "' = " and expr[pos:pos + 2] != "')": - column += expr[pos] - pos += 1 - else: - if level == 0: - if char == ',': - columns.append(column) - column = '' - in_column = False - continue - if char == ')': - columns.append(column) - break - if char == "'" and (not column or 'Enum' in column): - in_str = True - if char == '(': - level += 1 - elif char == ')': - level -= 1 - column += char - elif char == ' ': - if name: - names.append(''.join(name)) - name = [] - in_column = True + if in_str: + if "'" == char: + in_str = False + elif char == '\\' and expr[pos] == "'" and expr[pos:pos + 4] != "' = " and expr[pos:pos + 2] != "')": + label += expr[pos] + pos += 1 else: - name.append(char) + if level == 0: + if char == ' ': + if label: + names.append(label) + label = '' + named = True + char = '' + elif char == ',': + columns.append(label) + if not named: + names.append('') + named = False + label = '' + continue + elif char == ')': + columns.append(label) + break + if char == "'" and (not label or 'Enum' in label): + in_str = True + elif char == '(': + level += 1 + elif char == ')': + level -= 1 + label += char return tuple(names), tuple(columns) diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index c7c2cd37..f37bf974 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -115,6 +115,5 @@ def 
from_pandas_df(df: 'pa.DataFrame'): def to_arrow(content: bytes): check_arrow() - buf = pyarrow.BufferReader(content) - schema = pyarrow.read_schema(buf) - return schema, pyarrow.read_record_batch(buf, schema) + reader = pyarrow.RecordBatchFileReader(content) + return reader.read_all() diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index b9e17313..58396f68 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -69,6 +69,9 @@ def test_client_fixture(test_config: TestConfig, test_db: str) -> Iterator[Clien if test_config.use_docker: run_cmd(['docker-compose', '-f', compose_file, 'down', '-v']) sys.stderr.write('Starting docker compose') + pull_result = run_cmd(['docker-compose', '-f', compose_file, 'pull']) + if pull_result[0]: + raise Exception(f'Failed to pull latest docker image(s): {pull_result[2]}') up_result = run_cmd(['docker-compose', '-f', compose_file, 'up', '-d']) if up_result[0]: raise Exception(f'Failed to start docker: {up_result[2]}') diff --git a/tests/integration_tests/test_formats.py b/tests/integration_tests/test_formats.py index c7abc03f..4b344003 100644 --- a/tests/integration_tests/test_formats.py +++ b/tests/integration_tests/test_formats.py @@ -7,12 +7,20 @@ def test_arrow(test_client: Client): if not HAS_ARROW: pytest.skip('PyArrow package not available') - arrow_schema, arrow_batch = test_client.query_arrow('SELECT database, name, total_rows FROM system.tables') + arrow_table = test_client.query_arrow('SELECT database, name, total_rows FROM system.tables') + arrow_schema = arrow_table.schema assert arrow_schema.field(0).name == 'database' - assert arrow_schema.field(2).type.id == 8 + assert arrow_schema.field(1).type.id == 13 assert arrow_schema.field(2).type.bit_width == 64 - assert arrow_batch.num_rows > 20 - assert len(arrow_batch.columns) == 3 + assert arrow_table.num_rows > 20 + assert len(arrow_table.columns) == 3 + + arrow_table = 
test_client.query_arrow('SELECT number from system.numbers LIMIT 500', + settings={'max_block_size': 50}) + arrow_schema = arrow_table.schema + assert arrow_schema.field(0).name == 'number' + assert arrow_schema.field(0).type.id == 8 + assert arrow_table.num_rows == 500 def test_numpy(test_client: Client): diff --git a/tests/unit_tests/test_chtypes.py b/tests/unit_tests/test_chtypes.py index fd096ab7..df66018c 100644 --- a/tests/unit_tests/test_chtypes.py +++ b/tests/unit_tests/test_chtypes.py @@ -36,3 +36,8 @@ def test_nested_parse(): nested_name = f'Nested({nest})' nested_type = gfn(nested_name) assert nested_type.name == nested_name + + +def test_named_tuple(): + pass + From 9adc694d52ddd112f9df70dc35dd724d3d3bc72f Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 26 Jul 2022 04:35:49 -0600 Subject: [PATCH 10/25] Named tuple tweaks and test --- clickhouse_connect/datatypes/container.py | 5 ++++- clickhouse_connect/driver/parser.py | 7 +++---- tests/integration_tests/test_formats.py | 4 ++-- tests/unit_tests/test_chtypes.py | 6 ++++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index 382eacdc..84a7ea47 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -93,7 +93,10 @@ def __init__(self, type_def: TypeDef): self.element_types = [get_from_name(name) for name in type_def.values] self.from_rb_funcs = tuple((t.from_row_binary for t in self.element_types)) self.to_rb_funcs = tuple((t.to_row_binary for t in self.element_types)) - self._name_suffix = type_def.arg_str + if self.element_names: + self._name_suffix = f"({', '.join(k + ' ' + str(v) for k, v in zip(type_def.keys, type_def.values))})" + else: + self._name_suffix = type_def.arg_str def _from_row_binary(self, source: bytes, loc: int): values = [] diff --git a/clickhouse_connect/driver/parser.py b/clickhouse_connect/driver/parser.py index 
76ce66b1..0064302c 100644 --- a/clickhouse_connect/driver/parser.py +++ b/clickhouse_connect/driver/parser.py @@ -117,7 +117,8 @@ def parse_enum(expr) -> Tuple[Tuple[str], Tuple[int]]: def parse_columns(expr: str): """ - Parse a ClickHouse column list of the form (col1 String, col2 Array(Tuple(String, Int32))) + Parse a ClickHouse column list of the form (col1 String, col2 Array(Tuple(String, Int32))). This also handles + unnamed columns (such as Tuple definitions). Mixed named and unnamed columns are not currently supported. :param expr: ClickHouse enum expression/arguments :return: Parallel tuples of column types and column types (strings) """ @@ -140,15 +141,13 @@ def parse_columns(expr: str): else: if level == 0: if char == ' ': - if label: + if label and not named: names.append(label) label = '' named = True char = '' elif char == ',': columns.append(label) - if not named: - names.append('') named = False label = '' continue diff --git a/tests/integration_tests/test_formats.py b/tests/integration_tests/test_formats.py index 4b344003..28ddf82c 100644 --- a/tests/integration_tests/test_formats.py +++ b/tests/integration_tests/test_formats.py @@ -7,10 +7,10 @@ def test_arrow(test_client: Client): if not HAS_ARROW: pytest.skip('PyArrow package not available') - arrow_table = test_client.query_arrow('SELECT database, name, total_rows FROM system.tables') + arrow_table = test_client.query_arrow('SELECT database, name, total_rows FROM system.tables', use_strings=False) arrow_schema = arrow_table.schema assert arrow_schema.field(0).name == 'database' - assert arrow_schema.field(1).type.id == 13 + assert arrow_schema.field(1).type.id == 14 assert arrow_schema.field(2).type.bit_width == 64 assert arrow_table.num_rows > 20 assert len(arrow_table.columns) == 3 diff --git a/tests/unit_tests/test_chtypes.py b/tests/unit_tests/test_chtypes.py index df66018c..140cdb1d 100644 --- a/tests/unit_tests/test_chtypes.py +++ b/tests/unit_tests/test_chtypes.py @@ -39,5 +39,7 @@ def 
test_nested_parse(): def test_named_tuple(): - pass - + tuple_type = gfn('Tuple(Int64, String)') + assert tuple_type.name == 'Tuple(Int64, String)' + tuple_type = gfn('Tuple(key Int64, value String)') + assert tuple_type.name == 'Tuple(key Int64, value String)' From 454e5151c71a565453bc08e216ee6d7a54184fd0 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 26 Jul 2022 05:09:50 -0600 Subject: [PATCH 11/25] Fix lint --- .../{test_formats.py => test_data_libraries.py} | 1 - tests/unit_tests/test_driver/test_formats.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) rename tests/integration_tests/{test_formats.py => test_data_libraries.py} (99%) diff --git a/tests/integration_tests/test_formats.py b/tests/integration_tests/test_data_libraries.py similarity index 99% rename from tests/integration_tests/test_formats.py rename to tests/integration_tests/test_data_libraries.py index 28ddf82c..c2c2c81b 100644 --- a/tests/integration_tests/test_formats.py +++ b/tests/integration_tests/test_data_libraries.py @@ -14,7 +14,6 @@ def test_arrow(test_client: Client): assert arrow_schema.field(2).type.bit_width == 64 assert arrow_table.num_rows > 20 assert len(arrow_table.columns) == 3 - arrow_table = test_client.query_arrow('SELECT number from system.numbers LIMIT 500', settings={'max_block_size': 50}) arrow_schema = arrow_table.schema diff --git a/tests/unit_tests/test_driver/test_formats.py b/tests/unit_tests/test_driver/test_formats.py index eee76926..a9d79b92 100644 --- a/tests/unit_tests/test_driver/test_formats.py +++ b/tests/unit_tests/test_driver/test_formats.py @@ -13,5 +13,4 @@ def test_default_formats(): def test_fixed_str_format(): set_write_format('FixedString', 'string') - - + assert FixedString.write_format() == 'string' From 24552563be4ce055a6611eb5b20a93d10eb6a04e Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 27 Jul 2022 17:29:25 -0600 Subject: [PATCH 12/25] JSON support checkpoint --- README.md | 4 +- .../cc_sqlalchemy/sql/__init__.py | 4 +- 
clickhouse_connect/datatypes/container.py | 53 ++++++++++++++++--- clickhouse_connect/driver/client.py | 25 +++++++++ clickhouse_connect/driver/common.py | 8 ++- clickhouse_connect/driver/httpclient.py | 9 +++- clickhouse_connect/driver/parser.py | 5 +- clickhouse_connect/json_impl.py | 17 ++++++ tests/integration_tests/conftest.py | 2 + tests/integration_tests/test_native.py | 33 +++++++++++- tests/integration_tests/test_native_fuzz.py | 3 +- 11 files changed, 143 insertions(+), 20 deletions(-) create mode 100644 clickhouse_connect/json_impl.py diff --git a/README.md b/README.md index 14f06747..dc27a96b 100644 --- a/README.md +++ b/README.md @@ -104,8 +104,8 @@ Create a ClickHouse client using the `clickhouse_connect.driver.create_client(.. Native format is preferred for performance reasons * `query_limit:int` LIMIT value added to all queries. Defaults to 5,000 rows. Unlimited queries are not supported to prevent crashing the driver -* `connect_timeout:int` HTTP connection timeout in seconds -* `send_receive_timeout:int` HTTP read timeout in seconds +* `connect_timeout:int` HTTP connection timeout in seconds. Default 10 seconds. +* `send_receive_timeout:int` HTTP read timeout in seconds. Default 300 seconds. * `client_name:str` HTTP User-Agent header. Defaults to `clickhouse-connect` * `verify:bool` For HTTPS connections, validate the ClickHouse server TLS certificate, including matching hostname, expiration, and signed by a trusted Certificate Authority. Defaults to True. 
diff --git a/clickhouse_connect/cc_sqlalchemy/sql/__init__.py b/clickhouse_connect/cc_sqlalchemy/sql/__init__.py index 19af42e1..e4040382 100644 --- a/clickhouse_connect/cc_sqlalchemy/sql/__init__.py +++ b/clickhouse_connect/cc_sqlalchemy/sql/__init__.py @@ -1,11 +1,11 @@ -import re from typing import Optional from sqlalchemy import Table from sqlalchemy.sql.compiler import RESERVED_WORDS +from clickhouse_connect.driver.common import identifier_re + reserved_words = RESERVED_WORDS | set('index') -identifier_re = re.compile(r'^[a-zA-Z_][0-9a-zA-Z_]*$') def quote_id(v: str) -> str: diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index 84a7ea47..d3e01ce1 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -1,9 +1,11 @@ import array -from typing import Dict, Sequence, MutableSequence +from typing import Dict, Sequence, MutableSequence, Any -from clickhouse_connect.datatypes.base import UnsupportedType, ClickHouseType, TypeDef -from clickhouse_connect.driver.common import read_leb128, to_leb128, array_column, must_swap +from clickhouse_connect.datatypes.base import UnsupportedType, ClickHouseType, TypeDef, EMPTY_TYPE_DEF +from clickhouse_connect.datatypes.string import String +from clickhouse_connect.driver.common import read_leb128, to_leb128, array_column, must_swap, write_uint64 from clickhouse_connect.datatypes.registry import get_from_name +from clickhouse_connect.json_impl import json_impl class Array(ClickHouseType): @@ -116,9 +118,16 @@ def read_native_prefix(self, source: Sequence, loc: int): def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none = True): columns = [] + e_names = self.element_names for e_type in self.element_types: column, loc = e_type.read_native_data(source, loc, num_rows, use_none) columns.append(column) + if e_names and self.read_format != 'tuple': + dicts = [{} for _ in range(num_rows)] + for ix, x in 
enumerate(dicts): + for n, key in enumerate(e_names): + x[key] = columns[n][ix] + return dicts, loc return tuple(zip(*columns)), loc def write_native_prefix(self, dest: MutableSequence): @@ -239,13 +248,41 @@ def write_native_data(self, column: Sequence, dest: MutableSequence): self.tuple_array.write_native_data(data, dest) -class Object(UnsupportedType): +class JSON(ClickHouseType): python_type = dict - def __init__(self, type_def): - super().__init__(type_def) - self._name_suffix = type_def.arg_str + def _to_row_binary(self, value: Any, dest: MutableSequence): + value = bytes(json_impl.dumps(value)) + dest += to_leb128(len(value)) + value + def _from_row_binary(self, source: Sequence, loc: int): + length, loc = read_leb128(source, loc) + return json_impl.loads(str(source[loc:loc + length])), loc + length + + def write_native_prefix(self, dest: MutableSequence): + dest.append(0x01) -class JSON(UnsupportedType): + def write_native_data(self, column: Sequence, dest: MutableSequence): + app = dest.append + to_json = json_impl.dumps + for x in column: + v = to_json(x) + sz = len(v) + while True: + b = sz & 0x7f + sz >>= 7 + if sz == 0: + app(b) + break + app(0x80 | b) + dest += v + + +class Object(JSON): python_type = dict + + def __init__(self, type_def): + if type_def.values[0].lower() != "'json'": + raise NotImplementedError('Only json Object type is currently supported') + super().__init__(type_def) + self._name_suffix = type_def.arg_str diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 86109144..0079201d 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -69,6 +69,14 @@ def _prep_query(self, query: str, parameters: Optional[Dict[str, Any]] = None): query += f' LIMIT {self.limit}' return query + @abstractmethod + def client_setting(self, name, value): + """ + Set a clickhouse setting for the client after initialization + :param name: Setting name + :param value: Setting value + 
""" + @abstractmethod def query(self, query: str, @@ -245,6 +253,23 @@ def normalize_table(self, table: str, database: Optional[str]) -> Tuple[str, str full_name = f'{database}.{name}' return table, database, full_name + def min_version(self, version_str: str) -> bool: + """ + Determine whether the connected server is at least the submitted version + :param version_str: Version string consisting of up to 4 integers delimited by dots + :return: 1 if the version_str is greater than the server_version, 0 if equal, -1 if less than + """ + server_parts = [int(x) for x in self.server_version.split('.')] + server_parts.extend([0] * (4 - len(server_parts))) + version_parts = [int(x) for x in version_str.split('.')] + version_parts.extend([0] * (4 - len(version_parts))) + for x, y in zip(server_parts, version_parts): + if x > y: + return True + if x < y: + return False + return True + def table_columns(self, table: str, database: str) -> Tuple[ColumnDef]: """ Return complete column definitions for a ClickHouse table diff --git a/clickhouse_connect/driver/common.py b/clickhouse_connect/driver/common.py index f1ae9c97..4b448796 100644 --- a/clickhouse_connect/driver/common.py +++ b/clickhouse_connect/driver/common.py @@ -1,5 +1,6 @@ import array import sys +import re from typing import Tuple, Sequence, MutableSequence @@ -9,6 +10,8 @@ low_card_version = 1 array_map = {1: 'b', 2: 'h', 4: 'i', 8: 'q'} +decimal_prec = {32: 9, 64: 18, 128: 38, 256: 79} +identifier_re = re.compile('^[a-zA-Z_][0-9a-zA-Z_]*$') if int_size == 2: array_map[4] = 'l' @@ -166,4 +169,7 @@ def decimal_size(prec: int): return 256 -decimal_prec = {32: 9, 64: 18, 128: 38, 256: 79} +def unescape_identifier(x: str) -> str: + if x.startswith('`') and x.endswith('`'): + return x[1:-1] + return x \ No newline at end of file diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index d7aaec41..fb6b081d 100644 --- a/clickhouse_connect/driver/httpclient.py +++ 
b/clickhouse_connect/driver/httpclient.py @@ -47,7 +47,7 @@ def __init__(self, data_format: str = 'native', query_limit: int = 5000, connect_timeout: int = 10, - send_receive_timeout=60, + send_receive_timeout = 300, client_name: str = 'clickhouse-connect', send_progress: bool = True, verify: bool = True, @@ -143,6 +143,11 @@ def _format_query(self, query: str) -> str: query += f' FORMAT {self.read_format}' return query + def client_setting(self, name, value): + if isinstance(value, bool): + value = '1' if value else '0' + self.session.params[name] = str(value) + def query(self, query: str, parameters: Optional[Dict[str, Any]] = None, settings: Dict[str, Any] = None, @@ -192,7 +197,7 @@ def data_insert(self, 'database': self.database} params.update(self._validate_settings(settings, True)) insert_block = self.transform.build_insert(data, column_types=column_types, column_names=column_names, - column_oriented=column_oriented) + column_oriented=column_oriented) response = self._raw_request(insert_block, params, headers) logger.debug('Insert response code: %d, content: %s', response.status_code, response.content) diff --git a/clickhouse_connect/driver/parser.py b/clickhouse_connect/driver/parser.py index 0064302c..1815734c 100644 --- a/clickhouse_connect/driver/parser.py +++ b/clickhouse_connect/driver/parser.py @@ -2,6 +2,9 @@ # pylint: disable=too-many-branches +from clickhouse_connect.driver.common import unescape_identifier + + def parse_callable(expr) -> Tuple[str, Tuple[Union[str, int], ...], str]: """ Parses a single level ClickHouse optionally 'callable' function/identifier. 
The identifier is returned as the @@ -142,7 +145,7 @@ def parse_columns(expr: str): if level == 0: if char == ' ': if label and not named: - names.append(label) + names.append(unescape_identifier(label)) label = '' named = True char = '' diff --git a/clickhouse_connect/json_impl.py b/clickhouse_connect/json_impl.py new file mode 100644 index 00000000..5728520a --- /dev/null +++ b/clickhouse_connect/json_impl.py @@ -0,0 +1,17 @@ +import logging + +logger = logging.getLogger(__name__) + +try: + import orjson as json_impl + logger.info('Using orjson as the JSON implementation') +except ImportError: + try: + import ujson as json_impl + logger.info('Using ujson as the JSON implementation') + except ImportError: + import json as json_impl + logger.info('Using default JSON implementation') + + + diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 58396f68..c46feeca 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -91,6 +91,8 @@ def test_client_fixture(test_config: TestConfig, test_db: str) -> Iterator[Clien if tries > 15: raise Exception('Failed to connect to ClickHouse server after 30 seconds') from ex sleep(1) + if client.min_version('22.6.1'): + client.client_setting('allow_experimental_object_type', 1) if test_db != 'default': client.command(f'CREATE DATABASE IF NOT EXISTS {test_db}', use_database=False) client.database = test_db diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py index 1c149801..2602a8dc 100644 --- a/tests/integration_tests/test_native.py +++ b/tests/integration_tests/test_native.py @@ -1,7 +1,36 @@ -def test_low_card(test_client): +import pytest + +from clickhouse_connect.driver import Client + + +def test_low_card(test_client: Client, test_table_engine: str): test_client.command('DROP TABLE IF EXISTS native_test') test_client.command('CREATE TABLE native_test (key LowCardinality(Int32), value_1 LowCardinality(String)) ' + - 
'Engine MergeTree ORDER BY key') + f'Engine {test_table_engine} ORDER BY key') test_client.insert('native_test', [[55, 'TV1'], [-578328, 'TV38882'], [57372, 'Kabc/defXX']]) result = test_client.query("SELECT * FROM native_test WHERE value_1 LIKE '%abc/def%'") assert len(result.result_set) == 1 + + +def test_json_insert(test_client: Client, test_table_engine: str): + if not test_client.min_version('22.6.1'): + pytest.skip('JSON test skipped for old version {test_client.server_version}') + test_client.command('DROP TABLE IF EXISTS native_json_test') + test_client.command('CREATE TABLE native_json_test (key Int32, value JSON, e2 Int32)' + + f'Engine {test_table_engine} ORDER BY key') + jv1 = {'key1': 337, 'value.2': 'vvvv', 'HKD@spéçiäl': 'Special K', 'blank': 'not_really_blank'} + jv3 = {'key3': 752, 'value.2': 'v2_rules', 'blank': None} + test_client.insert('native_json_test', [[5, jv1, -44], [20, None, 5200], [25, jv3, 7302]]) + + result = test_client.query('SELECT * FROM native_json_test ORDER BY key') + json1 = result.result_set[0][1] + assert json1['HKD@spéçiäl'] == 'Special K' + assert json1['key3'] == 0 + json3 = result.result_set[2][1] + assert json3['value.2'] == 'v2_rules' + assert json3['key1'] == 0 + assert json3['key3'] == 752 + + +def test_read_formats(test_client: Client, test_table_engine: str): + pass diff --git a/tests/integration_tests/test_native_fuzz.py b/tests/integration_tests/test_native_fuzz.py index e24fc7db..4f5770e4 100644 --- a/tests/integration_tests/test_native_fuzz.py +++ b/tests/integration_tests/test_native_fuzz.py @@ -12,8 +12,7 @@ # pylint: disable=duplicate-code def test_query_fuzz(test_client: Client, test_table_engine: str): - server_major = test_client.server_version.split('.')[0] - if int(server_major) < 22: + if not test_client.min_version('22.1'): unsupported_types.add('Date32') unsupported_types.add('Bool') unsupported_types.add('UInt128') From 37e4847e3ae3e0fb5b245a1a80d64c15c8d43205 Mon Sep 17 00:00:00 2001 From: Geoff 
Genz Date: Mon, 1 Aug 2022 05:05:01 -0600 Subject: [PATCH 13/25] Enhance JSON library selection and usage, use QueryContext for all queries --- CHANGELOG.md | 18 +++++- clickhouse_connect/datatypes/base.py | 15 ++--- clickhouse_connect/datatypes/container.py | 26 ++++---- clickhouse_connect/datatypes/format.py | 9 +-- clickhouse_connect/datatypes/string.py | 1 + clickhouse_connect/driver/client.py | 63 ++++++++++++++++--- clickhouse_connect/driver/common.py | 2 +- clickhouse_connect/driver/httpclient.py | 18 ++---- clickhouse_connect/driver/native.py | 4 +- clickhouse_connect/driver/query.py | 75 ++++++++++++++++++++++- clickhouse_connect/driver/transform.py | 32 +++------- clickhouse_connect/json_impl.py | 45 +++++++++++--- setup.py | 3 +- tests/integration_tests/test_native.py | 29 ++++++++- 14 files changed, 247 insertions(+), 93 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60dc14f8..96a786b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,24 @@ ## ClickHouse Connect ChangeLog +### Release 0.1.7, 2022-07-28 + +#### Improvements + +* Support (experimental) JSON/Object datatype. ClickHouse Connect will take advantage of the fast orjson library if available. +* Standardize read format handling and allow setting a return data format per column or per query. + +#### Bug Fixes +* Named Tuples were not supported and would result in throwing an exception. This has been fixed. +* The client query_arrow function would return incomplete results if the query result exceeded the ClickHouse max_block_size. This has been fixed. As part of the fix query_arrow method returns a PyArrow Table object. While this is a breaking change in the API it should be easy to work around. + + ### Release 0.1.6, 2022-07-06 #### Improvements -* Support Nested data types +* Support Nested data types. 
#### Bug Fixes -* Fix issue with native reads of Nullable(LowCardinality) numeric and date types -* Empty inserts will now just log a debug message instead of throwing an IndexError \ No newline at end of file +* Fix issue with native reads of Nullable(LowCardinality) numeric and date types. +* Empty inserts will now just log a debug message instead of throwing an IndexError. \ No newline at end of file diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index ef925da9..3bccf150 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -53,10 +53,11 @@ def build(cls: Type['ClickHouseType'], type_def: TypeDef): @classmethod def _active_format(cls, fmt_map: Dict[Type['ClickHouseType'], str]): - overrides = getattr(threading.local, 'ch_column_overrides', None) + t_local = threading.local() + overrides = getattr(t_local, 'ch_column_overrides', None) if overrides and cls in overrides: return overrides[cls] - overrides = getattr(threading.local, 'ch_query_overrides)', None) + overrides = getattr(t_local, 'ch_query_overrides)', None) if overrides and cls in overrides: return overrides[cls] return fmt_map.get(cls, 'native') @@ -101,19 +102,19 @@ def name(self): @property def encoding(self): - override = getattr(threading.local, 'ch_column_encoding', None) + override = getattr(threading.local(), 'ch_column_encoding', None) if override: return override - override = getattr(threading.local, 'ch_query_encoding', None) + override = getattr(threading.local(), 'ch_query_encoding', None) if override: return override return self._encoding def write_native_prefix(self, dest: MutableSequence): """ - This is something of a hack, as the only "prefix" currently used is for the LowCardinality version. 
Because of the - way the ClickHouse C++ code is written, this must be done before any data is written even if the LowCardinality column - is within a container + Prefix is primarily used is for the LowCardinality version (but see the JSON data type). Because of the + way the ClickHouse C++ code is written, this must be done before any data is written even if the + LowCardinality column is within a container. The only recognized low cardinality version is 1 :param dest: The native protocol binary write buffer """ if self.low_card: diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index d3e01ce1..930310ef 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -1,11 +1,10 @@ import array from typing import Dict, Sequence, MutableSequence, Any -from clickhouse_connect.datatypes.base import UnsupportedType, ClickHouseType, TypeDef, EMPTY_TYPE_DEF -from clickhouse_connect.datatypes.string import String -from clickhouse_connect.driver.common import read_leb128, to_leb128, array_column, must_swap, write_uint64 +from clickhouse_connect import json_impl +from clickhouse_connect.datatypes.base import ClickHouseType, TypeDef +from clickhouse_connect.driver.common import read_leb128, to_leb128, array_column, must_swap from clickhouse_connect.datatypes.registry import get_from_name -from clickhouse_connect.json_impl import json_impl class Array(ClickHouseType): @@ -116,17 +115,17 @@ def read_native_prefix(self, source: Sequence, loc: int): loc = e_type.read_native_prefix(source, loc) return loc - def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none = True): + def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none=True): columns = [] e_names = self.element_names for e_type in self.element_types: column, loc = e_type.read_native_data(source, loc, num_rows, use_none) columns.append(column) - if e_names and self.read_format != 'tuple': 
+ if e_names and self.read_format() != 'tuple': dicts = [{} for _ in range(num_rows)] for ix, x in enumerate(dicts): - for n, key in enumerate(e_names): - x[key] = columns[n][ix] + for y, key in enumerate(e_names): + x[key] = columns[y][ix] return dicts, loc return tuple(zip(*columns)), loc @@ -176,7 +175,7 @@ def read_native_prefix(self, source: Sequence, loc: int): return loc # pylint: disable=too-many-locals - def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none = True): + def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none=True): offsets, loc = array_column('Q', source, loc, num_rows) total_rows = offsets[-1] keys, loc = self.key_type.read_native_data(source, loc, total_rows, use_none) @@ -252,19 +251,20 @@ class JSON(ClickHouseType): python_type = dict def _to_row_binary(self, value: Any, dest: MutableSequence): - value = bytes(json_impl.dumps(value)) + value = json_impl.any_to_json(value) dest += to_leb128(len(value)) + value def _from_row_binary(self, source: Sequence, loc: int): - length, loc = read_leb128(source, loc) - return json_impl.loads(str(source[loc:loc + length])), loc + length + # ClickHouse will never return JSON/Object types, just tuples + return None, 0 def write_native_prefix(self, dest: MutableSequence): dest.append(0x01) + # pylint: disable=duplicate-code def write_native_data(self, column: Sequence, dest: MutableSequence): app = dest.append - to_json = json_impl.dumps + to_json = json_impl.any_to_json for x in column: v = to_json(x) sz = len(v) diff --git a/clickhouse_connect/datatypes/format.py b/clickhouse_connect/datatypes/format.py index 9cbab1f9..d8708566 100644 --- a/clickhouse_connect/datatypes/format.py +++ b/clickhouse_connect/datatypes/format.py @@ -69,13 +69,8 @@ def _convert_arguments(*args, **kwargs) -> Dict[str, str]: def _matching_types(pattern: str, fmt: str = None) -> Sequence[Type[ClickHouseType]]: - if '*' in pattern: - re_pattern = re.compile(pattern.replace('*', 
'.*'), re.IGNORECASE) - matches = [ch_type for type_name, ch_type in type_map.items() if re_pattern.match(type_name)] - elif pattern in type_map: - matches = [type_map[pattern]] - else: - matches = [] + re_pattern = re.compile(pattern.replace('*', '.*'), re.IGNORECASE) + matches = [ch_type for type_name, ch_type in type_map.items() if re_pattern.match(type_name)] if not matches: ProgrammingError(f'Unrecognized ClickHouse type {pattern} when setting formats') if fmt: diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py index 45c0ef78..911a3050 100644 --- a/clickhouse_connect/datatypes/string.py +++ b/clickhouse_connect/datatypes/string.py @@ -36,6 +36,7 @@ def _read_native_python(source, loc, num_rows, encoding: str): loc += length return column, loc + # pylint: disable=duplicate-code def _write_native_binary(self, column: Union[Sequence, MutableSequence], dest: MutableSequence): encoding = self.encoding app = dest.append diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 0079201d..5e538c17 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -9,7 +9,7 @@ from clickhouse_connect.driver.exceptions import ProgrammingError, InternalError from clickhouse_connect.driver.models import ColumnDef, SettingDef from clickhouse_connect.driver.query import QueryResult, np_result, to_pandas_df, from_pandas_df, format_query_value, \ - to_arrow + to_arrow, QueryContext logger = logging.getLogger(__name__) limit_re = re.compile(r'\s+LIMIT[$|\s]', re.IGNORECASE) @@ -69,6 +69,10 @@ def _prep_query(self, query: str, parameters: Optional[Dict[str, Any]] = None): query += f' LIMIT {self.limit}' return query + @abstractmethod + def _query_with_context(self, context: QueryContext): + pass + @abstractmethod def client_setting(self, name, value): """ @@ -77,20 +81,35 @@ def client_setting(self, name, value): :param value: Setting value """ - @abstractmethod def 
query(self, - query: str, + query: str = None, parameters: Optional[Dict[str, Any]] = None, settings: Optional[Dict[str, Any]] = None, - use_none: bool = True) -> QueryResult: + query_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, str]] = None, + use_none: bool = True, + context: QueryContext = None) -> QueryResult: """ Main query method for SELECT, DESCRIBE and other commands that result a result matrix :param query: Query statement/format string :param parameters: Optional dictionary used to format the query :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param query_formats: See QueryContext __init__ docstring + :param column_formats: See QueryContext __init__ docstring :param use_none: Use None for ClickHouse nulls instead of empty values + :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: QueryResult -- data and metadata from response """ + if context: + query_context = context.updated_copy(query, + parameters, + settings, + query_formats, + column_formats, + False) + else: + query_context = QueryContext(query, parameters, settings, query_formats, column_formats, use_none) + return self._query_with_context(query_context) @abstractmethod def raw_query(self, @@ -108,30 +127,54 @@ def raw_query(self, """ def query_np(self, - query: str, + query: str = None, parameters: Optional[Dict[str, Any]] = None, - settings: Optional[Dict[str, Any]] = None): + settings: Optional[Dict[str, Any]] = None, + query_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, str]] = None, + context: QueryContext = None): """ Query method that results the results as a numpy array :param query: Query statement/format string :param parameters: Optional dictionary used to format the query :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param query_formats: See QueryContext __init__ docstring + 
:param column_formats: See QueryContext __init__ docstring. + :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: Numpy array representing the result set """ - return np_result(self.query(query, parameters, settings, use_none=False)) + return np_result(self.query(query, + parameters, + settings, + query_formats, + column_formats, + False, + context)) def query_df(self, - query: str, + query: str = None, parameters: Optional[Dict[str, Any]] = None, - settings: Optional[Dict[str, Any]] = None): + settings: Optional[Dict[str, Any]] = None, + query_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, str]] = None, + context: QueryContext = None): """ Query method that results the results as a pandas dataframe :param query: Query statement/format string :param parameters: Optional dictionary used to format the query :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param query_formats: See QueryContext __init__ docstring + :param column_formats: See QueryContext __init__ docstring + :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: Numpy array representing the result set """ - return to_pandas_df(self.query(query, parameters, settings, use_none=False)) + return to_pandas_df(self.query(query, + parameters, + settings, + query_formats, + column_formats, + False, + context)) def query_arrow(self, query: str, diff --git a/clickhouse_connect/driver/common.py b/clickhouse_connect/driver/common.py index 4b448796..a0480ed9 100644 --- a/clickhouse_connect/driver/common.py +++ b/clickhouse_connect/driver/common.py @@ -172,4 +172,4 @@ def decimal_size(prec: int): def unescape_identifier(x: str) -> str: if x.startswith('`') and x.endswith('`'): return x[1:-1] - return x \ No newline at end of file + return x diff --git a/clickhouse_connect/driver/httpclient.py 
b/clickhouse_connect/driver/httpclient.py index fb6b081d..d4a4bf7a 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -14,7 +14,7 @@ from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError, ProgrammingError from clickhouse_connect.driver.httpadapter import KeepAliveAdapter from clickhouse_connect.driver.native import NativeTransform -from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value +from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value, QueryContext from clickhouse_connect.driver.rowbinary import RowBinaryTransform logger = logging.getLogger(__name__) @@ -148,18 +148,12 @@ def client_setting(self, name, value): value = '1' if value else '0' self.session.params[name] = str(value) - def query(self, query: str, - parameters: Optional[Dict[str, Any]] = None, - settings: Dict[str, Any] = None, - use_none: bool = True) -> QueryResult: - """ - See BaseClient doc_string for this method - """ - final_query = self._prep_query(query, parameters) + def _query_with_context(self, context: QueryContext) -> QueryResult: + final_query = self._prep_query(context.query, context.parameters) headers = {'Content-Type': 'text/plain; charset=utf-8'} params = {'database': self.database} - params.update(self._validate_settings(settings, True)) - if columns_only_re.search(query): + params.update(self._validate_settings(context.settings, True)) + if columns_only_re.search(final_query): response = self._raw_request(final_query + ' FORMAT JSON', params, headers, retries=2) json_result = json.loads(response.content) # ClickHouse will respond with a JSON object of meta, data, and some other objects @@ -172,7 +166,7 @@ def query(self, query: str, data_result = DataResult([], tuple(names), tuple(types)) else: response = self._raw_request(self._format_query(final_query), params, headers, retries=2) - data_result = 
self.transform.parse_response(response.content, use_none=use_none) + data_result = self.transform.parse_response(response.content, context) summary = {} if 'X-ClickHouse-Summary' in response.headers: try: diff --git a/clickhouse_connect/driver/native.py b/clickhouse_connect/driver/native.py index 3c738350..7356c2b9 100644 --- a/clickhouse_connect/driver/native.py +++ b/clickhouse_connect/driver/native.py @@ -18,6 +18,7 @@ def _transform_response(self, source: Sequence, context: QueryContext) -> DataRe result = [] total_size = len(source) block = 0 + use_none = context.use_none while loc < total_size: result_block = [] num_cols, loc = read_leb128(source, loc) @@ -32,7 +33,8 @@ def _transform_response(self, source: Sequence, context: QueryContext) -> DataRe col_types.append(col_type) else: col_type = col_types[col_num] - column, loc = col_type.read_native_column(source, loc, num_rows, use_none=context.use_none) + context.start_column(name, col_type) + column, loc = col_type.read_native_column(source, loc, num_rows, use_none=use_none) result_block.append(column) block += 1 result.extend(list(zip(*result_block))) diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index f37bf974..d41a6cc2 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -1,12 +1,15 @@ import ipaddress +import threading import uuid from enum import Enum -from typing import NamedTuple, Any, Tuple, Dict, Sequence +from typing import NamedTuple, Any, Tuple, Dict, Sequence, Optional, Union from datetime import date, datetime from pytz import UTC from clickhouse_connect.datatypes.base import ClickHouseType +from clickhouse_connect.datatypes.container import Array +from clickhouse_connect.datatypes.format import format_map from clickhouse_connect.driver.options import HAS_NUMPY, HAS_PANDAS, check_pandas, check_numpy, HAS_ARROW, check_arrow if HAS_PANDAS: @@ -19,10 +22,78 @@ import pyarrow +class QueryContext: + """ + 
Argument/parameter object for queries + """ + def __init__(self, + query: str = None, + parameters: Optional[Dict[str, Any]] = None, + settings: Optional[Dict[str, Any]] = None, + query_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, + use_none: bool = True): + self.query = query + self.parameters = parameters or {} + self.settings = settings or {} + self.query_formats = format_map(query_formats) + self.column_formats = column_formats or {} + self.use_none = use_none + + def updated_copy(self, + query: Optional[str] = None, + parameters: Optional[Dict[str, Any]] = None, + settings: Optional[Dict[str, Any]] = None, + query_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, str]] = None, + use_none: Optional[bool] = None) -> 'QueryContext': + copy = QueryContext() + copy.query = query or self.query + copy.parameters = self.parameters.update(parameters or {}) + copy.settings = self.settings.update(settings or {}) + copy.query_formats = self.query_formats.update(query_formats or {}) + copy.column_formats = self.column_formats.update(column_formats or {}) + copy.use_none = use_none if use_none is not None else self.use_none + return copy + + def __enter__(self): + if self.query_formats: + threading.local().ch_query_overrides = self.query_formats + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + t_local = threading.local() + if self.query_formats: + del t_local.ch_query_overrides + try: + del t_local.ch_column_overrides + except AttributeError: + pass + + def start_column(self, name: str, ch_type: ClickHouseType): + t_local = threading.local() + if name in self.column_formats: + fmts = self.column_formats[name] + if isinstance(fmts, str): + if isinstance(ch_type, Array): + fmt_map = {ch_type.element_type: fmts} + else: + fmt_map = {ch_type: fmts} + else: + fmt_map = format_map(fmts) + t_local.ch_column_overrides = fmt_map + else: + try: + del 
t_local.ch_column_overrides + except AttributeError: + pass + + class QueryResult(): """ Wrapper class for query return values and metadata """ + def __init__(self, result_set: Sequence[Sequence[Any]], column_names: Tuple[str, ...], column_types: Tuple[ClickHouseType, ...], query_id: str = None, summary: Dict[str, Any] = None): self.result_set = result_set @@ -115,5 +186,5 @@ def from_pandas_df(df: 'pa.DataFrame'): def to_arrow(content: bytes): check_arrow() - reader = pyarrow.RecordBatchFileReader(content) + reader = pyarrow.ipc.RecordBatchFileReader(content) return reader.read_all() diff --git a/clickhouse_connect/driver/transform.py b/clickhouse_connect/driver/transform.py index 16f2250a..9409329c 100644 --- a/clickhouse_connect/driver/transform.py +++ b/clickhouse_connect/driver/transform.py @@ -1,42 +1,24 @@ -import threading from abc import ABC, abstractmethod -from typing import Sequence, Dict, Union, Any, Optional +from typing import Sequence, Dict, Union, Any from clickhouse_connect.datatypes.base import ClickHouseType -from clickhouse_connect.datatypes.format import format_map -from clickhouse_connect.driver.query import DataResult +from clickhouse_connect.driver.query import DataResult, QueryContext -class QueryContext: - def __init__(self, use_none: bool, type_formats: Optional[Dict[str, str]], - _column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]]): - self.query_overrides = format_map(type_formats) - self.use_none = use_none - - def __enter__(self): - if self.query_overrides: - threading.local.ch_query_overrides = self.query_overrides - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.query_overrides: - del threading.local.ch_query_overrides +_EMPTY_CONTEXT = QueryContext() class DataTransform(ABC): - def parse_response(self, source: Sequence, type_formats: Dict[str, str] = None, use_none: bool = True, - column_formats: Dict[str, Union[str, Dict[str, str]]] = None) -> DataResult: + def parse_response(self, source: 
Sequence, context: QueryContext = _EMPTY_CONTEXT) -> DataResult: """ Decodes the ClickHouse byte buffer response into rows of native Python data :param source: A byte buffer or similar source - :param use_none: Use None python value for ClickHouse nulls (otherwise use type "zero value") - :param type_formats: Dictionary of ClickHouse type names/patterns and response formats - :param column_formats: Use None values for ClickHouse NULLs (otherwise use zero/empty values) + :param context: The QueryContext to use in processing the response :return: DataResult -- data matrix, column names, column types """ - with QueryContext(use_none, type_formats, column_formats) as query_context: - return self._transform_response(source, query_context) + with context: + return self._transform_response(source, context) @abstractmethod def build_insert(self, data: Sequence[Sequence[Any]], *, column_names: Sequence[str], diff --git a/clickhouse_connect/json_impl.py b/clickhouse_connect/json_impl.py index 5728520a..037d21e9 100644 --- a/clickhouse_connect/json_impl.py +++ b/clickhouse_connect/json_impl.py @@ -1,17 +1,44 @@ import logging +import json as py_json +from collections import OrderedDict +from typing import Any -logger = logging.getLogger(__name__) +try: + import orjson + any_to_json = orjson.dumps +except ImportError: + orjson = None try: - import orjson as json_impl - logger.info('Using orjson as the JSON implementation') + import ujson except ImportError: - try: - import ujson as json_impl - logger.info('Using ujson as the JSON implementation') - except ImportError: - import json as json_impl - logger.info('Using default JSON implementation') + ujson = None + + +def _pyjson_to_json(obj: Any) -> bytes: + return py_json.dumps(obj).encode() + + +logger = logging.getLogger(__name__) +_to_json = OrderedDict() +_to_json['orjson'] = orjson.dumps if orjson else None +_to_json['ujson'] = ujson.dumps if ujson else None +_to_json['python'] = _pyjson_to_json + +any_to_json = 
_pyjson_to_json + +def set_json_library(impl: str = None): + global any_to_json + if impl: + func = _to_json.get(impl) + if not func: + raise NotImplementedError(f'JSON library {impl} is not supported') + for library, func in _to_json.items(): + if func: + logger.info('Using %s library for writing JSON byte strings', library) + any_to_json = func + break +set_json_library() \ No newline at end of file diff --git a/setup.py b/setup.py index 12084cf8..46b7e956 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,8 @@ def run_setup(try_c: bool = True): 'superset': ['apache_superset>=1.4.1', 'sqlalchemy>1.3.21, <1.4'], 'numpy': ['numpy'], 'pandas': ['pandas'], - 'arrow': ['pyarrow'] + 'arrow': ['pyarrow'], + 'orjson': ['orjson'] }, entry_points={ 'sqlalchemy.dialects': ['clickhousedb.connect=clickhouse_connect.cc_sqlalchemy.dialect:ClickHouseDialect', diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py index 2602a8dc..fc89a8d8 100644 --- a/tests/integration_tests/test_native.py +++ b/tests/integration_tests/test_native.py @@ -1,5 +1,9 @@ +import uuid +from ipaddress import IPv4Address + import pytest +from clickhouse_connect.datatypes.format import set_default_formats, clear_default_format from clickhouse_connect.driver import Client @@ -12,7 +16,7 @@ def test_low_card(test_client: Client, test_table_engine: str): assert len(result.result_set) == 1 -def test_json_insert(test_client: Client, test_table_engine: str): +def test_json(test_client: Client, test_table_engine: str): if not test_client.min_version('22.6.1'): pytest.skip('JSON test skipped for old version {test_client.server_version}') test_client.command('DROP TABLE IF EXISTS native_json_test') @@ -33,4 +37,25 @@ def test_json_insert(test_client: Client, test_table_engine: str): def test_read_formats(test_client: Client, test_table_engine: str): - pass + test_client.command('DROP TABLE IF EXISTS read_format_test') + test_client.command('CREATE TABLE read_format_test (key 
Int32, uuid UUID, fs FixedString(10), ipv4 IPv4)' + + f'Engine {test_table_engine} ORDER BY key') + uuid1 = uuid.UUID('23E45688e89B-12D3-3273-426614174000') + uuid2 = uuid.UUID('77AA3278-3728-12d3-5372-000377723832') + row1 = (1, uuid1, '530055777k', '10.251.30.50') + row2 = (2, uuid2, 'short str', '10.44.75.20') + test_client.insert('read_format_test', [row1, row2]) + result = test_client.query('SELECT * FROM read_format_test').result_set + assert result[0][1] == uuid1 + assert result[1][3] == IPv4Address('10.44.75.20') + assert result[0][2] == b'\x35\x33\x30\x30\x35\x35\x37\x37\x37\x6b' + set_default_formats('uuid', 'string', 'ip*', 'string', 'FixedString', 'string') + result = test_client.query('SELECT * FROM read_format_test').result_set + assert result[0][1] == '23e45688-e89b-12d3-3273-426614174000' + assert result[1][3] == '10.44.75.20' + assert result[0][2] == '530055777k' + clear_default_format('ipv4') + result = test_client.query('SELECT * FROM read_format_test').result_set + assert result[0][1] == '23e45688-e89b-12d3-3273-426614174000' + assert result[1][3] == IPv4Address('10.44.75.20') + assert result[0][2] == '530055777k' From cf7b2643fb91c909372de503ee24f556da46d254 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 05:10:23 -0600 Subject: [PATCH 14/25] Fix lint --- clickhouse_connect/driver/transform.py | 2 +- clickhouse_connect/json_impl.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clickhouse_connect/driver/transform.py b/clickhouse_connect/driver/transform.py index 9409329c..c52686ff 100644 --- a/clickhouse_connect/driver/transform.py +++ b/clickhouse_connect/driver/transform.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Sequence, Dict, Union, Any +from typing import Sequence, Any from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.query import DataResult, QueryContext diff --git a/clickhouse_connect/json_impl.py 
b/clickhouse_connect/json_impl.py index 037d21e9..c0c2e0df 100644 --- a/clickhouse_connect/json_impl.py +++ b/clickhouse_connect/json_impl.py @@ -29,7 +29,7 @@ def _pyjson_to_json(obj: Any) -> bytes: def set_json_library(impl: str = None): - global any_to_json + global any_to_json # pylint: disable=global-statement if impl: func = _to_json.get(impl) if not func: @@ -41,4 +41,4 @@ def set_json_library(impl: str = None): break -set_json_library() \ No newline at end of file +set_json_library() From 6081c15effd1d6a027abffffc82a7ab28dbbc4de Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 10:19:58 -0600 Subject: [PATCH 15/25] Formatting fixes plus more format tests --- clickhouse_connect/datatypes/base.py | 11 ++++---- clickhouse_connect/datatypes/container.py | 3 +++ clickhouse_connect/driver/query.py | 21 +++++++-------- clickhouse_connect/driver/threads.py | 5 ++++ tests/integration_tests/test_native.py | 33 ++++++++++++++++++----- 5 files changed, 50 insertions(+), 23 deletions(-) create mode 100644 clickhouse_connect/driver/threads.py diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index 3bccf150..ca0427e9 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -1,5 +1,4 @@ import array -import threading import logging from abc import abstractmethod, ABC @@ -9,6 +8,7 @@ from clickhouse_connect.driver.common import array_column, array_type, int_size, read_uint64, write_array, \ write_uint64, low_card_version from clickhouse_connect.driver.exceptions import NotSupportedError +from clickhouse_connect.driver.threads import query_settings logger = logging.getLogger(__name__) ch_read_formats = {} @@ -53,11 +53,10 @@ def build(cls: Type['ClickHouseType'], type_def: TypeDef): @classmethod def _active_format(cls, fmt_map: Dict[Type['ClickHouseType'], str]): - t_local = threading.local() - overrides = getattr(t_local, 'ch_column_overrides', None) + overrides = 
getattr(query_settings, 'column_overrides', None) if overrides and cls in overrides: return overrides[cls] - overrides = getattr(t_local, 'ch_query_overrides)', None) + overrides = getattr(query_settings, 'query_overrides', None) if overrides and cls in overrides: return overrides[cls] return fmt_map.get(cls, 'native') @@ -102,10 +101,10 @@ def name(self): @property def encoding(self): - override = getattr(threading.local(), 'ch_column_encoding', None) + override = getattr(query_settings, 'column_encoding', None) if override: return override - override = getattr(threading.local(), 'ch_query_encoding', None) + override = getattr(query_settings, 'query_encoding', None) if override: return override return self._encoding diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index 930310ef..0148543f 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -126,6 +126,9 @@ def read_native_data(self, source: Sequence, loc: int, num_rows: int, use_none=T for ix, x in enumerate(dicts): for y, key in enumerate(e_names): x[key] = columns[y][ix] + if self.read_format() == 'json': + to_json = json_impl.any_to_json + return [to_json(x) for x in dicts], loc return dicts, loc return tuple(zip(*columns)), loc diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index d41a6cc2..9287d774 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -1,5 +1,4 @@ import ipaddress -import threading import uuid from enum import Enum @@ -11,6 +10,7 @@ from clickhouse_connect.datatypes.container import Array from clickhouse_connect.datatypes.format import format_map from clickhouse_connect.driver.options import HAS_NUMPY, HAS_PANDAS, check_pandas, check_numpy, HAS_ARROW, check_arrow +from clickhouse_connect.driver.threads import query_settings if HAS_PANDAS: import pandas as pa @@ -39,6 +39,7 @@ def __init__(self, self.query_formats = 
format_map(query_formats) self.column_formats = column_formats or {} self.use_none = use_none + self.thread_local = None def updated_copy(self, query: Optional[str] = None, @@ -58,38 +59,36 @@ def updated_copy(self, def __enter__(self): if self.query_formats: - threading.local().ch_query_overrides = self.query_formats + query_settings.query_overrides = self.query_formats return self def __exit__(self, exc_type, exc_val, exc_tb): - t_local = threading.local() if self.query_formats: - del t_local.ch_query_overrides + del query_settings.query_overrides try: - del t_local.ch_column_overrides + del query_settings.column_overrides except AttributeError: pass def start_column(self, name: str, ch_type: ClickHouseType): - t_local = threading.local() if name in self.column_formats: fmts = self.column_formats[name] if isinstance(fmts, str): if isinstance(ch_type, Array): - fmt_map = {ch_type.element_type: fmts} + fmt_map = {ch_type.element_type.__class__: fmts} else: - fmt_map = {ch_type: fmts} + fmt_map = {ch_type.__class__: fmts} else: fmt_map = format_map(fmts) - t_local.ch_column_overrides = fmt_map + query_settings.column_overrides = fmt_map else: try: - del t_local.ch_column_overrides + del query_settings.column_overrides except AttributeError: pass -class QueryResult(): +class QueryResult: """ Wrapper class for query return values and metadata """ diff --git a/clickhouse_connect/driver/threads.py b/clickhouse_connect/driver/threads.py new file mode 100644 index 00000000..9166ac1d --- /dev/null +++ b/clickhouse_connect/driver/threads.py @@ -0,0 +1,5 @@ +import threading + +query_settings = threading.local() + + diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py index fc89a8d8..47c42333 100644 --- a/tests/integration_tests/test_native.py +++ b/tests/integration_tests/test_native.py @@ -1,5 +1,5 @@ import uuid -from ipaddress import IPv4Address +from ipaddress import IPv4Address, IPv6Address import pytest @@ -38,24 +38,45 @@ def 
test_json(test_client: Client, test_table_engine: str): def test_read_formats(test_client: Client, test_table_engine: str): test_client.command('DROP TABLE IF EXISTS read_format_test') - test_client.command('CREATE TABLE read_format_test (key Int32, uuid UUID, fs FixedString(10), ipv4 IPv4)' + - f'Engine {test_table_engine} ORDER BY key') + test_client.command('CREATE TABLE read_format_test (key Int32, uuid UUID, fs FixedString(10), ipv4 IPv4,' + + f'str_array Array(IPv6)) Engine {test_table_engine} ORDER BY key') uuid1 = uuid.UUID('23E45688e89B-12D3-3273-426614174000') uuid2 = uuid.UUID('77AA3278-3728-12d3-5372-000377723832') - row1 = (1, uuid1, '530055777k', '10.251.30.50') - row2 = (2, uuid2, 'short str', '10.44.75.20') + row1 = (1, uuid1, '530055777k', '10.251.30.50', ['2600::', '2001:4860:4860::8844']) + row2 = (2, uuid2, 'short str', '10.44.75.20', ['74:382::3332', '8700:5200::5782:3992']) test_client.insert('read_format_test', [row1, row2]) + result = test_client.query('SELECT * FROM read_format_test').result_set assert result[0][1] == uuid1 assert result[1][3] == IPv4Address('10.44.75.20') assert result[0][2] == b'\x35\x33\x30\x30\x35\x35\x37\x37\x37\x6b' + set_default_formats('uuid', 'string', 'ip*', 'string', 'FixedString', 'string') result = test_client.query('SELECT * FROM read_format_test').result_set assert result[0][1] == '23e45688-e89b-12d3-3273-426614174000' assert result[1][3] == '10.44.75.20' assert result[0][2] == '530055777k' - clear_default_format('ipv4') + assert result[0][4][1] == '2001:4860:4860::8844' + + clear_default_format('ip*') result = test_client.query('SELECT * FROM read_format_test').result_set assert result[0][1] == '23e45688-e89b-12d3-3273-426614174000' assert result[1][3] == IPv4Address('10.44.75.20') + assert result[0][4][1] == IPv6Address('2001:4860:4860::8844') assert result[0][2] == '530055777k' + + result = test_client.query('SELECT * FROM read_format_test', query_formats={'IP*': 'string'}).result_set + assert result[1][3] 
== '10.44.75.20' + + # Ensure that the query format clears + result = test_client.query('SELECT * FROM read_format_test').result_set + assert result[1][3] == IPv4Address('10.44.75.20') + + result = test_client.query('SELECT * FROM read_format_test', column_formats={'ipv4': 'string'}).result_set + assert result[1][3] == '10.44.75.20' + + # Ensure that the column format clears + result = test_client.query('SELECT * FROM read_format_test').result_set + assert result[1][3] == IPv4Address('10.44.75.20') + + From 0ffb37661c5d0f40f4c515a703cba8641dc1ddd6 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 10:24:13 -0600 Subject: [PATCH 16/25] Fix lint --- clickhouse_connect/driver/threads.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/clickhouse_connect/driver/threads.py b/clickhouse_connect/driver/threads.py index 9166ac1d..f856468e 100644 --- a/clickhouse_connect/driver/threads.py +++ b/clickhouse_connect/driver/threads.py @@ -1,5 +1,3 @@ import threading query_settings = threading.local() - - From b866f5eaa4bde6265fc620005e35288afe3a8ed4 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 10:30:04 -0600 Subject: [PATCH 17/25] Fix lint --- tests/integration_tests/test_native.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py index 47c42333..6d1ab27c 100644 --- a/tests/integration_tests/test_native.py +++ b/tests/integration_tests/test_native.py @@ -78,5 +78,3 @@ def test_read_formats(test_client: Client, test_table_engine: str): # Ensure that the column format clears result = test_client.query('SELECT * FROM read_format_test').result_set assert result[1][3] == IPv4Address('10.44.75.20') - - From 5b215f03eb5d4d72d41badce2c91c6e9a5c9aa76 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 12:29:45 -0600 Subject: [PATCH 18/25] Expand read format tests --- clickhouse_connect/datatypes/container.py | 10 ++++++- clickhouse_connect/driver/client.py | 2 
+- clickhouse_connect/driver/query.py | 2 +- tests/integration_tests/test_native.py | 32 ++++++++++++++++++----- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py index 0148543f..7863227f 100644 --- a/clickhouse_connect/datatypes/container.py +++ b/clickhouse_connect/datatypes/container.py @@ -86,7 +86,7 @@ def write_native_data(self, column: Sequence, dest: MutableSequence): class Tuple(ClickHouseType): _slots = 'element_names', 'element_types', 'from_rb_funcs', 'to_rb_funcs' - python_type = tuple + valid_formats = 'tuple', 'json', 'native' # native is 'tuple' for unnamed tuples, and dict for named tuples def __init__(self, type_def: TypeDef): super().__init__(type_def) @@ -99,6 +99,14 @@ def __init__(self, type_def: TypeDef): else: self._name_suffix = type_def.arg_str + @property + def python_type(self): + if self.read_format() == 'tuple': + return tuple + if self.read_format() == 'json': + return str + return dict + def _from_row_binary(self, source: bytes, loc: int): values = [] for conv in self.from_rb_funcs: diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 5e538c17..d93a6f1c 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -86,7 +86,7 @@ def query(self, parameters: Optional[Dict[str, Any]] = None, settings: Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, - column_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, use_none: bool = True, context: QueryContext = None) -> QueryResult: """ diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index 9287d774..972052b0 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -46,7 +46,7 @@ def updated_copy(self, parameters: Optional[Dict[str, Any]] = None, settings: 
Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, - column_formats: Optional[Dict[str, str]] = None, + column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, use_none: Optional[bool] = None) -> 'QueryContext': copy = QueryContext() copy.query = query or self.query diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py index 6d1ab27c..f5caed39 100644 --- a/tests/integration_tests/test_native.py +++ b/tests/integration_tests/test_native.py @@ -3,7 +3,7 @@ import pytest -from clickhouse_connect.datatypes.format import set_default_formats, clear_default_format +from clickhouse_connect.datatypes.format import set_default_formats, clear_default_format, set_read_format from clickhouse_connect.driver import Client @@ -39,17 +39,20 @@ def test_json(test_client: Client, test_table_engine: str): def test_read_formats(test_client: Client, test_table_engine: str): test_client.command('DROP TABLE IF EXISTS read_format_test') test_client.command('CREATE TABLE read_format_test (key Int32, uuid UUID, fs FixedString(10), ipv4 IPv4,' + - f'str_array Array(IPv6)) Engine {test_table_engine} ORDER BY key') + 'ip_array Array(IPv6), tup Tuple(u1 UInt64, ip2 IPv4))' + + f'Engine {test_table_engine} ORDER BY key') uuid1 = uuid.UUID('23E45688e89B-12D3-3273-426614174000') uuid2 = uuid.UUID('77AA3278-3728-12d3-5372-000377723832') - row1 = (1, uuid1, '530055777k', '10.251.30.50', ['2600::', '2001:4860:4860::8844']) - row2 = (2, uuid2, 'short str', '10.44.75.20', ['74:382::3332', '8700:5200::5782:3992']) + row1 = (1, uuid1, '530055777k', '10.251.30.50', ['2600::', '2001:4860:4860::8844'], (7372, '10.20.30.203')) + row2 = (2, uuid2, 'short str', '10.44.75.20', ['74:382::3332', '8700:5200::5782:3992'], (7320, '252.18.4.50')) test_client.insert('read_format_test', [row1, row2]) result = test_client.query('SELECT * FROM read_format_test').result_set assert result[0][1] == uuid1 assert result[1][3] == 
IPv4Address('10.44.75.20') assert result[0][2] == b'\x35\x33\x30\x30\x35\x35\x37\x37\x37\x6b' + assert result[0][5]['u1'] == 7372 + assert result[0][5]['ip2'] == IPv4Address('10.20.30.203') set_default_formats('uuid', 'string', 'ip*', 'string', 'FixedString', 'string') result = test_client.query('SELECT * FROM read_format_test').result_set @@ -65,16 +68,33 @@ def test_read_formats(test_client: Client, test_table_engine: str): assert result[0][4][1] == IPv6Address('2001:4860:4860::8844') assert result[0][2] == '530055777k' - result = test_client.query('SELECT * FROM read_format_test', query_formats={'IP*': 'string'}).result_set + # Test query formats + result = test_client.query('SELECT * FROM read_format_test', query_formats={'IP*': 'string', + 'tup': 'json'}).result_set assert result[1][3] == '10.44.75.20' + assert result[0][5] == b'{"u1":7372,"ip2":"10.20.30.203"}' # Ensure that the query format clears result = test_client.query('SELECT * FROM read_format_test').result_set assert result[1][3] == IPv4Address('10.44.75.20') + assert result[0][5]['ip2'] == IPv4Address('10.20.30.203') - result = test_client.query('SELECT * FROM read_format_test', column_formats={'ipv4': 'string'}).result_set + # Test column formats + result = test_client.query('SELECT * FROM read_format_test', column_formats={'ipv4': 'string', + 'tup': 'tuple'}).result_set assert result[1][3] == '10.44.75.20' + assert result[0][5][1] == IPv4Address('10.20.30.203') # Ensure that the column format clears result = test_client.query('SELECT * FROM read_format_test').result_set assert result[1][3] == IPv4Address('10.44.75.20') + assert result[0][5]['ip2'] == IPv4Address('10.20.30.203') + + # Test sub column formats + set_read_format('tuple', 'tuple') + result = test_client.query('SELECT * FROM read_format_test', column_formats={'tup' : {'ip*': 'string'}}).result_set + assert result[0][5][1] == '10.20.30.203' + + set_read_format('tuple', 'native') + result = test_client.query('SELECT * FROM 
read_format_test', column_formats={'tup': {'ip*': 'string'}}).result_set + assert result[0][5]['ip2'] == '10.20.30.203' From 19308151bb6808615dc3e546c5ea658a2fe40fc0 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 1 Aug 2022 13:37:01 -0600 Subject: [PATCH 19/25] Fix lint, remove spaces from python JSON serialization --- clickhouse_connect/driver/client.py | 5 ++++- clickhouse_connect/driver/query.py | 2 ++ clickhouse_connect/json_impl.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index d93a6f1c..379911bc 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -81,6 +81,7 @@ def client_setting(self, name, value): :param value: Setting value """ + # pylint: disable=duplicate-code def query(self, query: str = None, parameters: Optional[Dict[str, Any]] = None, @@ -126,6 +127,7 @@ def raw_query(self, :return: bytes representing raw ClickHouse return value based on format """ + # pylint: disable=duplicate-code def query_np(self, query: str = None, parameters: Optional[Dict[str, Any]] = None, @@ -134,7 +136,7 @@ def query_np(self, column_formats: Optional[Dict[str, str]] = None, context: QueryContext = None): """ - Query method that results the results as a numpy array + Query method that returns the results as a numpy array :param query: Query statement/format string :param parameters: Optional dictionary used to format the query :param settings: Optional dictionary of ClickHouse settings (key/string values) @@ -151,6 +153,7 @@ def query_np(self, False, context)) + # pylint: disable=duplicate-code def query_df(self, query: str = None, parameters: Optional[Dict[str, Any]] = None, diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index 972052b0..ae56b620 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -26,6 +26,8 @@ class QueryContext: """ 
Argument/parameter object for queries """ + + # pylint: disable=duplicate-code def __init__(self, query: str = None, parameters: Optional[Dict[str, Any]] = None, diff --git a/clickhouse_connect/json_impl.py b/clickhouse_connect/json_impl.py index c0c2e0df..5576bb2e 100644 --- a/clickhouse_connect/json_impl.py +++ b/clickhouse_connect/json_impl.py @@ -16,7 +16,7 @@ def _pyjson_to_json(obj: Any) -> bytes: - return py_json.dumps(obj).encode() + return py_json.dumps(obj, separators=(',', ':')).encode() logger = logging.getLogger(__name__) From 353c6cdceee220c8c8f29127c735ca5ad2cb6910 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 2 Aug 2022 13:59:30 -0600 Subject: [PATCH 20/25] format doc checkpoint --- clickhouse_connect/datatypes/base.py | 9 ++--- clickhouse_connect/driver/query.py | 53 +++++++++++++++++++++------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/clickhouse_connect/datatypes/base.py b/clickhouse_connect/datatypes/base.py index ca0427e9..1de56c8b 100644 --- a/clickhouse_connect/datatypes/base.py +++ b/clickhouse_connect/datatypes/base.py @@ -101,13 +101,8 @@ def name(self): @property def encoding(self): - override = getattr(query_settings, 'column_encoding', None) - if override: - return override - override = getattr(query_settings, 'query_encoding', None) - if override: - return override - return self._encoding + query_encoding = getattr(query_settings, 'query_encoding', None) + return query_encoding or self._encoding def write_native_prefix(self, dest: MutableSequence): """ diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index ae56b620..cd9bccf7 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -24,7 +24,7 @@ class QueryContext: """ - Argument/parameter object for queries + Argument/parameter object for queries. 
This context is used to set thread/query specific formats """ # pylint: disable=duplicate-code @@ -35,9 +35,30 @@ def __init__(self, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, use_none: bool = True): + """ + Initializes various configuration settings for the query context + + :param query: Query string with Python style format value replacements + :param parameters: Optional dictionary of substitution values + :param settings: Optional ClickHouse settings for the query + :param query_formats: Optional dictionary of query formats with the key of a ClickHouse type name + (with * wildcards) and a value of valid query formats for those types. + The value 'encoding' can be sent to change the expected encoding for this query, with a value of + the desired encoding such as `latin-1` + :param column_formats: Optional dictionary of column specific formats. The key is the column name, + The value is either the format for the data column (such as 'string' for a UUID column) or a + second level "format" dictionary of a ClickHouse type name and a value of query formats. 
This + secondary dictionary can be used for nested column types such as Tuples or Maps + :param column_formats: Optional dictionary + :param use_none: + """ self.query = query self.parameters = parameters or {} self.settings = settings or {} + if query_formats: + self.encoding = query_formats.pop('encoding', None) + else: + self.encoding = None self.query_formats = format_map(query_formats) self.column_formats = column_formats or {} self.use_none = use_none @@ -50,27 +71,36 @@ def updated_copy(self, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, use_none: Optional[bool] = None) -> 'QueryContext': + """ + Creates + :param query: + :param parameters: + :param settings: + :param query_formats: + :param column_formats: + :param use_none: + :return: + """ copy = QueryContext() copy.query = query or self.query copy.parameters = self.parameters.update(parameters or {}) copy.settings = self.settings.update(settings or {}) + if query_formats: + copy.encoding = self.encoding or query_formats.pop('encoding', None) copy.query_formats = self.query_formats.update(query_formats or {}) copy.column_formats = self.column_formats.update(column_formats or {}) copy.use_none = use_none if use_none is not None else self.use_none return copy def __enter__(self): - if self.query_formats: - query_settings.query_overrides = self.query_formats + query_settings.query_overrides = self.query_formats + query_settings.query_encoding = self.encoding return self def __exit__(self, exc_type, exc_val, exc_tb): - if self.query_formats: - del query_settings.query_overrides - try: - del query_settings.column_overrides - except AttributeError: - pass + query_settings.query_overrides = None + query_settings.column_overrides = None + query_settings.query_encoding = None def start_column(self, name: str, ch_type: ClickHouseType): if name in self.column_formats: @@ -84,10 +114,7 @@ def start_column(self, name: str, ch_type: 
ClickHouseType): fmt_map = format_map(fmts) query_settings.column_overrides = fmt_map else: - try: - del query_settings.column_overrides - except AttributeError: - pass + query_settings.column_overrides = None class QueryResult: From e2a1f4f4b9bb6dd241ce1e213fd3d36cdba72ac1 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 2 Aug 2022 14:31:11 -0600 Subject: [PATCH 21/25] Lint fix --- clickhouse_connect/driver/query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index cd9bccf7..5db46cd4 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -22,6 +22,7 @@ import pyarrow +# pylint: disable=too-many-instance-attributes class QueryContext: """ Argument/parameter object for queries. This context is used to set thread/query specific formats From cfec41e10f85e4c3e15d2b983aef12da351307d1 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 3 Aug 2022 03:42:00 -0600 Subject: [PATCH 22/25] Fix SQL comment related issues --- clickhouse_connect/driver/httpclient.py | 13 ++++++------ clickhouse_connect/driver/query.py | 23 +++++++++++++++++++++ tests/integration_tests/test_client.py | 3 ++- tests/unit_tests/test_driver/test_parser.py | 15 ++++++++++++++ 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index d4a4bf7a..97ed2fa0 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -14,7 +14,8 @@ from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError, ProgrammingError from clickhouse_connect.driver.httpadapter import KeepAliveAdapter from clickhouse_connect.driver.native import NativeTransform -from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value, QueryContext +from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value, QueryContext, \ 
+ remove_sql_comments from clickhouse_connect.driver.rowbinary import RowBinaryTransform logger = logging.getLogger(__name__) @@ -136,11 +137,11 @@ def __init__(self, self.session.params = self._validate_settings(settings, True) def _format_query(self, query: str) -> str: - query = query.strip() - if query.upper().startswith('INSERT ') and 'VALUES' in query.upper(): - return query - if not query.endswith(self.read_format): - query += f' FORMAT {self.read_format}' + uncommented_query = remove_sql_comments(query) + if uncommented_query.upper().startswith('INSERT ') and 'VALUES' in query.upper(): + return query # Don't format the output of INSERT statements + if not uncommented_query.endswith(self.read_format): + query += f'\nFORMAT {self.read_format}' return query def client_setting(self, name, value): diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index 5db46cd4..5278bedf 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -1,4 +1,5 @@ import ipaddress +import re import uuid from enum import Enum @@ -182,6 +183,28 @@ def format_query_value(value, server_tz=UTC): return str(value) +comment_re = re.compile(r"(\".*?\"|\'.*?\')|(/\*.*?\*/|(--\s)[^\n]*$)", re.MULTILINE | re.DOTALL) + + +def remove_sql_comments(sql: str) -> str: + """ + Remove SQL comments. 
This is useful to determine the type of SQL query, such as SELECT or INSERT, but we + don't fully trust it to correctly ignore weird quoted strings, and other edge cases, so we always pass the + original SQL to ClickHouse (which uses a full-fledged AST/ token parser) + :param sql: SQL query + :return: SQL Query without SQL comments + """ + def replacer(match): + # if the 2nd group (capturing comments) is not None, it means we have captured a + # non-quoted, actual comment string, so return nothing to remove the comment + if match.group(2): + return '' + # Otherwise we've actually captured a quoted string, so return it + return match.group(1) + + return comment_re.sub(replacer, sql) + + def np_result(result: QueryResult) -> 'np.array': """ Convert QueryResult to a numpy array diff --git a/tests/integration_tests/test_client.py b/tests/integration_tests/test_client.py index ac6525af..cfcfa46f 100644 --- a/tests/integration_tests/test_client.py +++ b/tests/integration_tests/test_client.py @@ -84,7 +84,8 @@ def test_query_with_inline_comment(test_client: Client): result = test_client.query(""" SELECT * -- This is just a comment - FROM system.tables + FROM system.tables LIMIT 77 + -- A second comment """) assert len(result.result_set) > 0 diff --git a/tests/unit_tests/test_driver/test_parser.py b/tests/unit_tests/test_driver/test_parser.py index a4065c0b..cc5bab1f 100644 --- a/tests/unit_tests/test_driver/test_parser.py +++ b/tests/unit_tests/test_driver/test_parser.py @@ -1,4 +1,5 @@ from clickhouse_connect.driver.parser import parse_callable, parse_enum +from clickhouse_connect.driver.query import remove_sql_comments def test_parse_callable(): @@ -13,3 +14,17 @@ def test_parse_callable(): def test_parse_enum(): assert parse_enum("Enum8('one' = 1)") == (('one',), (1,)) assert parse_enum("Enum16('**\\'5' = 5, '578' = 7)") == (("**'5", '578'), (5, 7)) + + +def test_remove_comments(): + sql = """SELECT -- 6dcd92a04feb50f14bbcf07c661680ba +* FROM benchmark_results /*With 
an inline comment */ WHERE result = 'True' +/* A single line */ +LIMIT +/* A multiline comment + +*/ +2 +-- 6dcd92a04feb50f14bbcf07c661680ba +""" + assert remove_sql_comments(sql) == "SELECT \n* FROM benchmark_results WHERE result = 'True'\n\nLIMIT\n\n2\n\n" From 70d2b2d981da31404acfd1cee73f6fbff6aab0ec Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 4 Aug 2022 05:49:32 -0600 Subject: [PATCH 23/25] Fix comment and sqlalchemy issues --- CHANGELOG.md | 2 +- README.md | 7 +- .../cc_sqlalchemy/datatypes/base.py | 38 ++++++--- .../cc_sqlalchemy/sql/__init__.py | 9 +-- .../cc_sqlalchemy/sql/preparer.py | 15 +++- clickhouse_connect/datatypes/network.py | 4 + clickhouse_connect/datatypes/special.py | 6 +- clickhouse_connect/driver/client.py | 41 +++++++--- clickhouse_connect/driver/common.py | 2 - clickhouse_connect/driver/httpclient.py | 34 ++++---- clickhouse_connect/driver/parser.py | 3 +- clickhouse_connect/driver/query.py | 81 ++++++++++++------- tests/unit_tests/test_sqlalchemy/test_ddl.py | 4 +- 13 files changed, 155 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96a786b1..f8933401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## ClickHouse Connect ChangeLog -### Release 0.1.7, 2022-07-28 +### Release 0.2.0, 2022-08-04 #### Improvements diff --git a/README.md b/README.md index dc27a96b..f664cb43 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,15 @@ ClickHouse HTTP interface. ### Installation + +``` +pip install clickhouse-connect +``` + ClickHouse Connect requires Python 3.7 or higher. The `cython` package must be installed prior to installing `clickhouse_connect` to build and install the optional Cython/C extensions used for improving read and write performance using the ClickHouse Native format. After installing cython if desired, clone this repository and -run `python setup.py install`from the project directory. +run `python setup.py install`from the project directory. 
### Getting Started diff --git a/clickhouse_connect/cc_sqlalchemy/datatypes/base.py b/clickhouse_connect/cc_sqlalchemy/datatypes/base.py index eaac7600..bd7a8710 100644 --- a/clickhouse_connect/cc_sqlalchemy/datatypes/base.py +++ b/clickhouse_connect/cc_sqlalchemy/datatypes/base.py @@ -5,14 +5,15 @@ from clickhouse_connect.datatypes.base import ClickHouseType, TypeDef, EMPTY_TYPE_DEF from clickhouse_connect.datatypes.registry import parse_name, type_map +from clickhouse_connect.driver.query import format_query_value logger = logging.getLogger(__name__) class ChSqlaType: """ - A SQLAlchemy TypeEngine that wraps a ClickHouseType. We don't extend TypeEngine directly, instead all concrete subclasses - will inherit from TypeEngine + A SQLAlchemy TypeEngine that wraps a ClickHouseType. We don't extend TypeEngine directly, instead all concrete + subclasses will inherit from TypeEngine. """ ch_type: ClickHouseType = None generic_type: None @@ -22,7 +23,8 @@ class ChSqlaType: def __init_subclass__(cls): """ - Registers ChSqla type in the type map and sets the underlying ClickHouseType class to use to initialize ChSqlaType instances + Registers ChSqla type in the type map and sets the underlying ClickHouseType class to use to initialize + ChSqlaType instances """ base = cls.__name__ if not cls._ch_type_cls: @@ -47,10 +49,10 @@ def build(cls, type_def: TypeDef): def __init__(self, type_def: TypeDef = EMPTY_TYPE_DEF): """ Basic constructor that does nothing but set the wrapped ClickHouseType. It is overridden in some cases - to add specific SqlAlchemy behavior when constructing subclasses "by hand", in which case the type_def parameter is - normally set to None and other keyword parameters used for construction - :param type_def: TypeDef tuple used to build the underlying ClickHouseType. 
This is normally populated by the parse_name - function + to add specific SqlAlchemy behavior when constructing subclasses "by hand", in which case the type_def + parameter is normally set to None and other keyword parameters used for construction + :param type_def: TypeDef tuple used to build the underlying ClickHouseType. This is normally populated by the + parse_name function """ self.type_def = type_def self.ch_type = self._ch_type_cls.build(type_def) @@ -74,23 +76,33 @@ def low_card(self): @staticmethod def result_processor(): """ - Override for the SqlAlchemy TypeEngine result_processor method, which is used to convert row values to the correct Python type - The core driver handles this automatically, so we always return None + Override for the SqlAlchemy TypeEngine result_processor method, which is used to convert row values to the + correct Python type. The core driver handles this automatically, so we always return None. """ return None @staticmethod def _cached_result_processor(*_): """ - Override for the SqlAlchemy TypeEngine _cached_result_processor method to prevent weird behavior when SQLAlchemy tries to cache + Override for the SqlAlchemy TypeEngine _cached_result_processor method to prevent weird behavior + when SQLAlchemy tries to cache. """ return None + @staticmethod + def _cached_literal_processor(*_): + """ + Override for the SqlAlchemy TypeEngine _cached_literal_processor. We delegate to the driver format_query_value + method and should be able to ignore literal_processor definitions in the dialect, which are verbose and + confusing. + """ + return format_query_value + def _compiler_dispatch(self, _visitor, **_): """ - Override for the SqlAlchemy TypeEngine _compiler_dispatch method to sidestep unnecessary layers and complexity when generating - the type name. 
The underlying ClickHouseType generates the correct name - :return: Name generated by the underlying driver + Override for the SqlAlchemy TypeEngine _compiler_dispatch method to sidestep unnecessary layers and complexity + when generating the type name. The underlying ClickHouseType generates the correct name + :return: Name generated by the underlying driver. """ return self.name diff --git a/clickhouse_connect/cc_sqlalchemy/sql/__init__.py b/clickhouse_connect/cc_sqlalchemy/sql/__init__.py index e4040382..1358d747 100644 --- a/clickhouse_connect/cc_sqlalchemy/sql/__init__.py +++ b/clickhouse_connect/cc_sqlalchemy/sql/__init__.py @@ -1,17 +1,10 @@ from typing import Optional from sqlalchemy import Table -from sqlalchemy.sql.compiler import RESERVED_WORDS - -from clickhouse_connect.driver.common import identifier_re - -reserved_words = RESERVED_WORDS | set('index') def quote_id(v: str) -> str: - if v in reserved_words or not identifier_re.match(v): - return f'`{v}`' - return v + return f'`{v}`' def full_table(table_name: str, schema: Optional[str] = None) -> str: diff --git a/clickhouse_connect/cc_sqlalchemy/sql/preparer.py b/clickhouse_connect/cc_sqlalchemy/sql/preparer.py index 520e4b08..5337f657 100644 --- a/clickhouse_connect/cc_sqlalchemy/sql/preparer.py +++ b/clickhouse_connect/cc_sqlalchemy/sql/preparer.py @@ -1,5 +1,18 @@ from sqlalchemy.sql.compiler import IdentifierPreparer +from clickhouse_connect.cc_sqlalchemy.sql import quote_id + class ChIdentifierPreparer(IdentifierPreparer): - pass + + quote_identifier = staticmethod(quote_id) + + def normalize_name(self, name): + return name + + def denormalize_name(self, name): + return name + + def _requires_quotes(self, _value): + return True + diff --git a/clickhouse_connect/datatypes/network.py b/clickhouse_connect/datatypes/network.py index 503e2211..352835bb 100644 --- a/clickhouse_connect/datatypes/network.py +++ b/clickhouse_connect/datatypes/network.py @@ -82,6 +82,10 @@ class IPv6(ClickHouseType): def 
python_type(self): return str if self.read_format() == 'string' else IPv6Address + @property + def np_type(self): + return 'U' if self.read_format() == 'string' else 'O' + @property def python_null(self): return '' if self.read_format() == 'string' else V6_NULL diff --git a/clickhouse_connect/datatypes/special.py b/clickhouse_connect/datatypes/special.py index a9bfb924..78425e01 100644 --- a/clickhouse_connect/datatypes/special.py +++ b/clickhouse_connect/datatypes/special.py @@ -13,7 +13,11 @@ class UUID(ClickHouseType): @property def python_null(self): - return PYUUID(int=0) if self.read_format() == 'uuid' else '' + return '' if self.read_format() == 'string' else PYUUID(0) + + @property + def np_type(self): + return 'U' if self.read_format() == 'string' else 'O' def _from_row_binary(self, source: bytearray, loc: int): int_high, loc = read_uint64(source, loc) diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 379911bc..ee1ea201 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -1,8 +1,9 @@ import logging -import re +import pytz from abc import ABCMeta, abstractmethod from typing import Iterable, Tuple, Optional, Any, Union, Sequence, Dict +from pytz.exceptions import UnknownTimeZoneError from clickhouse_connect.datatypes.registry import get_from_name from clickhouse_connect.datatypes.base import ClickHouseType @@ -12,7 +13,6 @@ to_arrow, QueryContext logger = logging.getLogger(__name__) -limit_re = re.compile(r'\s+LIMIT[$|\s]', re.IGNORECASE) class Client(metaclass=ABCMeta): @@ -30,8 +30,13 @@ def __init__(self, database: str, query_limit: int, uri: str): :param uri: uri for error messages """ self.limit = query_limit - self.server_version, self.server_tz, self.database = \ + self.server_tz = pytz.UTC + self.server_version, server_tz, self.database = \ tuple(self.command('SELECT version(), timezone(), database()', use_database=False)) + try: + self.server_tz = 
pytz.timezone(server_tz) + except UnknownTimeZoneError: + logger.warning('Warning, server is using an unrecognized timezone %s, will use UTC default', server_tz) server_settings = self.query('SELECT name, value, changed, description, type, readonly FROM system.settings') self.server_settings = {row['name']: SettingDef(**row) for row in server_settings.named_results()} if database and not database == '__default__': @@ -61,13 +66,10 @@ def _validate_settings(self, settings: Optional[Dict[str, Any]], stringify: bool validated[key] = value return validated - def _prep_query(self, query: str, parameters: Optional[Dict[str, Any]] = None): - if parameters: - escaped = {k: format_query_value(v, self.server_tz) for k, v in parameters.items()} - query %= escaped - if self.limit and not limit_re.search(query) and 'SELECT ' in query.upper(): - query += f' LIMIT {self.limit}' - return query + def _prep_query(self, context: QueryContext): + if context.is_select and not context.has_limit: + return f'{context.final_query}\n LIMIT {self.limit}' + return context.final_query @abstractmethod def _query_with_context(self, context: QueryContext): @@ -88,6 +90,7 @@ def query(self, settings: Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, + encoding: Optional[str] = None, use_none: bool = True, context: QueryContext = None) -> QueryResult: """ @@ -97,6 +100,7 @@ def query(self, :param settings: Optional dictionary of ClickHouse settings (key/string values) :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring + :param encoding: See QueryContext __init__ docstring :param use_none: Use None for ClickHouse nulls instead of empty values :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: QueryResult -- data and metadata from response @@ -107,9 +111,18 @@ def 
query(self, settings, query_formats, column_formats, + encoding, + self.server_tz, False) else: - query_context = QueryContext(query, parameters, settings, query_formats, column_formats, use_none) + query_context = QueryContext(query, + parameters, + settings, + query_formats, + column_formats, + encoding, + self.server_tz, + use_none) return self._query_with_context(query_context) @abstractmethod @@ -134,6 +147,7 @@ def query_np(self, settings: Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, str]] = None, + encoding: Optional[str] = None, context: QueryContext = None): """ Query method that returns the results as a numpy array @@ -142,6 +156,7 @@ def query_np(self, :param settings: Optional dictionary of ClickHouse settings (key/string values) :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring. + :param encoding: See QueryContext __init__ docstring :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: Numpy array representing the result set """ @@ -150,6 +165,7 @@ def query_np(self, settings, query_formats, column_formats, + encoding, False, context)) @@ -160,6 +176,7 @@ def query_df(self, settings: Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, str]] = None, + encoding: Optional[str] = None, context: QueryContext = None): """ Query method that results the results as a pandas dataframe @@ -168,6 +185,7 @@ def query_df(self, :param settings: Optional dictionary of ClickHouse settings (key/string values) :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring + :param encoding: See QueryContext __init__ docstring :param context An alternative QueryContext parameter object that contains some or all of the method arguments :return: Numpy array 
representing the result set """ @@ -176,6 +194,7 @@ def query_df(self, settings, query_formats, column_formats, + encoding, False, context)) diff --git a/clickhouse_connect/driver/common.py b/clickhouse_connect/driver/common.py index a0480ed9..dd0344af 100644 --- a/clickhouse_connect/driver/common.py +++ b/clickhouse_connect/driver/common.py @@ -1,6 +1,5 @@ import array import sys -import re from typing import Tuple, Sequence, MutableSequence @@ -11,7 +10,6 @@ array_map = {1: 'b', 2: 'h', 4: 'i', 8: 'q'} decimal_prec = {32: 9, 64: 18, 128: 38, 256: 79} -identifier_re = re.compile('^[a-zA-Z_][0-9a-zA-Z_]*$') if int_size == 2: array_map[4] = 'l' diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index 97ed2fa0..ed81da23 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -14,8 +14,7 @@ from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError, ProgrammingError from clickhouse_connect.driver.httpadapter import KeepAliveAdapter from clickhouse_connect.driver.native import NativeTransform -from clickhouse_connect.driver.query import QueryResult, DataResult, format_query_value, QueryContext, \ - remove_sql_comments +from clickhouse_connect.driver.query import QueryResult, DataResult, QueryContext, finalize_query from clickhouse_connect.driver.rowbinary import RowBinaryTransform logger = logging.getLogger(__name__) @@ -136,26 +135,23 @@ def __init__(self, super().__init__(database=database, query_limit=query_limit, uri=self.url) self.session.params = self._validate_settings(settings, True) - def _format_query(self, query: str) -> str: - uncommented_query = remove_sql_comments(query) - if uncommented_query.upper().startswith('INSERT ') and 'VALUES' in query.upper(): - return query # Don't format the output of INSERT statements - if not uncommented_query.endswith(self.read_format): - query += f'\nFORMAT {self.read_format}' - return query - def 
client_setting(self, name, value): if isinstance(value, bool): value = '1' if value else '0' self.session.params[name] = str(value) + def _prep_query(self, context: QueryContext): + final_query = super()._prep_query(context) + if context.is_insert: + return final_query + return f'{final_query}\n FORMAT {self.write_format}' + def _query_with_context(self, context: QueryContext) -> QueryResult: - final_query = self._prep_query(context.query, context.parameters) headers = {'Content-Type': 'text/plain; charset=utf-8'} params = {'database': self.database} params.update(self._validate_settings(context.settings, True)) - if columns_only_re.search(final_query): - response = self._raw_request(final_query + ' FORMAT JSON', params, headers, retries=2) + if columns_only_re.search(context.uncommented_query): + response = self._raw_request(f'{context.final_query}\n FORMAT JSON', params, headers, retries=2) json_result = json.loads(response.content) # ClickHouse will respond with a JSON object of meta, data, and some other objects # We just grab the column names and column types from the metadata sub object @@ -166,7 +162,7 @@ def _query_with_context(self, context: QueryContext) -> QueryResult: types.append(registry.get_from_name(col['type'])) data_result = DataResult([], tuple(names), tuple(types)) else: - response = self._raw_request(self._format_query(final_query), params, headers, retries=2) + response = self._raw_request(self._prep_query(context), params, headers, retries=2) data_result = self.transform.parse_response(response.content, context) summary = {} if 'X-ClickHouse-Summary' in response.headers: @@ -205,9 +201,7 @@ def command(self, """ See BaseClient doc_string for this method """ - if parameters: - escaped = {k: format_query_value(v, self.server_tz) for k, v in parameters.items()} - cmd %= escaped + cmd = finalize_query(cmd, parameters, self.server_tz) headers = {} params = {} payload = None @@ -286,9 +280,9 @@ def raw_query(self, """ See BaseClient doc_string for 
this method """ - final_query = self._prep_query(query, parameters) - if fmt and ' FORMAT ' not in query.upper(): - final_query += f' FORMAT {fmt}' + final_query = finalize_query(query, parameters, self.server_tz) + if fmt: + final_query += f'\n FORMAT {fmt}' return self._raw_request(final_query, self._validate_settings(settings, True)).content diff --git a/clickhouse_connect/driver/parser.py b/clickhouse_connect/driver/parser.py index 1815734c..a158e7f9 100644 --- a/clickhouse_connect/driver/parser.py +++ b/clickhouse_connect/driver/parser.py @@ -1,10 +1,9 @@ from typing import Union, Tuple - -# pylint: disable=too-many-branches from clickhouse_connect.driver.common import unescape_identifier +# pylint: disable=too-many-branches def parse_callable(expr) -> Tuple[str, Tuple[Union[str, int], ...], str]: """ Parses a single level ClickHouse optionally 'callable' function/identifier. The identifier is returned as the diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index 5278bedf..6713db6c 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -1,11 +1,11 @@ import ipaddress import re import uuid +import pytz from enum import Enum from typing import NamedTuple, Any, Tuple, Dict, Sequence, Optional, Union -from datetime import date, datetime -from pytz import UTC +from datetime import date, datetime, tzinfo from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.datatypes.container import Array @@ -22,6 +22,10 @@ if HAS_ARROW: import pyarrow +limit_re = re.compile(r'\s+LIMIT($|\s)', re.IGNORECASE) +select_re = re.compile(r'(^|\s)SELECT\s', re.IGNORECASE) +insert_re = re.compile(r'(^|\s)INSERT\s*INTO', re.IGNORECASE) + # pylint: disable=too-many-instance-attributes class QueryContext: @@ -31,11 +35,13 @@ class QueryContext: # pylint: disable=duplicate-code def __init__(self, - query: str = None, + query: str = '', parameters: Optional[Dict[str, Any]] = None, settings: 
Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, + encoding: Optional[str] = None, + server_tz: tzinfo = pytz.UTC, use_none: bool = True): """ Initializes various configuration settings for the query context @@ -51,20 +57,38 @@ def __init__(self, The value is either the format for the data column (such as 'string' for a UUID column) or a second level "format" dictionary of a ClickHouse type name and a value of query formats. This secondary dictionary can be used for nested column types such as Tuples or Maps + :param encoding: Optional string encoding for this query, such as 'latin-1' :param column_formats: Optional dictionary :param use_none: """ self.query = query self.parameters = parameters or {} self.settings = settings or {} - if query_formats: - self.encoding = query_formats.pop('encoding', None) - else: - self.encoding = None - self.query_formats = format_map(query_formats) + self.query_formats = query_formats or {} self.column_formats = column_formats or {} + self.encoding = encoding + self.server_tz = server_tz self.use_none = use_none - self.thread_local = None + self.final_query = finalize_query(query, parameters, server_tz) + self._uncommented_query = None + + @property + def uncommented_query(self) -> str: + if not self._uncommented_query: + self._uncommented_query = remove_sql_comments(self.final_query) + return self._uncommented_query + + @property + def is_select(self) -> bool: + return select_re.search(self.uncommented_query) is not None + + @property + def has_limit(self) -> bool: + return limit_re.search(self.uncommented_query) is not None + + @property + def is_insert(self) -> bool: + return insert_re.search(self.uncommented_query) is not None def updated_copy(self, query: Optional[str] = None, @@ -72,30 +96,23 @@ def updated_copy(self, settings: Optional[Dict[str, Any]] = None, query_formats: Optional[Dict[str, str]] = None, column_formats: 
Optional[Dict[str, Union[str, Dict[str, str]]]] = None, + encoding: Optional[str] = None, + server_tz: Optional[tzinfo] = None, use_none: Optional[bool] = None) -> 'QueryContext': """ - Creates - :param query: - :param parameters: - :param settings: - :param query_formats: - :param column_formats: - :param use_none: - :return: + Creates Query context copy with parameters overridden/updated as appropriate """ - copy = QueryContext() - copy.query = query or self.query - copy.parameters = self.parameters.update(parameters or {}) - copy.settings = self.settings.update(settings or {}) - if query_formats: - copy.encoding = self.encoding or query_formats.pop('encoding', None) - copy.query_formats = self.query_formats.update(query_formats or {}) - copy.column_formats = self.column_formats.update(column_formats or {}) - copy.use_none = use_none if use_none is not None else self.use_none - return copy + return QueryContext(query or self.query, + self.parameters.update(parameters or {}), + self.settings.update(settings or {}), + self.query_formats.update(query_formats or {}), + self.column_formats.update(column_formats or {}), + encoding if encoding else self.encoding, + server_tz if server_tz else self.server_tz, + use_none if use_none is not None else self.use_none) def __enter__(self): - query_settings.query_overrides = self.query_formats + query_settings.query_overrides = format_map(self.query_formats) query_settings.query_encoding = self.encoding return self @@ -151,8 +168,14 @@ class DataResult(NamedTuple): must_escape = (BS, '\'') +def finalize_query(query: str, parameters: Optional[Dict[str, Any]], tz: Optional[tzinfo] = None) -> str: + if not parameters: + return query + return query % {k: format_query_value(v, tz) for k, v in parameters.items()} + + # pylint: disable=too-many-return-statements -def format_query_value(value, server_tz=UTC): +def format_query_value(value: Any, server_tz: tzinfo = pytz.UTC): """ Format Python values in a ClickHouse query :param value: 
Python object diff --git a/tests/unit_tests/test_sqlalchemy/test_ddl.py b/tests/unit_tests/test_sqlalchemy/test_ddl.py index aebe18ab..f81becd8 100644 --- a/tests/unit_tests/test_sqlalchemy/test_ddl.py +++ b/tests/unit_tests/test_sqlalchemy/test_ddl.py @@ -8,12 +8,12 @@ dialect = ClickHouseDialect() replicated_mt_ddl = """\ -CREATE TABLE replicated_mt_test (key UInt64) Engine ReplicatedMergeTree('/clickhouse/tables/repl_mt_test',\ +CREATE TABLE `replicated_mt_test` (`key` UInt64) Engine ReplicatedMergeTree('/clickhouse/tables/repl_mt_test',\ '{replica}') ORDER BY key\ """ replacing_mt_ddl = """\ -CREATE TABLE replacing_mt_test (key UInt32, date DateTime) Engine ReplacingMergeTree(date) ORDER BY key\ +CREATE TABLE `replacing_mt_test` (`key` UInt32, `date` DateTime) Engine ReplacingMergeTree(date) ORDER BY key\ """ From 4383bb8430396c3fb6f1533b518afa4bafcd6709 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 4 Aug 2022 07:28:20 -0600 Subject: [PATCH 24/25] Lint fixes --- clickhouse_connect/cc_sqlalchemy/sql/preparer.py | 7 ------- clickhouse_connect/driver/client.py | 5 ++--- clickhouse_connect/driver/httpadapter.py | 2 ++ clickhouse_connect/driver/query.py | 6 +++--- clickhouse_connect/json_impl.py | 6 +++--- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/clickhouse_connect/cc_sqlalchemy/sql/preparer.py b/clickhouse_connect/cc_sqlalchemy/sql/preparer.py index 5337f657..4450b51c 100644 --- a/clickhouse_connect/cc_sqlalchemy/sql/preparer.py +++ b/clickhouse_connect/cc_sqlalchemy/sql/preparer.py @@ -7,12 +7,5 @@ class ChIdentifierPreparer(IdentifierPreparer): quote_identifier = staticmethod(quote_id) - def normalize_name(self, name): - return name - - def denormalize_name(self, name): - return name - def _requires_quotes(self, _value): return True - diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index ee1ea201..9cbf79ce 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py 
@@ -9,8 +9,7 @@ from clickhouse_connect.datatypes.base import ClickHouseType from clickhouse_connect.driver.exceptions import ProgrammingError, InternalError from clickhouse_connect.driver.models import ColumnDef, SettingDef -from clickhouse_connect.driver.query import QueryResult, np_result, to_pandas_df, from_pandas_df, format_query_value, \ - to_arrow, QueryContext +from clickhouse_connect.driver.query import QueryResult, np_result, to_pandas_df, from_pandas_df, to_arrow, QueryContext logger = logging.getLogger(__name__) @@ -83,7 +82,7 @@ def client_setting(self, name, value): :param value: Setting value """ - # pylint: disable=duplicate-code + # pylint: disable=duplicate-code,too-many-arguments def query(self, query: str = None, parameters: Optional[Dict[str, Any]] = None, diff --git a/clickhouse_connect/driver/httpadapter.py b/clickhouse_connect/driver/httpadapter.py index d1640a94..b3cb24c0 100644 --- a/clickhouse_connect/driver/httpadapter.py +++ b/clickhouse_connect/driver/httpadapter.py @@ -19,6 +19,8 @@ class KeepAliveAdapter(HTTPAdapter): """ Extended HTTP adapter that sets preferred keep alive options """ + + # pylint: disable=no-member def __init__(self, **kwargs): self.socket_options = core_socket_options.copy() interval = kwargs.pop('keep_interval', KEEP_INTERVAL) diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py index 6713db6c..30b3705d 100644 --- a/clickhouse_connect/driver/query.py +++ b/clickhouse_connect/driver/query.py @@ -33,7 +33,7 @@ class QueryContext: Argument/parameter object for queries. 
This context is used to set thread/query specific formats """ - # pylint: disable=duplicate-code + # pylint: disable=duplicate-code,too-many-arguments def __init__(self, query: str = '', parameters: Optional[Dict[str, Any]] = None, @@ -168,10 +168,10 @@ class DataResult(NamedTuple): must_escape = (BS, '\'') -def finalize_query(query: str, parameters: Optional[Dict[str, Any]], tz: Optional[tzinfo] = None) -> str: +def finalize_query(query: str, parameters: Optional[Dict[str, Any]], server_tz: Optional[tzinfo] = None) -> str: if not parameters: return query - return query % {k: format_query_value(v, tz) for k, v in parameters.items()} + return query % {k: format_query_value(v, server_tz) for k, v in parameters.items()} # pylint: disable=too-many-return-statements diff --git a/clickhouse_connect/json_impl.py b/clickhouse_connect/json_impl.py index 5576bb2e..d51804ac 100644 --- a/clickhouse_connect/json_impl.py +++ b/clickhouse_connect/json_impl.py @@ -5,7 +5,7 @@ try: import orjson - any_to_json = orjson.dumps + any_to_json = orjson.dumps # pylint: disable=no-member except ImportError: orjson = None @@ -21,8 +21,8 @@ def _pyjson_to_json(obj: Any) -> bytes: logger = logging.getLogger(__name__) _to_json = OrderedDict() -_to_json['orjson'] = orjson.dumps if orjson else None -_to_json['ujson'] = ujson.dumps if ujson else None +_to_json['orjson'] = orjson.dumps if orjson else None # pylint: disable=no-member +_to_json['ujson'] = ujson.dumps if ujson else None # pylint: disable=c-extension-no-member _to_json['python'] = _pyjson_to_json any_to_json = _pyjson_to_json From 9b1dd8af11fc9824c5b1ce559bb40122ea8206b0 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 4 Aug 2022 08:03:33 -0600 Subject: [PATCH 25/25] Update changelog --- .gitignore | 1 + CHANGELOG.md | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index fa8b8db7..9a72348b 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ # Python cruft *.pyc 
+.python-version # C extensions *.so diff --git a/CHANGELOG.md b/CHANGELOG.md index f8933401..a510d9ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,21 @@ ### Release 0.2.0, 2022-08-04 +#### Deprecation warning + +* In the next release the row_binary option for ClickHouse serialization will be removed. The performance is significantly lower than Native format, and maintaining the option adds complexity with no corresponding benefit + #### Improvements -* Support (experimental) JSON/Object datatype. ClickHouse Connect will take advantage of the fast orjson library if available. -* Standardize read format handling and allow setting a return data format per column or per query. +* Support (experimental) JSON/Object datatype. ClickHouse Connect will take advantage of the fast orjson library if available. Note that inserts for JSON columns require ClickHouse server version 22.6.1 or later +* Standardize read format handling and allow specifying a return data format per column or per query. +* Added convenience min_version method to client to see if the server is at least the requested level +* Increase default HTTP timeout to 300 seconds to match ClickHouse server default #### Bug Fixes +* Fixed multiple issues with SQL comments that would cause some queries to fail +* Fixed problem with SQLAlchemy literal binds that would cause an error in Superset filters +* Fixed issue with query parameter handling * Named Tuples were not supported and would result in throwing an exception. This has been fixed. * The client query_arrow function would return incomplete results if the query result exceeded the ClickHouse max_block_size. This has been fixed. As part of the fix query_arrow method returns a PyArrow Table object. While this is a breaking change in the API it should be easy to work around.