Skip to content

Commit

Permalink
Merge pull request ckan#8530 from ckan/5847-selective-indexes
Browse files Browse the repository at this point in the history
configurable datastore FTS indexes
  • Loading branch information
amercader authored Nov 29, 2024
2 parents 822245e + f3023ae commit 958b80b
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 78 deletions.
9 changes: 9 additions & 0 deletions changes/5847.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Allow configuring datastore full text field indexes with new
ckan.datastore.default_fts_index_field_types config option.

The default is an empty list, avoiding automatically creating
separate full text indexes for any individual columns. The
whole-row full text index still exists for all tables.

Use the `ckan datastore fts-index` command to remove existing
column indexes to reclaim database space.
65 changes: 18 additions & 47 deletions ckanext/datastore/backend/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def _where_clauses(
# add full-text search where clause
q: Union[dict[str, str], str, Any] = data_dict.get('q')
full_text = data_dict.get('full_text')
if q and not full_text:
if q:
if isinstance(q, str):
ts_query_alias = _ts_query_alias()
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
Expand All @@ -459,6 +459,7 @@ def _where_clauses(

ftyp = fields_types[field]
if not datastore_helpers.should_fts_index_field_type(ftyp):
# use general full text search to narrow results
clause_str = u'_full_text @@ {0}'.format(query_field)
clauses.append((clause_str,))

Expand All @@ -468,49 +469,14 @@ def _where_clauses(
identifier(field),
query_field)
clauses.append((clause_str,))
elif (full_text and not q):
ts_query_alias = _ts_query_alias()
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
clauses.append((clause_str,))

elif full_text and isinstance(q, dict):
ts_query_alias = _ts_query_alias()
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
clauses.append((clause_str,))
# update clauses with q dict
_update_where_clauses_on_q_dict(data_dict, fields_types, q, clauses)

elif full_text and isinstance(q, str):
if full_text:
ts_query_alias = _ts_query_alias()
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
clauses.append((clause_str,))

return clauses


def _update_where_clauses_on_q_dict(
data_dict: dict[str, str], fields_types: dict[str, str],
q: dict[str, str],
clauses: WhereClauses) -> None:
lang = _fts_lang(data_dict.get('language'))
for field, _ in q.items():
if field not in fields_types:
continue
query_field = _ts_query_alias(field)

ftyp = fields_types[field]
if not datastore_helpers.should_fts_index_field_type(ftyp):
clause_str = u'_full_text @@ {0}'.format(query_field)
clauses.append((clause_str,))

clause_str = (
u'to_tsvector({0}, cast({1} as text)) @@ {2}').format(
literal_string(lang),
identifier(field),
query_field)
clauses.append((clause_str,))


def _textsearch_query(
lang: str, q: Optional[Union[str, dict[str, str], Any]], plain: bool,
full_text: Optional[str]) -> tuple[str, dict[str, str]]:
Expand Down Expand Up @@ -709,6 +675,7 @@ def _build_fts_indexes(
data_dict: dict[str, Any], # noqa
sql_index_str_method: str, fields: list[dict[str, Any]]):
fts_indexes: list[str] = []
fts_noindexes: list[str] = []
resource_id = data_dict['resource_id']
fts_lang = data_dict.get(
'language', config.get('ckan.datastore.default_fts_lang'))
Expand All @@ -722,23 +689,25 @@ def cast_as_text(x: str):

full_text_field = {'type': 'tsvector', 'id': '_full_text'}
for field in [full_text_field] + fields:
if not datastore_helpers.should_fts_index_field_type(field['type']):
continue

field_str = field['id']
if field['type'] not in ['text', 'tsvector']:
field_str = cast_as_text(field_str)
else:
field_str = u'"{0}"'.format(field_str)
if field['type'] != 'tsvector':
field_str = to_tsvector(field_str)
if field['id'] != '_full_text' and not (
datastore_helpers.should_fts_index_field_type(field['type'])):
fts_noindexes.append(_generate_index_name(resource_id, field_str))
continue

fts_indexes.append(sql_index_str_method.format(
res_id=resource_id,
unique='',
name=_generate_index_name(resource_id, field_str),
method=_get_fts_index_method(), fields=field_str))

return fts_indexes
return fts_indexes, fts_noindexes


def _drop_indexes(context: Context, data_dict: dict[str, Any],
Expand Down Expand Up @@ -944,9 +913,8 @@ def create_indexes(context: Context, data_dict: dict[str, Any]):
field_ids = _pluck('id', fields)
json_fields = [x['id'] for x in fields if x['type'] == 'nested']

fts_indexes = _build_fts_indexes(data_dict,
sql_index_string_method,
fields)
fts_indexes, fts_noindexes = _build_fts_indexes(
data_dict, sql_index_string_method, fields)
sql_index_strings = sql_index_strings + fts_indexes

if indexes is not None:
Expand Down Expand Up @@ -986,10 +954,13 @@ def create_indexes(context: Context, data_dict: dict[str, Any]):

current_indexes = _get_index_names(context['connection'],
data_dict['resource_id'])

for fts_idx in current_indexes:
if fts_idx in fts_noindexes:
connection.execute(sa.text(
'DROP INDEX {0} CASCADE'.format(sa.column(fts_idx))))
for sql_index_string in sql_index_strings:
has_index = [c for c in current_indexes
if sql_index_string.find(c) != -1]
if not has_index:
if not any(c in sql_index_string for c in current_indexes):
connection.execute(sa.text(sql_index_string))


Expand Down
64 changes: 37 additions & 27 deletions ckanext/datastore/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
import ckan.logic as logic

import ckanext.datastore as datastore_module
from ckanext.datastore.backend import get_all_resources_ids_in_datastore
from ckanext.datastore.backend.postgres import (
identifier,
literal_string,
get_read_engine,
get_write_engine,
_get_raw_field_info,
_TIMEOUT,
)
from ckanext.datastore.blueprint import DUMP_FORMATS, dump_to

Expand Down Expand Up @@ -137,27 +139,17 @@ def purge():

site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})

result = logic.get_action('datastore_search')(
{'user': site_user['name']},
{'resource_id': '_table_metadata'}
)

resource_id_list = []
for record in result['records']:
for resid in get_all_resources_ids_in_datastore():
try:
# ignore 'alias' records (views) as they are automatically
# deleted when the parent resource table is dropped
if record['alias_of']:
continue

logic.get_action('resource_show')(
{'user': site_user['name']},
{'id': record['name']}
{'id': resid}
)
except logic.NotFound:
resource_id_list.append(record['name'])
resource_id_list.append(resid)
click.echo("Resource '%s' orphaned - queued for drop" %
record[u'name'])
resid)
except KeyError:
continue

Expand Down Expand Up @@ -191,22 +183,12 @@ def upgrade():
'''Move field info to _info so that plugins may add private information
to each field for their own purposes.'''

site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})

result = logic.get_action('datastore_search')(
{'user': site_user['name']},
{'resource_id': '_table_metadata'}
)

count = 0
skipped = 0
noinfo = 0
read_connection = get_read_engine()
for record in result['records']:
if record['alias_of']:
continue

raw_fields, old = _get_raw_field_info(read_connection, record['name'])
for resid in get_all_resources_ids_in_datastore():
raw_fields, old = _get_raw_field_info(read_connection, resid)
if not old:
if not raw_fields:
noinfo += 1
Expand All @@ -222,7 +204,7 @@ def upgrade():
raw_sql = literal_string(' ' + json.dumps(
raw, ensure_ascii=False, separators=(',', ':')))
alter_sql.append(u'COMMENT ON COLUMN {0}.{1} is {2}'.format(
identifier(record['name']),
identifier(resid),
identifier(fid),
raw_sql))

Expand All @@ -236,5 +218,33 @@ def upgrade():
count, skipped, noinfo))


@datastore.command(
'fts-index',
short_help='create or remove full-text search indexes after changing '
'the ckan.datastore.default_fts_index_field_types setting'
)
@click.option(
'--timeout', metavar='SECONDS',
type=click.FloatRange(0, 2147483.647), # because postgres max int
default=_TIMEOUT / 1000, show_default=True,
help='maximum index creation time in seconds',
)
def fts_index(timeout: float):
'''Use to create or remove full-text search indexes after changing
the ckan.datastore.default_fts_index_field_types setting.
'''
site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})
resource_ids = get_all_resources_ids_in_datastore()

for i, resid in enumerate(get_all_resources_ids_in_datastore(), 1):
print(f'\r{resid} [{i}/{len(resource_ids)}] ...', end='')
logic.get_action('datastore_create')(
{'user': site_user['name'],
'query_timeout': int(timeout * 1000)}, # type: ignore
{'resource_id': resid, 'force': True}
)
print('\x08\x08\x08done')


def get_commands():
return (set_permissions, dump, purge, upgrade)
12 changes: 12 additions & 0 deletions ckanext/datastore/config_declaration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,15 @@ groups:
The default method used when creating full-text search indexes. Currently it
can be "gin" or "gist". Refer to PostgreSQL's documentation to understand the
characteristics of each one and pick the best for your instance.
- key: ckan.datastore.default_fts_index_field_types
type: list
default: ''
example: text tsvector
description: >
A separate full-text search index will be created by default for fields
with these types, and used when searching on fields by passing a
dictionary to the datastore_search q parameter.
Indexes increase the time and disk space required to load data
into the DataStore.
3 changes: 2 additions & 1 deletion ckanext/datastore/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def _strip(s: Any):


def should_fts_index_field_type(field_type: str):
return field_type.lower() in ['tsvector', 'text', 'number']
return field_type in tk.config.get(
'ckan.datastore.default_fts_index_field_types', [])


def get_table_and_function_names_from_sql(context: Context, sql: str):
Expand Down
2 changes: 2 additions & 0 deletions ckanext/datastore/tests/test_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def test_create_adds_index_on_full_text_search_when_not_creating_other_indexes(
resource_id = result["resource_id"]
assert self._has_index_on_field(resource_id, '"_full_text"')

@pytest.mark.ckan_config(
"ckan.datastore.default_fts_index_field_types", "text")
def test_create_add_full_text_search_indexes_on_every_text_field(self):
package = factories.Dataset()
data = {
Expand Down
68 changes: 66 additions & 2 deletions ckanext/datastore/tests/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ def test_default_fts_index_method_can_be_overwritten_by_config_var(self):
"_full_text", connection, resource_id, method="gin"
)

@pytest.mark.ckan_config(
"ckan.datastore.default_fts_index_field_types", "text tsvector")
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_english_as_default(
self, _get_fields
):
_get_fields.return_value = [
{"id": "text", "type": "text"},
{"id": "number", "type": "number"},
{"id": "tsvector", "type": "tsvector"},
{"id": "nested", "type": "nested"},
{"id": "date", "type": "date"},
{"id": "text array", "type": "text[]"},
Expand All @@ -62,9 +64,37 @@ def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_eng
"text", connection, resource_id, "english"
)
self._assert_created_index_on(
"number", connection, resource_id, "english", cast=True
"tsvector", connection, resource_id,
)

@mock.patch("ckanext.datastore.backend.postgres._get_fields")
def test_creates_no_fts_indexes_by_default(
self, _get_fields
):
_get_fields.return_value = [
{"id": "text", "type": "text"},
{"id": "tsvector", "type": "tsvector"},
{"id": "nested", "type": "nested"},
{"id": "date", "type": "date"},
{"id": "text array", "type": "text[]"},
{"id": "timestamp", "type": "timestamp"},
]
connection = mock.MagicMock()
context = {"connection": connection}
resource_id = "resource_id"
data_dict = {"resource_id": resource_id}

db.create_indexes(context, data_dict)

self._assert_no_index_created_on(
"text", connection, resource_id, "english"
)
self._assert_no_index_created_on(
"tsvector", connection, resource_id,
)

@pytest.mark.ckan_config(
"ckan.datastore.default_fts_index_field_types", "text tsvector")
@pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple")
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var(
Expand All @@ -80,6 +110,8 @@ def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var(

self._assert_created_index_on("foo", connection, resource_id, "simple")

@pytest.mark.ckan_config(
"ckan.datastore.default_fts_index_field_types", "text tsvector")
@pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple")
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
def test_creates_fts_index_on_textual_fields_can_overwrite_lang_using_lang_param(
Expand Down Expand Up @@ -127,6 +159,38 @@ def _assert_created_index_on(
"called with a string containing '%s'" % sql_str
)

def _assert_no_index_created_on(
self,
field,
connection,
resource_id,
lang=None,
cast=False,
method="gist",
):
field = u'"{0}"'.format(field)
if cast:
field = u"cast({0} AS text)".format(field)
if lang is not None:
sql_str = (
u'ON "resource_id" '
u"USING {method}(to_tsvector('{lang}', {field}))"
)
sql_str = sql_str.format(method=method, lang=lang, field=field)
else:
sql_str = u"USING {method}({field})".format(
method=method, field=field
)

calls = connection.execute.call_args_list

was_called = any(sql_str in str(call.args[0]) for call in calls)

assert not was_called, (
"Expected 'connection.execute' to not have been "
"called with a string containing '%s'" % sql_str
)


class TestGetAllResourcesIdsInDatastore(object):
@pytest.mark.ckan_config(u"ckan.plugins", u"datastore")
Expand Down
Loading

0 comments on commit 958b80b

Please sign in to comment.