Skip to content

Commit

Permalink
Index metadata fields
Browse files Browse the repository at this point in the history
  • Loading branch information
sir-sigurd committed Aug 22, 2023
1 parent 683809b commit 63282b5
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 1 deletion.
62 changes: 61 additions & 1 deletion lambdas/es/indexer/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,64 @@ def do_index(
logger_.debug("%s indexed as package (%s)", key, event_type)


def _try_parse_date(s: str) -> "datetime.datetime | None":
    """Try to interpret *s* as an ISO 8601 date/time string.

    A single trailing ``"Z"`` is removed before the string is passed to
    ``datetime.datetime.fromisoformat``, so the "Z" designator itself is
    discarded rather than turned into timezone information.

    Args:
        s: Candidate date string.

    Returns:
        The parsed ``datetime.datetime``, or ``None`` when
        ``fromisoformat`` rejects the string with ``ValueError``.
    """
    # XXX: do we need to support more formats?
    # XXX: do we need to preserve UTC timezone?
    if s.endswith("Z"):
        s = s[:-1]
    try:
        return datetime.datetime.fromisoformat(s)
    except ValueError:
        # Not a date: the caller treats None as "index it as a plain string".
        return None


def _get_metadata_fields(path: tuple, d: dict):
    """Yield ``(path, type, value)`` for each indexable leaf value in *d*.

    Nested dicts are walked recursively, extending *path* with each key.
    Leaf values are classified as follows:

    * ``str`` that parses as a date -> ``"date"`` (value replaced by the
      parsed ``datetime``)
    * other ``str`` -> ``"keyword"`` if shorter than 256 characters,
      otherwise ``"text"``
    * ``bool`` -> ``"boolean"``
    * ``int`` / ``float`` -> ``"double"``

    Keys containing ``"."``, list values, and values of any other type are
    skipped.

    Args:
        path: Tuple of the keys leading to *d* (``()`` for the top level).
        d: Metadata dict to walk.

    Yields:
        Tuples ``(path_tuple, type_name, value)``.
    """
    for k, v in d.items():
        field_path = path + (k,)
        if "." in k:
            # XXX: ignore for now
            # The path is wrapped in a 1-tuple so ``%`` formats the whole
            # tuple as one value; without the parentheses ``%`` binds tighter
            # than ``+`` and the expression raises TypeError.
            print("ignoring field %s" % (field_path,))
            continue
        if isinstance(v, dict):
            yield from _get_metadata_fields(field_path, v)
            continue

        if isinstance(v, str):
            date = _try_parse_date(v)
            if date is not None:
                type_ = "date"
                v = date
            else:
                # XXX: make length limit variable
                type_ = "keyword" if len(v) < 256 else "text"
        elif isinstance(v, bool):
            # Must be tested before the numeric branch: bool is a subclass
            # of int and would otherwise be classified as "double".
            type_ = "boolean"
        elif isinstance(v, (int, float)):
            # XXX: do we need to convert it explicitly to float?
            # XXX: do something on ints that can't be converted to float without loss?
            type_ = "double"
        elif isinstance(v, list):
            # XXX: ignore for now
            continue
        else:
            print("ignoring value of type %s" % type(v))
            continue

        yield field_path, type_, v


def get_metadata_fields(meta):
    """Flatten package metadata into a list of typed field entries.

    Each entry is a dict with ``"name"`` (the key path joined with ``"."``),
    ``"type"`` (the detected type name) and a third key, named after that
    type, holding the value.

    Returns ``None`` when *meta* is not a dict.
    """
    if isinstance(meta, dict):
        entries = []
        for key_path, kind, val in _get_metadata_fields((), meta):
            entry = {
                "name": ".".join(key_path),
                "type": kind,
                kind: val,
            }
            entries.append(entry)
        return entries
    # XXX: can we do something better?
    return None


def index_if_package(
s3_client,
doc_queue: DocumentQueue,
Expand Down Expand Up @@ -350,6 +408,7 @@ def get_pkg_data():
if not stats:
return

user_meta = first.get("user_meta", {})
return {
"key": key,
"etag": etag,
Expand All @@ -360,7 +419,8 @@ def get_pkg_data():
"pointer_file": pointer_file,
"hash": package_hash,
"package_stats": stats,
"metadata": json.dumps(first.get("user_meta", {})),
"metadata": json.dumps(user_meta),
"metadata_fields": get_metadata_fields(user_meta),
"comment": str(first.get("message", "")),
}

Expand Down
7 changes: 7 additions & 0 deletions lambdas/es/indexer/test/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,6 +1038,13 @@ def test_index_if_package(self, append_mock, select_meta_mock, select_stats_mock
"hash": pkg_hash,
"package_stats": select_stats_mock.return_value,
"metadata": json.dumps(meta),
"metadata_fields": [
{
"name": "foo",
"type": "keyword",
"keyword": "bar",
},
],
"comment": message,
})

Expand Down

0 comments on commit 63282b5

Please sign in to comment.