Skip to content

Commit

Permalink
add support for fixed width UTF8 strings - #270 (#278)
Browse files Browse the repository at this point in the history
* add support for fixed width UTF8 strings - #270

* add support for binary request of utf8 fixed width strings

* updates for fixed utf8 attribute values
  • Loading branch information
jreadey authored Nov 3, 2023
1 parent 0bf6bb9 commit b0446f1
Show file tree
Hide file tree
Showing 10 changed files with 409 additions and 112 deletions.
5 changes: 5 additions & 0 deletions hsds/attr_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@ async def PUT_AttributeValue(request):
raise HTTPBadRequest(reason=msg)

np_shape = getShapeDims(attr_shape)
log.debug(f"np_shape: {np_shape}")
type_json = dn_json["type"]
np_dtype = createDataType(type_json) # np datatype

Expand Down Expand Up @@ -697,6 +698,10 @@ async def PUT_AttributeValue(request):
# convert to JSON for transmission to DN
data = arr.tolist()
value = bytesArrayToList(data)
if attr_shape["class"] == "H5S_SCALAR":
# just send the value, not a list
value = value[0]

else:
try:
body = await request.json()
Expand Down
7 changes: 3 additions & 4 deletions hsds/chunk_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,6 @@ async def PUT_Value(request):
else:
arr = jsonToArray(np_shape, dset_dtype, json_data)

log.debug(f"jsonToArray returned: {arr}")
if num_elements != np.prod(arr.shape):
msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}"
raise HTTPBadRequest(reason=msg)
Expand All @@ -520,13 +519,13 @@ async def PUT_Value(request):
arr_tmp[...] = arr
arr = arr_tmp
except ValueError:
log.warn(msg)
log.warn(f"ValueError: {msg}")
raise HTTPBadRequest(reason=msg)
except TypeError:
log.warn(msg)
log.warn(f"TypeError: {msg}")
raise HTTPBadRequest(reason=msg)
except IndexError:
log.warn(msg)
log.warn(f"IndexError: {msg}")
raise HTTPBadRequest(reason=msg)
log.debug(f"got json arr: {arr.shape}")
else:
Expand Down
9 changes: 6 additions & 3 deletions hsds/dset_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,12 +801,15 @@ async def POST_Dataset(request):
shape_json["dims"] = dims
rank = 1
elif isinstance(shape, str):
# only valid string value is H5S_NULL
if shape != "H5S_NULL":
# only valid string value is H5S_NULL or H5S_SCALAR
if shape == "H5S_NULL":
shape_json["class"] = "H5S_NULL"
elif shape == "H5S_SCALAR":
shape_json["class"] = "H5S_SCALAR"
else:
msg = "POST Datset with invalid shape value"
log.warn(msg)
raise HTTPBadRequest(reason=msg)
shape_json["class"] = "H5S_NULL"
elif isinstance(shape, list):
if len(shape) == 0:
shape_json["class"] = "H5S_SCALAR"
Expand Down
38 changes: 21 additions & 17 deletions hsds/util/arrayUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def toTuple(rank, data):
else:
return tuple(toTuple(rank - 1, x) for x in data)
else:
if isinstance(data, str):
data = data.encode("utf8")
return data


Expand Down Expand Up @@ -93,6 +95,23 @@ def getNumElements(dims):
return num_elements


def isVlen(dt):
    """
    Return True if the type contains variable length elements.

    dt is expected to be a numpy dtype.  A non-compound dtype is treated as
    variable length when its metadata dict has a "vlen" key.  A compound
    dtype is variable length when any of its fields (checked recursively)
    is variable length.
    """
    # len(dt) is the number of fields: 0 for non-compound types.  Recurse
    # for any compound type, including one with a single field - testing
    # for more than one field would skip a lone variable length member.
    if len(dt) > 0:
        return any(isVlen(dt[name]) for name in dt.names)
    return bool(dt.metadata and "vlen" in dt.metadata)


def jsonToArray(data_shape, data_dtype, data_json):
"""
Return numpy array from the given json array.
Expand Down Expand Up @@ -122,6 +141,8 @@ def fillVlenArray(rank, data, arr, index):
converted_data = toTuple(np_shape_rank, data_json)
data_json = converted_data
else:
if isinstance(data_json, str):
data_json = data_json.encode("utf8")
data_json = [data_json,] # listify

if not (None in data_json):
Expand Down Expand Up @@ -149,23 +170,6 @@ def fillVlenArray(rank, data, arr, index):
return arr


def isVlen(dt):
    """
    Return True if the type contains variable length elements.

    dt is expected to be a numpy dtype.  A compound dtype is reported as
    variable length if any field, at any nesting depth, is variable length.
    A non-compound dtype is variable length when its metadata dict contains
    a "vlen" key.
    """
    if len(dt) > 0:
        # compound type (one or more fields) - inspect every member.
        # A "more than one field" test would wrongly skip a compound type
        # whose single field is variable length.
        for name in dt.names:
            if isVlen(dt[name]):
                return True
        return False
    # non-compound type: the vlen marker is kept in the dtype metadata
    return bool(dt.metadata and "vlen" in dt.metadata)


def getElementSize(e, dt):
"""
Get number of byte needed to given element as a bytestream
Expand Down
23 changes: 19 additions & 4 deletions hsds/util/hdf5dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,26 @@ def getTypeItem(dt, metadata=None):
else:
# Fixed length string type
type_info["class"] = "H5T_STRING"
type_info["charSet"] = "H5T_CSET_ASCII"
type_info["length"] = dt.itemsize
type_info["charSet"] = "H5T_CSET_ASCII"
type_info["strPad"] = "H5T_STR_NULLPAD"
elif dt.base.kind == "U":
# Fixed length unicode type
raise TypeError("Fixed length unicode type is not supported")
print("fixed UTF, itemsize:", dt.itemsize)
ref_check = check_dtype(ref=dt.base)
if ref_check is not None:
raise TypeError("unexpected reference type")

# Fixed length string type with unicode support
type_info["class"] = "H5T_STRING"

# this can be problematic if the encoding of the string is not valid,
# or requires too many bytes. Use variable length strings to handle all
# UTF8 strings correctly
type_info["charSet"] = "H5T_CSET_UTF8"
# convert from UTF32 length to a fixed length
type_info["length"] = dt.itemsize
type_info["strPad"] = "H5T_STR_NULLPAD"

elif dt.kind == "b":
# boolean type - h5py stores as enum
Expand Down Expand Up @@ -614,8 +628,9 @@ def createBaseDataType(typeItem):
if typeItem["charSet"] == "H5T_CSET_ASCII":
type_code = "S"
elif typeItem["charSet"] == "H5T_CSET_UTF8":
msg = "fixed-width unicode strings are not supported"
raise TypeError(msg)
# use the same type_code as ascii strings
# (otherwise, numpy will reserve bytes for UTF32 representation)
type_code = "S"
else:
raise TypeError("unexpected 'charSet' value")
# a fixed size string
Expand Down
129 changes: 116 additions & 13 deletions tests/integ/attr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def testPutFixedStringNullTerm(self):

def testPutVLenUTF8String(self):
# Test PUT value for scalar attribute with variable length UTF-8 string
print("testPutFixedUTF8String", self.base_domain)
print("testPutVLenUTF8String", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"
Expand All @@ -531,46 +531,149 @@ def testPutVLenUTF8String(self):

# create attr
text = "I'm an UTF-8 null terminated string"
text_length = len(text) + 1
fixed_str_type = {

variable_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": text_length,
"length": "H5T_VARIABLE",
"strPad": "H5T_STR_NULLTERM",
}
variable_str_type = {
scalar_shape = {"class": "H5S_SCALAR"}

data = {"type": variable_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)

# read attr
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], "H5T_VARIABLE")
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")

def testPutFixedUTF8String(self):
# Test PUT value for scalar attribute with fixed length UTF-8 string
print("testPutFixedUTF8String", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"

# Get root uuid
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
root_uuid = rspJson["root"]
helper.validateId(root_uuid)

# create attr
text = "this is the chinese character for the number eight: \u516b"

text_length = len(text) + 1
fixed_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": "H5T_VARIABLE",
"length": text_length,
"strPad": "H5T_STR_NULLTERM",
}

scalar_shape = {"class": "H5S_SCALAR"}
data = {"type": fixed_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
# Should fail since UTF8 with fixed width is not supported
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 400)
self.assertEqual(rsp.status_code, 201)

data = {"type": variable_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
# read attr
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
print(rspJson)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], text_length)
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")

def testPutFixedUTF8StringBinary(self):
# Test PUT value for scalar attribute with fixed length UTF-8 string in binary
print("testPutFixedUTF8StringBinary", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"

# Get root uuid
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
root_uuid = rspJson["root"]
helper.validateId(root_uuid)

# create attr with json
character_text = "this is the chinese character for the number eight: \u516b"

binary_text = bytearray(character_text, "UTF-8")
byte_length = len(binary_text)

fixed_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": byte_length, # Null byte explicitly included
"strPad": "H5T_STR_NULLTERM",
}

scalar_shape = {"class": "H5S_SCALAR"}
data = {"type": fixed_str_type, "shape": scalar_shape}
attr_name = "fixed_unicode_str_attr_binary"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)

# write to attr in binary
attr_name = "fixed_unicode_str_attr_binary"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + "/value"
headers["Content-Type"] = "application/octet-stream"
rsp = self.session.put(req, data=binary_text, headers=headers)
self.assertEqual(rsp.status_code, 200)

# read attr
headers["Content-Type"] = "application/json"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name

rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
print(rspJson)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
print(f"Retrieved UTF8 string: {rspJson['value']}")
self.assertEqual(rspJson["value"], character_text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], "H5T_VARIABLE")
self.assertEqual(type_json["length"], byte_length)
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
Expand Down Expand Up @@ -1302,7 +1405,7 @@ def testPutAttributeBinaryValue(self):
rsp = self.session.put(req, data=data, headers=headers_bin_req)
self.assertEqual(rsp.status_code, 200)

# try writing to few bytes, should fail
# try writing too few bytes, should fail
data = bytearray(extent)
for i in range(extent):
data[i] = 255
Expand Down
4 changes: 3 additions & 1 deletion tests/integ/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def testScalarDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
data = {"type": "H5T_IEEE_F32LE"}
data = {"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)
Expand Down Expand Up @@ -207,6 +207,8 @@ def testScalarEmptyDimsDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
# using an empty list for shape is equivalent to using
# "H5S_SCALAR"
data = {"type": "H5T_IEEE_F32LE", "shape": []}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
Expand Down
Loading

0 comments on commit b0446f1

Please sign in to comment.