From 1f18c91f6985040b1d3527a8c3e31faf67aa05d5 Mon Sep 17 00:00:00 2001 From: tiboun Date: Thu, 19 Dec 2024 15:28:21 +0100 Subject: [PATCH] If an InvalidFieldNameException occurs while creating struct subfields, then consider the enclosing struct field's type to be variant instead. --- .../InvalidFieldNameException.scala | 6 ++ .../schema/handlers/InferSchemaHandler.scala | 74 +++++++++++-------- 2 files changed, 49 insertions(+), 31 deletions(-) create mode 100644 src/main/scala/ai/starlake/schema/exceptions/InvalidFieldNameException.scala diff --git a/src/main/scala/ai/starlake/schema/exceptions/InvalidFieldNameException.scala b/src/main/scala/ai/starlake/schema/exceptions/InvalidFieldNameException.scala new file mode 100644 index 000000000..bb64c3871 --- /dev/null +++ b/src/main/scala/ai/starlake/schema/exceptions/InvalidFieldNameException.scala @@ -0,0 +1,6 @@ +package ai.starlake.schema.exceptions + +class InvalidFieldNameException(val fieldName: String) + extends RuntimeException( + s"Invalid field name ->${fieldName}<-. Only letters, digits and '_' are allowed. Other characters including spaces are forbidden." 
+ ) {} diff --git a/src/main/scala/ai/starlake/schema/handlers/InferSchemaHandler.scala b/src/main/scala/ai/starlake/schema/handlers/InferSchemaHandler.scala index d0d1933c8..0bbd8a1fd 100644 --- a/src/main/scala/ai/starlake/schema/handlers/InferSchemaHandler.scala +++ b/src/main/scala/ai/starlake/schema/handlers/InferSchemaHandler.scala @@ -21,6 +21,7 @@ package ai.starlake.schema.handlers import ai.starlake.config.Settings +import ai.starlake.schema.exceptions.InvalidFieldNameException import ai.starlake.schema.model._ import ai.starlake.utils.YamlSerde import org.apache.hadoop.fs.Path @@ -101,36 +102,49 @@ object InferSchemaHandler { currentSchema match { case st: StructType => val schemaWithIndex: Seq[(StructField, Int)] = st.zipWithIndex - val attributes = schemaWithIndex.map { case (field, index) => - val structLines = - currentLines.flatMap(Option(_)).map { - case r: Row => r.get(index) - case other => - throw new RuntimeException( - "Encountered " + other.getClass.getName + s" for field path $fieldPath but expected a Row instead for a Struct" - ) - } - - createAttribute( - structLines, - st(index).dataType, - field, - fieldPath + "." 
+ field.name, - forcePattern - ) - }.toList - val rename = container.name.replaceAll("[:.-]", "_") val renamedField = if (rename != container.name) Some(rename) else None - Attribute( - name = container.name, - `type` = st.typeName, - rename = renamedField, - required = if (!container.nullable) Some(true) else None, - array = Some(false), - attributes = attributes, - sample = None // currentLines.map(Option(_)).headOption.flatten.map(_.toString) - ) + try { + val attributes = schemaWithIndex.map { case (field, index) => + val structLines = + currentLines.flatMap(Option(_)).map { + case r: Row => r.get(index) + case other => + throw new RuntimeException( + "Encountered " + other.getClass.getName + s" for field path $fieldPath but expected a Row instead for a Struct" + ) + } + + createAttribute( + structLines, + st(index).dataType, + field, + fieldPath + "." + field.name, + forcePattern + ) + }.toList + + Attribute( + name = container.name, + `type` = st.typeName, + rename = renamedField, + required = if (!container.nullable) Some(true) else None, + array = Some(false), + attributes = attributes, + sample = None // currentLines.map(Option(_)).headOption.flatten.map(_.toString) + ) + } catch { + case _: InvalidFieldNameException => + Attribute( + name = container.name, + `type` = PrimitiveType.variant.value, + rename = renamedField, + required = if (!container.nullable) Some(true) else None, + array = Some(false), + attributes = Nil, + sample = None // currentLines.map(Option(_)).headOption.flatten.map(_.toString) + ) + } case dt: ArrayType => dt.elementType match { case _: ArrayType => @@ -207,9 +221,7 @@ object InferSchemaHandler { if (!forcePattern || identifierRegex.pattern.matcher(result.name).matches) result else - throw new RuntimeException( - s"Invalid field name ->${result.name}<-. Only letters, digits and '_' are allowed. Other characters including spaces are forbidden." - ) + throw new InvalidFieldNameException(result.name) } createAttribute( lines,