Skip to content

Commit

Permalink
Restore original schema-ddl behaviour for objects with no defined fields
Browse files Browse the repository at this point in the history
This concerns schemas like:

```
{"type": "object", "additionalProperties": false}
```

Older versions of schema-ddl would convert this schema to a String
(JSON) parquet column.  In snowplow/schema-ddl#205 we changed the
behaviour so this schema is converted to a `None`, i.e. do not create a
column for this schema. It was a good change for newer loaders (aside
from RDB Loader).

But that caused problems for RDB Loader under an edge-case scenario: If
the schema above is evolved from `1-0-0` to `1-0-1` and the new schema
adds a field to the schema, then RDB Loader tries to create a column for
the new field.  But that clashes with the old string column created with
the older version of RDB Loader.

This PR returns to the original behaviour of schema-ddl for these schemas
with no explicit properties.  It does so without us making any change to
schema-ddl, so we still get all the benefits of snowplow/schema-ddl#205
for the other loaders.
  • Loading branch information
istreeter committed Nov 29, 2024
1 parent deca714 commit a073ccf
Showing 1 changed file with 50 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import cats.syntax.all._
import com.snowplowanalytics.iglu.client.Resolver
import com.snowplowanalytics.iglu.client.resolver.registries.RegistryLookup
import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema
import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.{ArrayProperty, CommonProperties, ObjectProperty}
import com.snowplowanalytics.iglu.schemaddl.parquet.{Field, Type}
import com.snowplowanalytics.iglu.schemaddl.parquet.Migrations.mergeSchemas
import com.snowplowanalytics.snowplow.analytics.scalasdk.SnowplowEvent
Expand Down Expand Up @@ -125,6 +126,9 @@ object NonAtomicFieldsProvider {
.fetchSchemasWithSameModel[F](resolver, typeSet.max.schemaKey)
// Preserve the historic behaviour by dropping the schemas newer than the max in this batch
.map(listOfSchemas => listOfSchemas.filter(_.schemaKey <= typeSet.max.schemaKey))
.map(_.map { case SchemaWithKey(key, schema) =>
SchemaWithKey(key, setAdditionalPropertiesTrue(schema))
})
.flatMap(schemaList =>
EitherT
.fromOption[F][LoaderIgluError, NonEmptyList[SchemaWithKey]](
Expand All @@ -139,6 +143,52 @@ object NonAtomicFieldsProvider {
)
)

/**
 * Restores legacy behaviour of schema-ddl for backwards compatibility
 *
 * This concerns schemas like: `{ "type": "object", "additionalProperties": false }`
 *
 * - Older versions of schema-ddl converted this to a string (JSON) column.
 * - Newer versions of schema-ddl convert this to a `None`, i.e. do not create a column for this
 *   schema.
 *
 * Here we convert the `additionalProperties` to `true`, which tricks schema-ddl into returning
 * to the original behaviour. The rewrite is applied recursively to every nested sub-schema:
 * object properties, array items / additionalItems, and `oneOf`/`anyOf` alternatives.
 *
 * @param schema the JSON schema to rewrite
 * @return a copy of the schema in which every defined `additionalProperties` is `true`
 */
private def setAdditionalPropertiesTrue(schema: Schema): Schema = {
  // Recurse into array item schemas (both the single-schema and tuple forms)
  val items = schema.items.map {
    case ArrayProperty.Items.ListItems(li) =>
      ArrayProperty.Items.ListItems(setAdditionalPropertiesTrue(li))
    case ArrayProperty.Items.TupleItems(ti) =>
      ArrayProperty.Items.TupleItems(ti.map(setAdditionalPropertiesTrue))
  }

  // Recurse into `additionalItems` when it carries a schema; a bare boolean is kept unchanged
  val additionalItems = schema.additionalItems.map {
    case allowed: ArrayProperty.AdditionalItems.AdditionalItemsAllowed =>
      allowed
    case ArrayProperty.AdditionalItems.AdditionalItemsSchema(ais) =>
      ArrayProperty.AdditionalItems.AdditionalItemsSchema(setAdditionalPropertiesTrue(ais))
  }

  // Recurse into each named property of an object schema
  val properties = schema.properties.map { props =>
    ObjectProperty.Properties(props.value.map { case (k, v) =>
      k -> setAdditionalPropertiesTrue(v)
    })
  }

  // The actual fix: whenever `additionalProperties` is defined (true, false, or a sub-schema),
  // force it to `true` so schema-ddl falls back to the legacy string (JSON) column for objects
  // with no defined fields.  The previous code wrapped `false` here, which left the newer
  // "no column" behaviour in place and defeated the purpose of this method.
  val additionalProperties = schema.additionalProperties.map { _ =>
    ObjectProperty.AdditionalProperties.AdditionalPropertiesAllowed(true)
  }

  schema.copy(
    items                = items,
    properties           = properties,
    additionalProperties = additionalProperties,
    additionalItems      = additionalItems,
    oneOf                = schema.oneOf.map(oneOf => CommonProperties.OneOf(oneOf.value.map(setAdditionalPropertiesTrue))),
    anyOf                = schema.anyOf.map(anyOf => CommonProperties.AnyOf(anyOf.value.map(setAdditionalPropertiesTrue)))
  )
}

private def fieldFromSchema(`type`: WideRow.Type)(schema: Schema): Option[Field] = {
val fieldName = SnowplowEvent.transformSchema(`type`.snowplowEntity.toSdkProperty, `type`.schemaKey)

Expand Down

0 comments on commit a073ccf

Please sign in to comment.