Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement "inline schemas": ability to add type hints into the type providers' source documents #1447

Merged
merged 11 commits into from
Aug 8, 2022
Merged
27 changes: 18 additions & 9 deletions docs/tutorials/JsonAnonymizer.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,25 @@ type JsonAnonymizer(?propertiesToSkip, ?valuesToSkip) =
let randomize (str:string) =
String(str.ToCharArray() |> Array.map getRandomChar)

// Returns true when `typ` is an `InferedType.Primitive` whose first field (the
// primitive's Type) equals `testType`; every other InferedType case yields false.
let isType testType typ =
match typ with
// The pattern variable `typ` shadows the parameter: inside this case it is the Type carried by Primitive.
| Runtime.StructuralTypes.InferedType.Primitive (typ, _, _, _) -> typ = testType
| _ -> false

let rec anonymize json =
match json with
| JsonValue.String s when valuesToSkip.Contains s -> json
| JsonValue.String s ->
let typ =
Runtime.StructuralInference.inferPrimitiveType
CultureInfo.InvariantCulture s

( if typ = typeof<Guid> then Guid.NewGuid().ToString()
elif typ = typeof<Runtime.StructuralTypes.Bit0> ||
typ = typeof<Runtime.StructuralTypes.Bit1> then s
elif typ = typeof<DateTime> then s
Runtime.StructuralInference.defaultUnitsOfMeasureProvider
Runtime.StructuralInference.InferenceMode'.ValuesOnly
CultureInfo.InvariantCulture s None

( if typ |> isType typeof<Guid> then Guid.NewGuid().ToString()
elif typ |> isType typeof<Runtime.StructuralTypes.Bit0> ||
typ |> isType typeof<Runtime.StructuralTypes.Bit1> then s
elif typ |> isType typeof<DateTime> then s
else
let prefix, s =
if s.StartsWith "http://" then
Expand All @@ -92,9 +99,11 @@ type JsonAnonymizer(?propertiesToSkip, ?valuesToSkip) =
| JsonValue.Number d ->
let typ =
Runtime.StructuralInference.inferPrimitiveType
CultureInfo.InvariantCulture (d.ToString())
if typ = typeof<Runtime.StructuralTypes.Bit0> ||
typ = typeof<Runtime.StructuralTypes.Bit1> then json
Runtime.StructuralInference.defaultUnitsOfMeasureProvider
Runtime.StructuralInference.InferenceMode'.ValuesOnly
CultureInfo.InvariantCulture (d.ToString()) None
if typ |> isType typeof<Runtime.StructuralTypes.Bit0> ||
typ |> isType typeof<Runtime.StructuralTypes.Bit1> then json
else d.ToString() |> randomize |> Decimal.Parse |> JsonValue.Number
| JsonValue.Float f ->
f.ToString()
Expand Down
399 changes: 316 additions & 83 deletions src/CommonRuntime/StructuralInference.fs

Large diffs are not rendered by default.

28 changes: 16 additions & 12 deletions src/CommonRuntime/StructuralTypes.fs
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ type InferedTypeTag =
/// to generate nicer types!
[<CustomEquality; NoComparison; RequireQualifiedAccess>]
type InferedType =
| Primitive of typ: Type * unit: option<System.Type> * optional: bool
| Primitive of typ: Type * unit: option<System.Type> * optional: bool * shouldOverrideOnMerge: bool
| Record of name: string option * fields: InferedProperty list * optional: bool
| Json of typ: InferedType * optional: bool
| Collection of order: InferedTypeTag list * types: Map<InferedTypeTag, InferedMultiplicity * InferedType>
| Heterogeneous of types: Map<InferedTypeTag, InferedType>
| Heterogeneous of types: Map<InferedTypeTag, InferedType> * containsOptional: bool
| Null
| Top

Expand All @@ -86,16 +86,17 @@ type InferedType =
member x.EnsuresHandlesMissingValues allowEmptyValues =
match x with
| Null
| Heterogeneous _
| Heterogeneous(containsOptional = true)
| Primitive(optional = true)
| Record(optional = true)
| Json(optional = true) -> x
| Primitive (typ, _, false) when
| Primitive (typ, _, false, _) when
allowEmptyValues
&& InferedType.CanHaveEmptyValues typ
->
x
| Primitive (typ, unit, false) -> Primitive(typ, unit, true)
| Heterogeneous (map, false) -> Heterogeneous(map, true)
| Primitive (typ, unit, false, overrideOnMerge) -> Primitive(typ, unit, true, overrideOnMerge)
| Record (name, props, false) -> Record(name, props, true)
| Json (typ, false) -> Json(typ, true)
| Collection (order, types) ->
Expand All @@ -106,12 +107,15 @@ type InferedType =
Collection(order, typesR)
| Top -> failwith "EnsuresHandlesMissingValues: unexpected InferedType.Top"

member x.DropOptionality() =
member x.GetDropOptionality() =
match x with
| Primitive (typ, unit, true) -> Primitive(typ, unit, false)
| Record (name, props, true) -> Record(name, props, false)
| Json (typ, true) -> Json(typ, false)
| _ -> x
| Primitive (typ, unit, true, overrideOnMerge) -> Primitive(typ, unit, false, overrideOnMerge), true
| Record (name, props, true) -> Record(name, props, false), true
| Json (typ, true) -> Json(typ, false), true
| Heterogeneous (map, true) -> Heterogeneous(map, false), true
| _ -> x, false

member x.DropOptionality() = x.GetDropOptionality() |> fst

// We need to implement custom equality that returns 'true' when
// values reference the same object (to support recursive types)
Expand All @@ -121,11 +125,11 @@ type InferedType =
if y :? InferedType then
match x, y :?> InferedType with
| a, b when Object.ReferenceEquals(a, b) -> true
| Primitive (t1, ot1, b1), Primitive (t2, ot2, b2) -> t1 = t2 && ot1 = ot2 && b1 = b2
| Primitive (t1, ot1, b1, x1), Primitive (t2, ot2, b2, x2) -> t1 = t2 && ot1 = ot2 && b1 = b2 && x1 = x2
| Record (s1, pl1, b1), Record (s2, pl2, b2) -> s1 = s2 && pl1 = pl2 && b1 = b2
| Json (t1, o1), Json (t2, o2) -> t1 = t2 && o1 = o2
| Collection (o1, t1), Collection (o2, t2) -> o1 = o2 && t1 = t2
| Heterogeneous (m1), Heterogeneous (m2) -> m1 = m2
| Heterogeneous (m1, o1), Heterogeneous (m2, o2) -> m1 = m2 && o1 = o2
| Null, Null
| Top, Top -> true
| _ -> false
Expand Down
160 changes: 69 additions & 91 deletions src/Csv/CsvInference.fs
Original file line number Diff line number Diff line change
Expand Up @@ -10,46 +10,34 @@ open FSharp.Data.Runtime
open FSharp.Data.Runtime.StructuralTypes
open FSharp.Data.Runtime.StructuralInference

/// The schema may be set explicitly. This table specifies the mapping
/// from the names that users can use to the types used.
let private nameToType =
[ "int", (typeof<int>, TypeWrapper.None)
"int64", (typeof<int64>, TypeWrapper.None)
"bool", (typeof<bool>, TypeWrapper.None)
"float", (typeof<float>, TypeWrapper.None)
"decimal", (typeof<decimal>, TypeWrapper.None)
"date", (typeof<DateTime>, TypeWrapper.None)
"datetimeoffset", (typeof<DateTimeOffset>, TypeWrapper.None)
"timespan", (typeof<TimeSpan>, TypeWrapper.None)
"guid", (typeof<Guid>, TypeWrapper.None)
"string", (typeof<String>, TypeWrapper.None)
"int?", (typeof<int>, TypeWrapper.Nullable)
"int64?", (typeof<int64>, TypeWrapper.Nullable)
"bool?", (typeof<bool>, TypeWrapper.Nullable)
"float?", (typeof<float>, TypeWrapper.Nullable)
"decimal?", (typeof<decimal>, TypeWrapper.Nullable)
"date?", (typeof<DateTime>, TypeWrapper.Nullable)
"datetimeoffset?", (typeof<DateTimeOffset>, TypeWrapper.Nullable)
"timespan?", (typeof<TimeSpan>, TypeWrapper.Nullable)
"guid?", (typeof<Guid>, TypeWrapper.Nullable)
"int option", (typeof<int>, TypeWrapper.Option)
"int64 option", (typeof<int64>, TypeWrapper.Option)
"bool option", (typeof<bool>, TypeWrapper.Option)
"float option", (typeof<float>, TypeWrapper.Option)
"decimal option", (typeof<decimal>, TypeWrapper.Option)
"date option", (typeof<DateTime>, TypeWrapper.Option)
"datetimeoffset option", (typeof<DateTimeOffset>, TypeWrapper.Option)
"timespan option", (typeof<TimeSpan>, TypeWrapper.Option)
"guid option", (typeof<Guid>, TypeWrapper.Option)
"string option", (typeof<string>, TypeWrapper.Option) ]
/// Maps the type names that users can write in a CsvProvider schema to the pair
/// (type, TypeWrapper) used for that column.
/// The table starts from every entry of StructuralInference.nameToType (the mapping used for
/// inline schemas) and appends the CSV-only spellings: a "?" suffix for TypeWrapper.Nullable
/// and an " option" suffix for TypeWrapper.Option.
/// Note that "string option" is listed but there is no "string?" entry.
/// NOTE(review): all keys added here are lower-case; presumably the lookup lower-cases user input first - confirm in the caller.
let private nameToTypeForCsv =
[ for KeyValue (k, v) in StructuralInference.nameToType -> k, v ]
@ [ "int?", (typeof<int>, TypeWrapper.Nullable)
"int64?", (typeof<int64>, TypeWrapper.Nullable)
"bool?", (typeof<bool>, TypeWrapper.Nullable)
"float?", (typeof<float>, TypeWrapper.Nullable)
"decimal?", (typeof<decimal>, TypeWrapper.Nullable)
"date?", (typeof<DateTime>, TypeWrapper.Nullable)
"datetimeoffset?", (typeof<DateTimeOffset>, TypeWrapper.Nullable)
"timespan?", (typeof<TimeSpan>, TypeWrapper.Nullable)
"guid?", (typeof<Guid>, TypeWrapper.Nullable)
"int option", (typeof<int>, TypeWrapper.Option)
"int64 option", (typeof<int64>, TypeWrapper.Option)
"bool option", (typeof<bool>, TypeWrapper.Option)
"float option", (typeof<float>, TypeWrapper.Option)
"decimal option", (typeof<decimal>, TypeWrapper.Option)
"date option", (typeof<DateTime>, TypeWrapper.Option)
"datetimeoffset option", (typeof<DateTimeOffset>, TypeWrapper.Option)
"timespan option", (typeof<TimeSpan>, TypeWrapper.Option)
"guid option", (typeof<Guid>, TypeWrapper.Option)
"string option", (typeof<string>, TypeWrapper.Option) ]
// Turn the combined list of (name, value) pairs into a dictionary keyed by name.
|> dict

/// Lazily-constructed regex for a schema item written as <c>name(type)</c>.
/// The named group "name" captures the part before the parentheses and "type" the text inside them.
/// The regex is compiled and evaluated right-to-left (RegexOptions.RightToLeft).
let private nameAndTypeRegex =
lazy Regex(@"^(?<name>.+)\((?<type>.+)\)$", RegexOptions.Compiled ||| RegexOptions.RightToLeft)

/// Lazily-constructed regex for a type specification written as <c>type&lt;unit&gt;</c>.
/// The named group "type" captures the part before the angle brackets and "unit" the text inside them.
/// The regex is compiled and evaluated right-to-left (RegexOptions.RightToLeft).
let private typeAndUnitRegex =
lazy Regex(@"^(?<type>.+)<(?<unit>.+)>$", RegexOptions.Compiled ||| RegexOptions.RightToLeft)

let private overrideByNameRegex =
lazy
Regex(
Expand All @@ -65,56 +53,15 @@ type private SchemaParseResult =
| FullByName of property: PrimitiveInferedProperty * originalName: string
| Rename of name: string * originalName: string

/// Converts a (success flag, value) pair - the shape produced by TryGetValue-style calls -
/// into an option: Some value when the flag is true, None when it is false.
let private asOption =
function
| true, x -> Some x
| false, _ -> None

/// <summary>
/// Parses type specification in the schema for a single column.
/// This can be of the form: <c>type|measure|type&lt;measure&gt;</c>
/// </summary>
/// <returns>
/// A pair <c>(type, unit)</c> where either component may be <c>None</c>.
/// In the <c>type&lt;measure&gt;</c> form an unknown type name yields <c>(None, None)</c>,
/// while a known type with an unrecognised measure fails via <c>failwithf</c>.
/// </returns>
let private parseTypeAndUnit unitsOfMeasureProvider str =
let m = typeAndUnitRegex.Value.Match(str)

if m.Success then
// type<unit> case, both type and unit have to be valid
let typ =
// The type name is right-trimmed and lower-cased before the table lookup.
m.Groups.["type"].Value.TrimEnd().ToLowerInvariant()
|> nameToType.TryGetValue
|> asOption

match typ with
// Unknown type name: report nothing at all; the unit part is not even parsed.
| None -> None, None
| Some typ ->
let unitName = m.Groups.["unit"].Value.Trim()
let unit = StructuralInference.parseUnitOfMeasure unitsOfMeasureProvider unitName

// A recognised type followed by an unparseable unit is treated as an error, not silently dropped.
if unit.IsNone then
failwithf "Invalid unit of measure %s" unitName
else
Some typ, unit
else
// it is not a full type with unit, so it can be either type or a unit
let typ =
// Here the whole input is lower-cased (without trimming) before the table lookup.
str.ToLowerInvariant()
|> nameToType.TryGetValue
|> asOption

match typ with
| Some (typ, typWrapper) ->
// Just type
Some(typ, typWrapper), None
| None ->
// Just unit (or nothing)
None, StructuralInference.parseUnitOfMeasure unitsOfMeasureProvider str

/// Parse schema specification for column. This can either be a name
/// with type or just type: name (typeInfo)|typeInfo.
/// If forSchemaOverride is set to true, only Full or Name is returned
/// (if we succeed we override the inferred schema, otherwise, we just
/// override the header name)
let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
let parseTypeAndUnit =
StructuralInference.parseTypeAndUnit unitsOfMeasureProvider nameToTypeForCsv

let name, typ, unit, isOverrideByName, originalName =
let m = overrideByNameRegex.Value.Match str

Expand All @@ -123,7 +70,7 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
let originalName = m.Groups.["name"].Value.TrimEnd()
let newName = m.Groups.["newName"].Value.Trim()
let typeAndUnit = m.Groups.["type"].Value.Trim()
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider typeAndUnit
let typ, unit = parseTypeAndUnit typeAndUnit

if typ.IsNone && typeAndUnit <> "" then
failwithf "Invalid type: %s" typeAndUnit
Expand All @@ -136,11 +83,11 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
// name (type|measure|type<measure>)
let name = m.Groups.["name"].Value.TrimEnd()
let typeAndUnit = m.Groups.["type"].Value.Trim()
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider typeAndUnit
let typ, unit = parseTypeAndUnit typeAndUnit
name, typ, unit, false, ""
elif forSchemaOverride then
// type|type<measure>
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider str
let typ, unit = parseTypeAndUnit str

match typ, unit with
| None, _ -> str, None, None, false, ""
Expand All @@ -162,18 +109,26 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
| None, Some _ when forSchemaOverride -> SchemaParseResult.Name str
| None, Some unit -> SchemaParseResult.NameAndUnit(name, unit)

let internal inferCellType preferOptionals missingValues cultureInfo unit (value: string) =
let internal inferCellType
unitsOfMeasureProvider
preferOptionals
missingValues
inferenceMode
cultureInfo
unit
(value: string)
=
// Explicit missing values (NaN, NA, empty string, etc.) are treated as float unless preferOptionals is set to true
if Array.exists (value.Trim() |> (=)) missingValues then
if preferOptionals then
InferedType.Null
else
InferedType.Primitive(typeof<float>, unit, false)
InferedType.Primitive(typeof<float>, unit, false, false)
// If there's only whitespace between commas, treat it as a missing value and not as a string
elif String.IsNullOrWhiteSpace value then
InferedType.Null
else
getInferedTypeFromString cultureInfo value unit
StructuralInference.getInferedTypeFromString unitsOfMeasureProvider inferenceMode cultureInfo value unit

let internal parseHeaders headers numberOfColumns schema unitsOfMeasureProvider =

Expand Down Expand Up @@ -282,9 +237,11 @@ let internal inferType
(rows: seq<_>)
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
=

// If we have no data, generate one empty row with empty strings,
Expand Down Expand Up @@ -328,7 +285,15 @@ let internal inferType
let typ =
match schema with
| Some _ -> InferedType.Null // this will be ignored, so just return anything
| None -> inferCellType preferOptionals missingValues cultureInfo unit value
| None ->
inferCellType
unitsOfMeasureProvider
preferOptionals
missingValues
inferenceMode
cultureInfo
unit
value

{ Name = name; Type = typ } ]

Expand Down Expand Up @@ -377,7 +342,7 @@ let internal getFields preferOptionals inferedType schema =
field.Name, field.Name

match field.Type with
| InferedType.Primitive (typ, unit, optional) ->
| InferedType.Primitive (typ, unit, optional, _) ->

// Transform the types as described above
let typ, typWrapper =
Expand Down Expand Up @@ -420,11 +385,23 @@ let internal inferColumnTypes
rows
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
=
inferType headerNamesAndUnits schema rows inferRows missingValues cultureInfo assumeMissingValues preferOptionals
inferType
headerNamesAndUnits
schema
rows
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
||> getFields preferOptionals

type CsvFile with
Expand All @@ -442,14 +419,13 @@ type CsvFile with
(
inferRows,
missingValues,
inferenceMode,
cultureInfo,
schema,
assumeMissingValues,
preferOptionals,
[<Optional>] ?unitsOfMeasureProvider
unitsOfMeasureProvider
) =
let unitsOfMeasureProvider =
defaultArg unitsOfMeasureProvider defaultUnitsOfMeasureProvider

let headerNamesAndUnits, schema =
parseHeaders x.Headers x.NumberOfColumns schema unitsOfMeasureProvider
Expand All @@ -460,6 +436,8 @@ type CsvFile with
(x.Rows |> Seq.map (fun row -> row.Columns))
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
Loading