-
Notifications
You must be signed in to change notification settings - Fork 415
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Utilise struct stats when available (#656)
* Log stats * Point unittest to problematic data * Add minimal test data to reproduce * Update test data * Fix test data * Add test * Update rust logging * It actually compiles! * I don't understand rust * Debug logging * Fix sample data * Tidy * Update rust/src/action.rs Co-authored-by: Will Jones <[email protected]> * Test data with more complex types * Still return json stats if there is an error parsing parquet stats * Better error message * Unittest covering more complex types * Support parsing structs * Compare struct comes out the same as json * Correct timestamp formatting * In progress better test and support for more columns * All types except decimal work * Test data with nested structs * Update test * All working except decimal * Working decimal conversion * Update test data again * Passing test * Tidy * Tidy * Tidy * Remove .crc files * Remove unneeded return statements * Remove python test * Use from and reference instead of clone Co-authored-by: Will Jones <[email protected]> * Use into Co-authored-by: Will Jones <[email protected]> * dereference timestamp Co-authored-by: Will Jones <[email protected]> * use into Co-authored-by: Will Jones <[email protected]> * Use reference to field Co-authored-by: Will Jones <[email protected]> * de-reference date Co-authored-by: Will Jones <[email protected]> * Update rust/tests/read_delta_test.rs Co-authored-by: Will Jones <[email protected]> Co-authored-by: Will Jones <[email protected]>
- Loading branch information
1 parent
9bfa8ab
commit 8d7f3b6
Showing
31 changed files
with
144 additions
and
9 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 3 additions & 0 deletions
3
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000000.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} | ||
{"metaData":{"id":"9c4df48c-6085-4dcf-b73e-13147a5a405e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1656252116073}} | ||
{"commitInfo":{"timestamp":1656252116149,"userId":"6114986638742036","userName":"[email protected]","operation":"CREATE OR REPLACE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpoint.writeStatsAsStruct\":\"true\"}"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"af1b716b-5ec8-41f6-9cc2-bb89e010f943"}} |
3 changes: 3 additions & 0 deletions
3
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000001.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"metaData":{"id":"9c4df48c-6085-4dcf-b73e-13147a5a405e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"null\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boolean\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(8,5)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"nested_struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"struct_of_array_of_map\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1656252116073}} | ||
{"add":{"path":"part-00000-51653f4d-b029-44bd-9fda-578e73518a26-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252122000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:01.678Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:01.678Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252122000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252122901,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"4398d2b8-9a41-46ea-b92c-38820595bfec"}} |
2 changes: 2 additions & 0 deletions
2
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000002.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"add":{"path":"part-00000-a222e75a-a0b4-4e72-a776-2776ece95606-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252124000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":1,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:03.793Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":1,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:03.793Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252124000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252124212,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"26079ba4-cd65-4421-9bd3-ba0416dfb509"}} |
2 changes: 2 additions & 0 deletions
2
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000003.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"add":{"path":"part-00000-b97608c1-8d8e-4369-b067-bd84435a1606-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252125000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":2,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:05.077Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":2,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:05.077Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252125000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252125457,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"c2a690f3-3bb7-4dc2-b00e-cd81382e8b21"}} |
2 changes: 2 additions & 0 deletions
2
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000004.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"add":{"path":"part-00000-7c59d077-8928-4402-b5e6-7259f5440fd0-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252126000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":3,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:06.242Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":3,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:06.242Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252126000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252126676,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"4522c45a-ed48-4b7d-bfc6-1bad78981261"}} |
2 changes: 2 additions & 0 deletions
2
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000005.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"add":{"path":"part-00000-7264b4fa-c3d7-4e25-956f-716358f594ff-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252127000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":4,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:07.511Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":4,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:07.511Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252127000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252127961,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"f60b53dc-cb04-4e5b-a541-037c9c2cee89"}} |
2 changes: 2 additions & 0 deletions
2
rust/tests/data/delta-1.2.1-only-struct-stats/_delta_log/00000000000000000006.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"add":{"path":"part-00000-1a04af5a-e0d5-497d-9496-7bcf9af3419f-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252129000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":5,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:08.788Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":5,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:08.788Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252129000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
{"commitInfo":{"timestamp":1656252129258,"userId":"6114986638742036","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":5,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"0bf1e2ce-ef4f-4235-aa4c-918507d5097b"}} |
Oops, something went wrong.