Sort Delta log objects when comparing and avoid caching all logs (NVIDIA#7456)

Signed-off-by: Jason Lowe <[email protected]>

jlowe authored Jan 5, 2023
1 parent e9e605b commit 17343d5
Showing 2 changed files with 9 additions and 2 deletions.
5 changes: 3 additions & 2 deletions integration_tests/run_pyspark_from_build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -220,7 +220,8 @@ else
     export PYSP_TEST_spark_jars="${ALL_JARS//:/,}"
 fi
 
-export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=UTC $COVERAGE_SUBMIT_FLAGS"
+# Set the Delta log cache size to prevent the driver from caching every Delta log indefinitely
+export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=UTC -Ddelta.log.cacheSize=10 $COVERAGE_SUBMIT_FLAGS"
 export PYSP_TEST_spark_executor_extraJavaOptions='-ea -Duser.timezone=UTC'
 export PYSP_TEST_spark_ui_showConsoleProgress='false'
 export PYSP_TEST_spark_sql_session_timeZone='UTC'
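The `-Ddelta.log.cacheSize=10` option added above bounds Delta Lake's driver-side Delta log cache, so long test runs stop accumulating one cached log per table. The effect of such a size cap can be illustrated with a bounded LRU cache in Python (a sketch of the caching behavior only; `parse_delta_log` is a hypothetical stand-in, not Delta's implementation):

```python
from functools import lru_cache

# Hypothetical stand-in for loading a table's Delta log on the driver.
# maxsize=10 mirrors -Ddelta.log.cacheSize=10: at most 10 logs stay cached,
# and older entries are evicted instead of growing without bound.
@lru_cache(maxsize=10)
def parse_delta_log(table_path):
    return {"table": table_path, "entries": []}  # placeholder payload

# Touch 25 distinct tables, as a test suite creating many tables would
for i in range(25):
    parse_delta_log(f"/tmp/table_{i}")

# The cache stays bounded despite 25 distinct tables
assert parse_delta_log.cache_info().currsize == 10
```

Without the cap, an unbounded cache would hold all 25 entries for the lifetime of the driver.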
6 changes: 6 additions & 0 deletions integration_tests/src/main/python/delta_lake_write_test.py
@@ -90,6 +90,12 @@ def decode_jsons(json_data):
         # Skip whitespace between records
         while idx < len(json_data) and json_data[idx].isspace():
             idx += 1
+    # reorder to produce a consistent output for comparison
+    def json_to_sort_key(j):
+        keys = sorted(j.keys())
+        paths = sorted([ v.get("path", "") for v in j.values() ])
+        return ','.join(keys + paths)
+    jsons.sort(key=json_to_sort_key)
     return jsons
 
 def assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path):
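The sorting change above makes log comparison order-insensitive: `json_to_sort_key` builds a key from each record's action names and file paths, so two logs containing the same records sort identically regardless of on-disk ordering. A standalone sketch (the sample records are invented, shaped loosely like Delta log actions):

```python
def json_to_sort_key(j):
    # Build a deterministic key from each record's action names and file paths
    keys = sorted(j.keys())
    paths = sorted([v.get("path", "") for v in j.values()])
    return ','.join(keys + paths)

# Invented sample records, loosely shaped like Delta log actions
logs = [
    {"add": {"path": "part-00001-b.parquet"}},
    {"commitInfo": {"operation": "WRITE"}},
    {"add": {"path": "part-00000-a.parquet"}},
]

# The same records sort to the same sequence from any input order,
# which is what lets CPU and GPU logs be compared record-for-record
assert sorted(logs, key=json_to_sort_key) == sorted(logs[::-1], key=json_to_sort_key)
```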
