diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 43dff108d9a29..b8ea7b12cf2b3 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -534,7 +534,7 @@ def test_createDataFrame_arrow_pandas(self): def test_createDataFrame_verifySchema(self): data = {"id": [1, 2, 3], "value": [100000000000, 200000000000, 300000000000]} - table = pa.table(data) + # data.value should fail schema validation when verifySchema is True schema = StructType( [StructField("id", IntegerType(), True), StructField("value", IntegerType(), True)] ) @@ -543,13 +543,29 @@ def test_createDataFrame_verifySchema(self): Row(id=2, value=-1863462912), Row(id=3, value=-647710720), ] - + # Arrow table + table = pa.table(data) df = self.spark.createDataFrame(table, schema=schema) self.assertEqual(df.collect(), expected) with self.assertRaises(Exception): self.spark.createDataFrame(table, schema=schema, verifySchema=True) + # pandas DataFrame with Arrow optimization + pdf = pd.DataFrame(data) + df = self.spark.createDataFrame(pdf, schema=schema) # verifySchema defaults to False + self.assertEqual(df.collect(), expected) + with self.assertRaises(Exception): + df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=True) + + # pandas DataFrame without Arrow optimization + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): + pdf = pd.DataFrame(data) + with self.assertRaises(Exception): + df = self.spark.createDataFrame(pdf, schema=schema) # verifySchema defaults to True + df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=False) + self.assertEqual(df.collect(), expected) + def _createDataFrame_toggle(self, data, schema=None): with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): df_no_arrow = self.spark.createDataFrame(data, schema=schema)