Add Spark Connect Tests - CI & Test Suite Update #244

Merged
Changes from all commits (17 commits):
a4457ec: Create the first-version of files for Spark-Connect tests (nijanthanvijayakumar, Jul 14, 2024)
e1f0487: Address the fixtures issue in the test file (nijanthanvijayakumar, Jul 15, 2024)
c196228: Update the CI workflow to initiate the sparkconnect test on the 1.0 (nijanthanvijayakumar, Jul 15, 2024)
1e4a439: Update the poetry & pyproject with the dependencies for Spark-Connect (nijanthanvijayakumar, Jul 15, 2024)
5cdc03b: Update the CI workflow to run Spark-Connect tests only for v3.4+ (nijanthanvijayakumar, Jul 15, 2024)
37c271b: Update the script to check if Spark-Connect server is running or not (nijanthanvijayakumar, Jul 15, 2024)
b5f6749: Remove the spark-connect server run check (nijanthanvijayakumar, Jul 15, 2024)
6cdd1d2: Update workflows & pytest to choose the Sparksession instance based o… (nijanthanvijayakumar, Jul 15, 2024)
3ed7ee1: Add a TODO statement so that the spark-connect server check can be ad… (nijanthanvijayakumar, Jul 15, 2024)
8879d59: Remove the 1.0 planning branch for the CI file (nijanthanvijayakumar, Jul 15, 2024)
92ced5e: Attribute the original script that inspired this (nijanthanvijayakumar, Jul 15, 2024)
1d8a0de: Mark recently added deps as optional for Spark-Classic (nijanthanvijayakumar, Jul 15, 2024)
b44a025: Rename the spark-classic to connect & update makefile to install thes… (nijanthanvijayakumar, Jul 15, 2024)
5cf63f1: Resolve the incoming commit with the makefile and import changes from… (nijanthanvijayakumar, Jul 15, 2024)
2f2f1a8: Fix the linting issues in the linting CI workflow (nijanthanvijayakumar, Jul 15, 2024)
4b829b6: Update the files according to the review comments (nijanthanvijayakumar, Jul 15, 2024)
0da1804: Merge branch 'planning-1.0-release' into feature/issue-241-add-spark-… (nijanthanvijayakumar, Jul 15, 2024)
13 changes: 13 additions & 0 deletions .github/workflows/ci.yml
@@ -7,6 +7,7 @@ on:
  pull_request:
    branches:
      - main
+      - planning-1.0-release
  workflow_dispatch:

jobs:
@@ -66,6 +67,18 @@ jobs:
      - name: Run tests with pytest against PySpark ${{ matrix.pyspark-version }}
        run: make test

+      - name: Run tests using Spark-Connect against PySpark ${{ matrix.pyspark-version }}
+        env:
+          SPARK_VERSION: ${{ matrix.pyspark-version }}
+          SPARK_CONNECT_MODE_ENABLED: 1
+        run: |
+          if [[ "${SPARK_VERSION}" > "3.4" ]]; then
+            sh scripts/run_spark_connect_server.sh
+            # The tests should be called from here.
+          else
+            echo "Skipping Spark-Connect tests for Spark version <= 3.4"
+          fi

  check-license-headers:
    runs-on: ubuntu-latest
    steps:
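A note on the version gate in the Spark-Connect step above (an observation, not part of the PR): [[ "${SPARK_VERSION}" > "3.4" ]] is a lexicographic string comparison in bash, which orders the same way as Python string comparison. It works for the single-digit minor versions in this matrix, but it would misjudge a two-digit minor such as "3.10", and it excludes "3.4" itself, which matches the "<= 3.4" message on the else branch:

    # Illustration only (not from the PR): lexicographic version comparison,
    # shown with Python string comparisons, which order the same way as
    # bash's [[ "$a" > "$b" ]].
    assert "3.5" > "3.4"            # single-digit minors compare as expected
    assert not ("3.10" > "3.4")     # "1" < "4", so "3.10" sorts before "3.4"
    assert not ("3.4" > "3.4")      # strict inequality: exactly 3.4 is skipped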
8 changes: 4 additions & 4 deletions Makefile
@@ -3,8 +3,8 @@
all: help

.PHONY: install_test
-install_test: ## Install test dependencies
-	@poetry install --with=development,testing
+install_test: ## Install the 'dev, test and extras' dependencies
+	@poetry install --with=development,testing --extras connect

.PHONY: install_deps
install_deps: ## Install all dependencies
@@ -15,7 +15,7 @@ update_deps: ## Update dependencies
	@poetry update --with=development,linting,testing,docs

.PHONY: test
-test: ## Run the unit tests
+test: ## Run all tests
	@poetry run pytest tests

.PHONY: lint
@@ -31,4 +31,4 @@ format: ## Format the code
help: ## Show help for the commands
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

-.DEFAULT_GOAL := help
\ No newline at end of file
+.DEFAULT_GOAL := help
307 changes: 283 additions & 24 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 10 additions & 3 deletions pyproject.toml
@@ -24,6 +24,15 @@ build-backend = "poetry.masonry.api"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"

+# Below are the optional dependencies
+pyarrow = "13.0.0"
+pandas = { version = "^1.5.3", optional = true }
+numpy = { version = "^1.21.0", optional = true }
+grpcio = { version = "^1.48.1", optional = true }
+grpcio-status = { version = "^1.64.1", optional = true }
+
+[tool.poetry.extras]
+connect = ["pyarrow", "pandas", "numpy", "grpcio", "grpcio-status"]

###########################################################################
# DEPENDENCY GROUPS
@@ -102,11 +111,9 @@ ignore = [
]

[tool.ruff.lint.per-file-ignores]
-"quinn/extensions/column_ext.py" = ["FBT003", "N802"]
-"quinn/extensions/__init__.py" = ["F401", "F403"]
"quinn/__init__.py" = ["F401", "F403"]
"quinn/functions.py" = ["FBT003"]
"quinn/keyword_finder.py" = ["A002"]

-[tool.ruff.isort]
+[tool.ruff.lint.isort]
required-imports = ["from __future__ import annotations"]
2 changes: 1 addition & 1 deletion quinn/schema_helpers.py
@@ -131,7 +131,7 @@ def _lookup_type(type_str: str) -> T.DataType:

    return type_lookup[type_str]

-def _convert_nullable(null_str: Optional[str]) -> bool:
+def _convert_nullable(null_str: str | None) -> bool:
    if null_str is None:
        return True
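Context for the annotation change above (background, not part of the diff): on Python 3.9, which pyproject.toml still supports, a PEP 604 union like str | None would fail at runtime if the annotation were evaluated eagerly, since the | operator on types only exists from Python 3.10. It is safe here because from __future__ import annotations postpones annotation evaluation, and the ruff required-imports setting shown above enforces that import in every file. A minimal illustration:

    # Illustration only: PEP 604 unions in annotations work on Python 3.9
    # once postponed evaluation is enabled.
    from __future__ import annotations


    def greet(name: str | None) -> str:
        return f"hello, {name}" if name is not None else "hello, world"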
24 changes: 24 additions & 0 deletions scripts/run_spark_connect_server.sh
@@ -0,0 +1,24 @@
#!/usr/bin/bash

# This script was inspired by https://github.com/pyspark-ai/pyspark-ai/blob/master/run_spark_connect.sh

# The Spark version is set as an environment variable for this script.
echo "The SPARK_VERSION is $SPARK_VERSION"

# Download the spark binaries. If the download fails, throw an error message
if ! wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz; then
    echo "Error: Unable to download Spark binaries"
    exit 1
fi

# Extract the downloaded spark binaries and check if the extraction is successful or not
if ! tar -xzf spark-$SPARK_VERSION-bin-hadoop3.tgz; then
    echo "Error: Unable to extract Spark binaries"
    exit 1
fi

# Start the Spark server
echo "Starting the Spark-Connect server"
./spark-$SPARK_VERSION-bin-hadoop3/sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:$SPARK_VERSION

# TODO: Check if the server is running or not (maybe using netstat) and throw an error message if it is not running
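The script's closing TODO is left open in this PR. One possible way to close it (a hypothetical sketch, not part of the merged change, and using a plain TCP probe rather than the netstat idea the TODO mentions) is to poll the Spark Connect default port, 15002, the same port the tests connect to, until the server accepts connections:

    # wait_for_spark_connect.py: hypothetical readiness probe, not part of this PR.
    # Polls the Spark Connect default port until a TCP connection succeeds or the
    # timeout expires; exits non-zero on failure so CI can abort early.
    import socket
    import sys
    import time


    def wait_for_spark_connect(host: str = "localhost", port: int = 15002,
                               timeout: float = 60.0) -> bool:
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with socket.create_connection((host, port), timeout=2):
                    return True  # server is accepting connections
            except OSError:
                time.sleep(2)  # not up yet; retry shortly
        return False


    if __name__ == "__main__":
        sys.exit(0 if wait_for_spark_connect() else 1)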
6 changes: 5 additions & 1 deletion tests/spark.py
@@ -1,3 +1,7 @@
+import os
from pyspark.sql import SparkSession

-spark = SparkSession.builder.master("local").appName("chispa").getOrCreate()
+if "SPARK_CONNECT_MODE_ENABLED" in os.environ:
+    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+else:
+    spark = SparkSession.builder.master("local").appName("chispa").getOrCreate()
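The PR exposes the session as a module-level spark object. An alternative shape (a hypothetical sketch, not what this PR does) is a session-scoped pytest fixture that applies the same SPARK_CONNECT_MODE_ENABLED switch and tears the session down when the run ends; tests would then take spark_session as an argument instead of importing spark:

    # conftest.py sketch (hypothetical): same selection logic as tests/spark.py,
    # wrapped in a session-scoped fixture with explicit teardown.
    import os

    import pytest
    from pyspark.sql import SparkSession


    @pytest.fixture(scope="session")
    def spark_session():
        if "SPARK_CONNECT_MODE_ENABLED" in os.environ:
            session = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
        else:
            session = SparkSession.builder.master("local").appName("chispa").getOrCreate()
        yield session
        session.stop()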
20 changes: 20 additions & 0 deletions tests/test_spark_connect.py
@@ -0,0 +1,20 @@
import chispa
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

import quinn
from .spark import spark


def test_create_df():
    rows_data = [("abc", 1), ("lu", 2), ("torrence", 3)]
    col_specs = [("name", StringType()), ("age", IntegerType())]

    expected_schema = StructType(
        [
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
        ],
    )
    actual = quinn.create_df(spark, rows_data, col_specs)
    expected = spark.createDataFrame(rows_data, expected_schema)
    chispa.assert_df_equality(actual, expected)
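For readers new to quinn: create_df builds a schema from the (name, type) pairs in col_specs and then delegates to spark.createDataFrame. Assuming that behaviour (a sketch for illustration, with fields left nullable as in the expected schema above, reusing the test's rows_data and col_specs), the call is roughly equivalent to:

    # Rough plain-PySpark equivalent of quinn.create_df(spark, rows_data, col_specs);
    # assumes col_specs is a list of (column_name, data_type) pairs as in the test.
    from pyspark.sql.types import StructField, StructType

    schema = StructType([StructField(name, dtype, True) for name, dtype in col_specs])
    df = spark.createDataFrame(rows_data, schema)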