Add Spark Connect Tests - CI & Test Suite Update #244

Merged
Commits (17; the file changes below are shown from 14 of them)
a4457ec  Create the first-version of files for Spark-Connect tests (nijanthanvijayakumar, Jul 14, 2024)
e1f0487  Address the fixtures issue in the test file (nijanthanvijayakumar, Jul 15, 2024)
c196228  Update the CI workflow to initiate the sparkconnect test on the 1.0 (nijanthanvijayakumar, Jul 15, 2024)
1e4a439  Update the poetry & pyproject with the dependencies for Spark-Connect (nijanthanvijayakumar, Jul 15, 2024)
5cdc03b  Update the CI workflow to run Spark-Connect tests only for v3.4+ (nijanthanvijayakumar, Jul 15, 2024)
37c271b  Update the script to check if Spark-Connect server is running or not (nijanthanvijayakumar, Jul 15, 2024)
b5f6749  Remove the spark-connect server run check (nijanthanvijayakumar, Jul 15, 2024)
6cdd1d2  Update workflows & pytest to choose the Sparksession instance based o… (nijanthanvijayakumar, Jul 15, 2024)
3ed7ee1  Add a TODO statement so that the spark-connect server check can be ad… (nijanthanvijayakumar, Jul 15, 2024)
8879d59  Remove the 1.0 planning branch for the CI file (nijanthanvijayakumar, Jul 15, 2024)
92ced5e  Attribute the original script that inspired this (nijanthanvijayakumar, Jul 15, 2024)
1d8a0de  Mark recently added deps as optional for Spark-Classic (nijanthanvijayakumar, Jul 15, 2024)
b44a025  Rename the spark-classic to connect & update makefile to install thes… (nijanthanvijayakumar, Jul 15, 2024)
5cf63f1  Resolve the incoming commit with the makefile and import changes from… (nijanthanvijayakumar, Jul 15, 2024)
2f2f1a8  Fix the linting issues in the linting CI workflow (nijanthanvijayakumar, Jul 15, 2024)
4b829b6  Update the files according to the review comments (nijanthanvijayakumar, Jul 15, 2024)
0da1804  Merge branch 'planning-1.0-release' into feature/issue-241-add-spark-… (nijanthanvijayakumar, Jul 15, 2024)
14 changes: 14 additions & 0 deletions .github/workflows/ci.yml

```diff
@@ -7,6 +7,7 @@ on:
   pull_request:
     branches:
       - main
+      - planning-1.0-release
   workflow_dispatch:

 jobs:
@@ -66,6 +67,19 @@ jobs:
       - name: Run tests with pytest against PySpark ${{ matrix.pyspark-version }}
         run: make test

+      - name: Run tests using Spark-Connect against PySpark ${{ matrix.pyspark-version }}
+        env:
+          HADOOP_VERSION: 3
+          SPARK_VERSION: ${{ matrix.pyspark-version }}
+          SPARK_CONNECT_MODE_ENABLE: 1
+        run: |
+          if [[ "${SPARK_VERSION}" > "3.4" ]]; then
+            sh scripts/run_spark_connect_server.sh
+            make test_spark_connect
+          else
+            echo "Skipping Spark-Connect tests for Spark version <= 3.4"
+          fi
+
   check-license-headers:
     runs-on: ubuntu-latest
     steps:
```
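One nuance in the gating step above: `[[ "${SPARK_VERSION}" > "3.4" ]]` compares the strings lexicographically. That happens to order the current 3.x release line correctly (both "3.4.1" and "3.5" sort after "3.4"), but it would misorder a hypothetical "3.10". A minimal Python sketch of a version-aware gate, assuming the third-party `packaging` library is available; the PR itself uses the shell comparison:

```python
# Sketch only: version-aware gating for the Spark-Connect test job.
# Assumes the third-party `packaging` library; not part of this PR.
from packaging.version import Version


def should_run_connect_tests(spark_version: str) -> bool:
    """Spark Connect ships with Spark 3.4+, so gate the test job on that."""
    return Version(spark_version) >= Version("3.4")


assert should_run_connect_tests("3.5.1")
assert not should_run_connect_tests("3.3.2")
```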
23 changes: 18 additions & 5 deletions Makefile

```diff
@@ -3,8 +3,8 @@
 all: help

 .PHONY: install_test
-install_test: ## Install test dependencies
-	@poetry install --with=development,testing
+install_test:
+	@poetry install --with=development,testing --extras connect

 .PHONY: install_deps
 install_deps: ## Install all dependencies
@@ -15,8 +15,12 @@ update_deps: ## Update dependencies
 	@poetry update --with=development,linting,testing,docs

 .PHONY: test
-test: ## Run the unit tests
-	@poetry run pytest tests
+test:
+	@poetry run pytest tests -k "not test_spark_connect.py"
+
+.PHONY: test_spark_connect
+test_spark_connect:
+	@poetry run pytest tests/test_spark_connect.py

 .PHONY: lint
 lint: ## Lint the code
@@ -28,7 +32,16 @@ format: ## Format the code

 # Inspired by https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
 .PHONY: help
-help:
-	@echo '................... Quinn ..........................'
-	@echo 'help - print that message'
-	@echo 'lint - run linter'
-	@echo 'format - reformat the code'
-	@echo 'test - run tests'
-	@echo 'install_test - install test deps'
-	@echo 'install_deps - install dev deps'
-	@echo 'update_deps - update and install deps'
+help: ## Show help for the commands
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

 .DEFAULT_GOAL := help
```
307 changes: 283 additions & 24 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions pyproject.toml

```diff
@@ -24,6 +24,15 @@ build-backend = "poetry.masonry.api"
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"

+# Below are the optional dependencies
+pyarrow = "13.0.0"
+pandas = { version = "^1.5.3", optional = true }
+numpy = { version = "^1.21.0", optional = true }
+grpcio = { version = "^1.48.1", optional = true }
+grpcio-status = { version = "^1.64.1", optional = true }
+
+[tool.poetry.extras]
+connect = ["pyarrow", "pandas", "numpy", "grpcio", "grpcio-status"]

 ###########################################################################
 # DEPENDENCY GROUPS
@@ -102,8 +111,6 @@ ignore = [
 ]

 [tool.ruff.lint.per-file-ignores]
-"quinn/extensions/column_ext.py" = ["FBT003", "N802"]
-"quinn/extensions/__init__.py" = ["F401", "F403"]
 "quinn/__init__.py" = ["F401", "F403"]
 "quinn/functions.py" = ["FBT003"]
 "quinn/keyword_finder.py" = ["A002"]
```
25 changes: 25 additions & 0 deletions scripts/run_spark_connect_server.sh

```diff
@@ -0,0 +1,25 @@
+#!/usr/bin/bash
+
+# This script was inspired by https://github.com/pyspark-ai/pyspark-ai/blob/master/run_spark_connect.sh
+
+# The Hadoop and Spark versions are set as environment variables for this script.
+echo "The HADOOP_VERSION is $HADOOP_VERSION"
+echo "The SPARK_VERSION is $SPARK_VERSION"
+
+# Download the Spark binaries; fail with an error message if the download fails.
+if ! wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz; then
+    echo "Error: Unable to download Spark binaries"
+    exit 1
+fi
+
+# Extract the downloaded Spark binaries and check whether the extraction succeeded.
+if ! tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz; then
+    echo "Error: Unable to extract Spark binaries"
+    exit 1
+fi
+
+# Start the Spark-Connect server.
+echo "Starting the Spark-Connect server"
+./spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION/sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:$SPARK_VERSION
+
+# TODO: Check whether the server is running (maybe using netstat) and throw an error message if it is not.
```
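The TODO above could be closed with a small readiness probe rather than netstat. A minimal sketch in Python, assuming the Connect server's default port 15002; the helper below is hypothetical and not part of this PR:

```python
# Sketch only: poll until the Spark-Connect server accepts TCP connections
# on the default port (15002), failing after a timeout. Not part of this PR.
import socket
import sys
import time


def wait_for_port(host: str = "localhost", port: int = 15002, timeout: float = 60.0) -> bool:
    """Return True once host:port accepts a TCP connection, False on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True
        except OSError:
            time.sleep(2)  # server not up yet; retry shortly
    return False


if __name__ == "__main__":
    if not wait_for_port():
        print("Error: Spark-Connect server did not come up on port 15002")
        sys.exit(1)
    print("Spark-Connect server is listening on port 15002")
```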
6 changes: 5 additions & 1 deletion tests/spark.py

```diff
@@ -1,3 +1,7 @@
+import os
 from pyspark.sql import SparkSession

-spark = SparkSession.builder.master("local").appName("chispa").getOrCreate()
+if "SPARK_CONNECT_MODE_ENABLE" in os.environ:
+    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+else:
+    spark = SparkSession.builder.master("local").appName("chispa").getOrCreate()
```
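Because `tests/spark.py` picks the session at import time, `SPARK_CONNECT_MODE_ENABLE` must be set before anything imports the module. A session-scoped pytest fixture is one alternative; the sketch below is hypothetical, assumes the same default Connect endpoint, and is not what the PR does:

```python
# Sketch only: a session-scoped pytest fixture as an alternative to the
# import-time selection above. Hypothetical; not part of this PR.
import os

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    if os.environ.get("SPARK_CONNECT_MODE_ENABLE"):
        # Connect to a running Spark-Connect server on the default port.
        return SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    # Fall back to a local, in-process session for classic PySpark.
    return SparkSession.builder.master("local").appName("chispa").getOrCreate()
```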
20 changes: 20 additions & 0 deletions tests/test_spark_connect.py

```diff
@@ -0,0 +1,20 @@
+import chispa
+from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+import quinn
+from .spark import spark
+
+
+def test_create_df():
+    rows_data = [("abc", 1), ("lu", 2), ("torrence", 3)]
+    col_specs = [("name", StringType()), ("age", IntegerType())]
+
+    expected_schema = StructType(
+        [
+            StructField("name", StringType(), True),
+            StructField("age", IntegerType(), True),
+        ],
+    )
+    actual = quinn.create_df(spark, rows_data, col_specs)
+    expected = spark.createDataFrame(rows_data, expected_schema)
+    chispa.assert_df_equality(actual, expected)
```
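Further Connect-mode coverage can follow the same import-and-assert pattern. The sketch below is a hypothetical companion test that sticks to plain PySpark and chispa APIs; it is not part of this PR:

```python
# Sketch only: a possible companion test exercising a plain DataFrame
# transformation under the same session. Hypothetical; not part of this PR.
import chispa
from pyspark.sql.functions import upper

from .spark import spark


def test_with_column_upper():
    df = spark.createDataFrame([("abc",), ("lu",)], ["name"])
    actual = df.withColumn("name_upper", upper("name"))
    expected = spark.createDataFrame(
        [("abc", "ABC"), ("lu", "LU")],
        ["name", "name_upper"],
    )
    chispa.assert_df_equality(actual, expected)
```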