From 9aef014e8b2dd179381d2947b541ab760255b32a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 9 Jan 2024 15:30:15 +0100 Subject: [PATCH 1/2] Python bindings: implement __arrow_c_stream__() interface for ogr.Layer --- autotest/ogr/ogr_mem.py | 38 +++++++++++++++++++++++++++++ swig/include/ogr.i | 41 ++++++++++++++++++++++++++++++++ swig/include/python/ogr_python.i | 29 ++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py index e144ab0412e4..077363a20bf8 100755 --- a/autotest/ogr/ogr_mem.py +++ b/autotest/ogr/ogr_mem.py @@ -709,6 +709,44 @@ def test_ogr_mem_alter_geom_field_defn(): assert lyr.GetSpatialRef() is None +############################################################################### +# Test ogr.Layer.__arrow_c_stream__() interface. +# Cf https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + + +@gdaltest.enable_exceptions() +def test_ogr_mem_arrow_stream_pycapsule_interface(): + import ctypes + + ds = ogr.GetDriverByName("Memory").CreateDataSource("") + lyr = ds.CreateLayer("foo") + + stream = lyr.__arrow_c_stream__() + assert stream + t = type(stream) + assert t.__module__ == "builtins" + assert t.__name__ == "PyCapsule" + capsule_get_name = ctypes.pythonapi.PyCapsule_GetName + capsule_get_name.argtypes = [ctypes.py_object] + capsule_get_name.restype = ctypes.c_char_p + assert capsule_get_name(ctypes.py_object(stream)) == b"arrow_array_stream" + + with pytest.raises( + Exception, match="An arrow Arrow Stream is in progress on that layer" + ): + lyr.__arrow_c_stream__() + + del stream + + stream = lyr.__arrow_c_stream__() + assert stream + del stream + + with pytest.raises(Exception, match="requested_schema != None not implemented"): + # "something" should rather by a PyCapsule with an ArrowSchema... + lyr.__arrow_c_stream__(requested_schema="something") + + ############################################################################### diff --git a/swig/include/ogr.i b/swig/include/ogr.i index bfefba922e34..816097a12612 100644 --- a/swig/include/ogr.i +++ b/swig/include/ogr.i @@ -1143,6 +1143,22 @@ public: }; /* class ArrowArrayStream */ #endif +#ifdef SWIGPYTHON +// Implements __arrow_c_stream__ export interface: +// https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#create-a-pycapsule +%{ +static void ReleaseArrowArrayStreamPyCapsule(PyObject* capsule) { + struct ArrowArrayStream* stream = + (struct ArrowArrayStream*)PyCapsule_GetPointer(capsule, "arrow_array_stream"); + if (stream->release != NULL) { + stream->release(stream); + } + CPLFree(stream); +} +%} + +#endif + /************************************************************************/ /* OGRLayer */ /************************************************************************/ @@ -1507,6 +1523,31 @@ public: #ifdef SWIGPYTHON + PyObject* ExportArrowArrayStreamPyCapsule() + { + struct ArrowArrayStream* stream = + (struct ArrowArrayStream*)CPLMalloc(sizeof(struct ArrowArrayStream)); + + const int success = OGR_L_GetArrowStream(self, stream, NULL); + + PyObject* ret; + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + if( success ) + { + ret = PyCapsule_New(stream, "arrow_array_stream", ReleaseArrowArrayStreamPyCapsule); + } + else + { + CPLFree(stream); + Py_INCREF(Py_None); + ret = Py_None; + } + + SWIG_PYTHON_THREAD_END_BLOCK; + + return ret; + } + %newobject GetArrowStream; ArrowArrayStream* GetArrowStream(char** options = NULL) { struct ArrowArrayStream* stream = (struct ArrowArrayStream* )malloc(sizeof(struct ArrowArrayStream)); diff --git a/swig/include/python/ogr_python.i b/swig/include/python/ogr_python.i index 2129e8baac3b..773e391578de 100644 --- a/swig/include/python/ogr_python.i +++ b/swig/include/python/ogr_python.i @@ -411,6 +411,35 @@ def ReleaseResultSet(self, sql_lyr): schema = property(schema) + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule, according to + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + + Also note that only one active stream can be queried at a time for a + given layer. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema is not None + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + + if requested_schema is not None: + raise NotImplementedError("requested_schema != None not implemented") + + return self.ExportArrowArrayStreamPyCapsule() + + def GetArrowStreamAsPyArrow(self, options = []): """ Return an ArrowStream as PyArrow Schema and Array objects """ From e4f78bee75f9834721fa2a4e8a58a9b48524cb3b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 9 Jan 2024 17:17:34 +0100 Subject: [PATCH 2/2] Python bindings: add a ogr.Layer.GetArrowArrayStreamInterface() method --- autotest/ogr/ogr_mem.py | 10 ++++++ swig/include/ogr.i | 4 +-- swig/include/python/ogr_python.i | 55 ++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py index 077363a20bf8..2875a8fce028 100755 --- a/autotest/ogr/ogr_mem.py +++ b/autotest/ogr/ogr_mem.py @@ -746,6 +746,16 @@ def test_ogr_mem_arrow_stream_pycapsule_interface(): # "something" should rather by a PyCapsule with an ArrowSchema... lyr.__arrow_c_stream__(requested_schema="something") + # Also test GetArrowArrayStreamInterface() to be able to specify options + stream = lyr.GetArrowArrayStreamInterface( + {"INCLUDE_FID": "NO"} + ).__arrow_c_stream__() + assert stream + t = type(stream) + assert t.__module__ == "builtins" + assert t.__name__ == "PyCapsule" + del stream + ############################################################################### diff --git a/swig/include/ogr.i b/swig/include/ogr.i index 816097a12612..19b4de6e2667 100644 --- a/swig/include/ogr.i +++ b/swig/include/ogr.i @@ -1523,12 +1523,12 @@ public: #ifdef SWIGPYTHON - PyObject* ExportArrowArrayStreamPyCapsule() + PyObject* ExportArrowArrayStreamPyCapsule(char** options = NULL) { struct ArrowArrayStream* stream = (struct ArrowArrayStream*)CPLMalloc(sizeof(struct ArrowArrayStream)); - const int success = OGR_L_GetArrowStream(self, stream, NULL); + const int success = OGR_L_GetArrowStream(self, stream, options); PyObject* ret; SWIG_PYTHON_THREAD_BEGIN_BLOCK; diff --git a/swig/include/python/ogr_python.i b/swig/include/python/ogr_python.i index 773e391578de..bf27eaba9526 100644 --- a/swig/include/python/ogr_python.i +++ b/swig/include/python/ogr_python.i @@ -419,6 +419,9 @@ def ReleaseResultSet(self, sql_lyr): Also note that only one active stream can be queried at a time for a given layer. + To specify options how the ArrowStream should be generated, use + the GetArrowArrayStreamInterface(self, options) method + Parameters ---------- requested_schema : PyCapsule, default None @@ -440,6 +443,58 @@ def ReleaseResultSet(self, sql_lyr): return self.ExportArrowArrayStreamPyCapsule() + def GetArrowArrayStreamInterface(self, options = []): + """ + Return a proxy object that implements the __arrow_c_stream__() method, + but allows the user to pass options. + + Parameters + ---------- + options : List of strings or dict with options such as INCLUDE_FID=NO, MAX_FEATURES_IN_BATCH=, etc. + + Returns + ------- + a proxy object which implements the __arrow_c_stream__() method + """ + + class ArrowArrayStreamInterface: + def __init__(self, lyr, options): + self.lyr = lyr + self.options = options + + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule, according to + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + + Also note that only one active stream can be queried at a time for a + given layer. + + To specify options how the ArrowStream should be generated, use + the GetArrowArrayStreamInterface(self, options) method + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema is not None + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema != None not implemented") + + return self.lyr.ExportArrowArrayStreamPyCapsule(self.options) + + return ArrowArrayStreamInterface(self, options) + + def GetArrowStreamAsPyArrow(self, options = []): """ Return an ArrowStream as PyArrow Schema and Array objects """