From 3652e933aef44e0d604bcde8fd54dbcdf5f5db3e Mon Sep 17 00:00:00 2001
From: Antoine Martin <totaam@xpra.org>
Date: Fri, 10 Jul 2020 08:04:06 +0000
Subject: [PATCH] #1462 move colourspace conversion to the restored csc_cython
 module

git-svn-id: https://xpra.org/svn/Xpra/trunk@26940 3bb7dfac-3a0b-4e04-842a-767bc560f471
---
 src/xpra/codecs/argb/argb.pyx                 |  34 -
 src/xpra/codecs/codec_checks.py               |   7 +-
 src/xpra/codecs/csc_cython/__init__.py        |   4 +
 .../csc_cython/colorspace_converter.pyx       | 748 ++++++++++++++++++
 src/xpra/codecs/enc_x264/encoder.pyx          |  21 +-
 src/xpra/codecs/loader.py                     |   3 +-
 src/xpra/codecs/video_helper.py               |   8 +-
 7 files changed, 772 insertions(+), 53 deletions(-)
 create mode 100644 src/xpra/codecs/csc_cython/__init__.py
 create mode 100644 src/xpra/codecs/csc_cython/colorspace_converter.pyx

diff --git a/src/xpra/codecs/argb/argb.pyx b/src/xpra/codecs/argb/argb.pyx
index 68ba629fed..a9e11e8c6c 100644
--- a/src/xpra/codecs/argb/argb.pyx
+++ b/src/xpra/codecs/argb/argb.pyx
@@ -192,40 +192,6 @@ cdef r210data_to_rgb(unsigned int* r210,
     return memoryview(output_buf)
 
 
-def r210_to_bgr48(buf,
-                  const unsigned int w, const unsigned int h,
-                  const unsigned int src_stride, const unsigned int dst_stride):
-    assert buf, "no buffer"
-    assert w*4<=src_stride, "invalid source stride %i for width %i" % (src_stride, w)
-    assert w*6<=dst_stride, "invalid destination stride %i for width %i" % (dst_stride, w)
-    assert (dst_stride%2)==0, "invalid destination stride %i (odd number)" % (dst_stride)
-    cdef unsigned int* cbuf = <unsigned int *> 0
-    cdef Py_ssize_t cbuf_len = 0
-    assert as_buffer(buf, <const void**> &cbuf, &cbuf_len)==0, "cannot convert %s to a readable buffer" % type(buf)
-    assert cbuf_len>0, "invalid buffer size: %i" % cbuf_len
-    assert cbuf_len>=h*src_stride, "source buffer is %i bytes, which is too small for %ix%i" % (cbuf_len, src_stride, h)
-    return r210data_to_bgr48(cbuf, w, h, src_stride, dst_stride)
-
-cdef r210data_to_bgr48(unsigned int* r210,
-                       const unsigned int w, const unsigned int h,
-                       const unsigned int src_stride, const unsigned int dst_stride):
-    cdef MemBuf output_buf = getbuf(h*dst_stride*10)
-    cdef unsigned short* rgba = <unsigned short*> output_buf.get_mem()
-    cdef unsigned int y = 0
-    cdef unsigned int i = 0
-    cdef unsigned int v
-    for y in range(h):
-        i = y*dst_stride//2
-        for x in range(w):
-            v = r210[x]
-            rgba[i] = v&0x000003ff
-            rgba[i+1] = (v&0x000ffc00) >> 10
-            rgba[i+2]   = (v&0x3ff00000) >> 20
-            i = i + 3
-        r210 = <unsigned int*> ((<uintptr_t> r210) + src_stride)
-    return memoryview(output_buf)
-
-
 def argb_to_rgba(buf):
     assert len(buf) % 4 == 0, "invalid buffer size: %s is not a multiple of 4" % len(buf)
     # buf is a Python buffer object
diff --git a/src/xpra/codecs/codec_checks.py b/src/xpra/codecs/codec_checks.py
index a20f3e6a7a..d102fdfef5 100644
--- a/src/xpra/codecs/codec_checks.py
+++ b/src/xpra/codecs/codec_checks.py
@@ -65,8 +65,11 @@ def make_test_image(pixel_format, w, h):
             strides.append(w//vdiv[0])
         image = ImageWrapper(0, 0, w, h, planes, pixel_format, 32, strides, planes=nplanes, thread_safe=True)
         #l = len(y)+len(u)+len(v)
-    elif pixel_format in ("RGB", "BGR", "RGBX", "BGRX", "XRGB", "BGRA", "RGBA", "r210"):
-        stride = w*len(pixel_format)
+    elif pixel_format in ("RGB", "BGR", "RGBX", "BGRX", "XRGB", "BGRA", "RGBA", "r210", "BGR48"):
+        if pixel_format=="BGR48":
+            stride = w*6
+        else:
+            stride = w*len(pixel_format)
         rgb_data = makebuf(stride*h)
         image = ImageWrapper(0, 0, w, h, rgb_data, pixel_format, 32, stride, planes=ImageWrapper.PACKED, thread_safe=True)
         #l = len(rgb_data)
diff --git a/src/xpra/codecs/csc_cython/__init__.py b/src/xpra/codecs/csc_cython/__init__.py
new file mode 100644
index 0000000000..0f7a10c3e4
--- /dev/null
+++ b/src/xpra/codecs/csc_cython/__init__.py
@@ -0,0 +1,4 @@
+# This file is part of Xpra.
+# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
+# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
+# later version. See the file COPYING for details.
diff --git a/src/xpra/codecs/csc_cython/colorspace_converter.pyx b/src/xpra/codecs/csc_cython/colorspace_converter.pyx
new file mode 100644
index 0000000000..1e3d17b31f
--- /dev/null
+++ b/src/xpra/codecs/csc_cython/colorspace_converter.pyx
@@ -0,0 +1,748 @@
+# This file is part of Xpra.
+# Copyright (C) 2013 Arthur Huillet
+# Copyright (C) 2012-2020 Antoine Martin <antoine@devloop.org.uk>
+# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
+# later version. See the file COPYING for details.
+
+#!python
+#cython: language_level=3, boundscheck=False, wraparound=False, overflowcheck=False, cdivision=True, unraisable_tracebacks=True
+
+import os
+import sys
+import time
+
+from xpra.log import Logger
+log = Logger("csc", "cython")
+
+from xpra.codecs.codec_constants import csc_spec
+from xpra.codecs.image_wrapper import ImageWrapper
+
+from libc.stdint cimport uint8_t, uintptr_t # pylint: disable=syntax-error
+from xpra.buffers.membuf cimport memory_as_pybuffer, memalign, object_as_buffer
+
+
+cdef extern from "stdlib.h":
+    void free(void *ptr)
+
+cdef inline int roundup(int n, int m):     #rounds n up to the next multiple of m
+    return (n + m - 1) & ~(m - 1)           #bitmask trick: only correct when m is a power of two
+
+#precalculate indexes in native endianness:
+cdef uint8_t BGRX_R, BGRX_G, BGRX_B, BGRX_X
+cdef uint8_t RGBX_R, RGBX_G, RGBX_B, RGBX_X
+cdef uint8_t RGB_R, RGB_G, RGB_B
+cdef uint8_t BGR_R, BGR_G, BGR_B
+
+if sys.byteorder=="little":
+    BGRX_B, BGRX_G, BGRX_R, BGRX_X = 0, 1, 2, 3
+    RGBX_R, RGBX_G, RGBX_B, RGBX_X = 0, 1, 2, 3
+    BGR_R, BGR_G, BGR_B = 2, 1, 0
+    RGB_R, RGB_G, RGB_B = 0, 1, 2
+else:
+    BGRX_B, BGRX_G, BGRX_R, BGRX_X = 0, 1, 2, 3
+    RGBX_R, RGBX_G, RGBX_B, RGBX_X = 0, 1, 2, 3
+    BGR_R, BGR_G, BGR_B = 0, 1, 2
+    RGB_R, RGB_G, RGB_B = 2, 1, 0
+
+log("csc_cython: %s endian:", sys.byteorder)
+log("csc_cython: byteorder(BGRX)=%s", (BGRX_B, BGRX_G, BGRX_R, BGRX_X))
+log("csc_cython: byteorder(RGBX)=%s", (RGBX_R, RGBX_G, RGBX_B, RGBX_X))
+log("csc_cython: byteorder(RGB)=%s", (RGB_R, RGB_G, RGB_B))
+log("csc_cython: byteorder(BGR)=%s", (BGR_R, BGR_G, BGR_B))
+
+#COLORSPACES = {"BGRX" : ["YUV420P"], "YUV420P" : ["RGB", "BGR", "RGBX", "BGRX"], "GBRP" : ["RGBX", "BGRX"] }
+def get_CS(in_cs, valid_options):       #output colorspaces enabled for in_cs, which the environment can restrict
+    v = os.environ.get("XPRA_CSC_CYTHON_%s_COLORSPACES" % in_cs)    #comma separated list of colorspaces
+    if not v:
+        return valid_options            #unset or empty: no restriction
+    env_override = []
+    for cs in v.split(","):
+        if cs in valid_options:
+            env_override.append(cs)
+        else:
+            log.warn("invalid colorspace override for %s: %s (only supports: %s)", in_cs, cs, valid_options)    #dropped
+    log("environment override for %s: %s", in_cs, env_override)
+    return env_override                 #may be empty if every value was invalid
+COLORSPACES = {
+               "BGRX"       : get_CS("BGRX",    ["YUV420P"]),
+               "RGBX"       : get_CS("RGBX",    ["YUV420P"]),
+               "BGR"        : get_CS("BGR",     ["YUV420P"]),
+               "RGB"        : get_CS("RGB",     ["YUV420P"]),
+               "YUV420P"    : get_CS("YUV420P", ["RGB", "BGR", "RGBX", "BGRX"]),
+               "GBRP"       : get_CS("GBRP",    ["RGBX", "BGRX"]),
+               "r210"       : get_CS("r210",    ["YUV420P", "BGR48"]),
+               }
+
+
+def init_module():
+    #nothing to do!
+    log("csc_cython.init_module()")
+
+def cleanup_module():
+    log("csc_cython.cleanup_module()")
+
+def get_type():
+    return "cython"
+
+def get_version():
+    return "4.1"
+
+def get_info():
+    info = {
+            "version"   : (4, 1),
+            }
+    return info
+
+def get_input_colorspaces():
+    return COLORSPACES.keys()
+
+def get_output_colorspaces(input_colorspace):
+    return COLORSPACES[input_colorspace]
+
+def get_spec(in_colorspace, out_colorspace):   #returns the csc_spec describing this conversion
+    assert in_colorspace in COLORSPACES, "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, get_input_colorspaces())
+    assert out_colorspace in COLORSPACES.get(in_colorspace), "invalid output colorspace: %s (must be one of %s)" % (out_colorspace, get_output_colorspaces(in_colorspace))
+    #r210 to BGR48 is the only conversion which cannot scale (init_context asserts matching dimensions):
+    can_scale = True
+    if in_colorspace=="r210" and out_colorspace=="BGR48":
+        can_scale = False
+    return csc_spec(in_colorspace, out_colorspace,
+                    ColorspaceConverter, codec_type=get_type(),     #low score below: this should be used as fallback only
+                    quality=50, speed=10, setup_cost=10, min_w=2, min_h=2, max_w=16*1024, max_h=16*1024, can_scale=can_scale)
+
+
+class CythonImageWrapper(ImageWrapper):        #ImageWrapper which also frees the C buffer whose address is in cython_buffer
+
+    def free(self):                             #@DuplicatedSignature
+        log("CythonImageWrapper.free() cython_buffer=%#x", <unsigned long> self.cython_buffer)
+        ImageWrapper.free(self)
+        cb = self.cython_buffer                 #address assigned by the converter which allocated the pixels
+        if cb>0:
+            self.cython_buffer = 0              #cleared before freeing so that calling free() again is harmless
+            free(<void *> (<unsigned long> cb))
+
+    def _cn(self):                              #class name string - presumably used by ImageWrapper's repr, TODO confirm
+        return "CythonImageWrapper"
+
+
+
+DEF STRIDE_ROUNDUP = 2
+
+#Pre-calculate some coefficients and define them as constants
+#We use integer calculations so everything is multiplied by 2**16
+#To get the result as a byte, we just bitshift:
+DEF shift = 16
+
+#RGB to YUV
+#Y[o] = clamp(0.257 * R + 0.504 * G + 0.098 * B + 16)
+# Y = 0.257 * R + 0.504 * G + 0.098 * B + 16
+DEF YR = 16843      # 0.257 * 2**16
+DEF YG = 33030      # 0.504 * 2**16
+DEF YB = 6423       # 0.098 * 2**16
+DEF Yc = 16
+DEF YC = 1048576    # 16    * 2**16
+#U[y*self.dst_strides[1] + x] = clamp(-0.148 * Rsum/sum - 0.291 * Gsum/sum + 0.439 * Bsum/sum + 128)
+# U = -0.148 * R - 0.291 * G + 0.439 * B + 128
+DEF UR = -9699      #-0.148 * 2**16
+DEF UG = -19071     #-0.291 * 2**16
+DEF UB = 28770      # 0.439 * 2**16
+DEF Uc = 128
+DEF UC = 8388608    # 128   * 2**16
+#V[y*self.dst_strides[2] + x] = clamp(0.439 * Rsum/sum - 0.368 * Gsum/sum - 0.071 * Bsum/sum + 128)
+# V = 0.439 * R - 0.368 * G - 0.071 * B + 128
+DEF VR = 28770      # 0.439  * 2**16
+DEF VG = -24117     #-0.368  * 2**16
+DEF VB = -4653      #-0.071  * 2**16
+DEF Vc = 128
+DEF VC = 8388608    # 128    * 2**16
+
+DEF max_clamp = 16777216    #2**(16+8)
+
+#YUV to RGB:
+#Y, Cb and Cr are adjusted as:
+#Y'  = Y - 16
+#Cb' = Cb - 128
+#Cr' = Cr - 128
+# (see YC, UC and VC above)
+#RGB:
+#R = 1.164*Y'                 + 1.596 * Cr'
+#G = 1.164*Y' - 0.391   * Cb' - 0.813   * Cr'
+#B = 1.164*Y' + 2.018   * Cb'
+
+DEF RY = 76284      #1.164    * 2**16
+DEF RU = 0
+DEF RV = 104582     #1.5958   * 2**16
+
+DEF GY = 76284      #1.164    * 2**16
+DEF GU = -25672     #-0.39173 * 2**16
+DEF GV = -53274     #-0.81290 * 2**16
+
+DEF BY = 76284      #1.164    * 2**16
+DEF BU = 132186     #2.017    * 2**16
+DEF BV = 0
+
+
+cdef inline unsigned char clamp(const long v) nogil:    #converts a fixed point value (scaled by 2**shift) to a byte
+    if v<=0:
+        return 0                                #negative values saturate to 0
+    elif v>=max_clamp:                          #256 * 2**shift and above saturates to 255
+        return 255
+    else:
+        return <unsigned char> (v>>shift)       #drop the fractional bits
+
+
+cdef class ColorspaceConverter:
+    cdef unsigned int src_width
+    cdef unsigned int src_height
+    cdef object src_format
+    cdef unsigned int dst_width
+    cdef unsigned int dst_height
+    cdef object dst_format
+    cdef unsigned long[3] dst_strides
+    cdef unsigned long[3] dst_sizes
+    cdef unsigned long[3] offsets
+
+    cdef convert_image_function
+
+    cdef unsigned long frames
+    cdef double time
+    cdef unsigned long buffer_size
+
+    cdef object __weakref__
+
+    def init_context(self, int src_width, int src_height, src_format,
+                           int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
+        """ Validates the requested conversion, then precalculates the output strides,
+            plane sizes, plane offsets, total buffer size and the conversion method to use.
+            `speed` is only logged. Unsupported colorspaces (or scaling r210 to BGR48) fail the assertions. """
+        cdef int i
+        assert src_format in get_input_colorspaces(), "invalid input colorspace: %s (must be one of %s)" % (src_format, get_input_colorspaces())
+        assert dst_format in get_output_colorspaces(src_format), "invalid output colorspace: %s (must be one of %s)" % (dst_format, get_output_colorspaces(src_format))
+        log("csc_cython.ColorspaceConverter.init_context%s", (src_width, src_height, src_format, dst_width, dst_height, dst_format, speed))
+        self.src_width = src_width
+        self.src_height = src_height
+        self.dst_width = dst_width
+        self.dst_height = dst_height
+        self.src_format = src_format[:]
+        self.dst_format = dst_format[:]
+
+        self.time = 0
+        self.frames = 0
+
+        #explicitly clear the strides / sizes / offsets of all 3 planes:
+        for i in range(3):
+            self.dst_strides[i] = 0
+            self.dst_sizes[i]   = 0
+            self.offsets[i]     = 0
+
+        if src_format in ("BGRX", "RGBX", "RGB", "BGR", "r210") and dst_format=="YUV420P":
+            self.dst_strides[0] = roundup(self.dst_width,    STRIDE_ROUNDUP)
+            self.dst_strides[1] = roundup(self.dst_width//2, STRIDE_ROUNDUP)     #"//": with language_level=3, "/" is true division
+            self.dst_strides[2] = roundup(self.dst_width//2, STRIDE_ROUNDUP)
+            self.dst_sizes[0] = self.dst_strides[0] * self.dst_height
+            self.dst_sizes[1] = self.dst_strides[1] * self.dst_height//2
+            self.dst_sizes[2] = self.dst_strides[2] * self.dst_height//2
+            #Y starts at offset 0 (cleared above), U follows Y with 1 line padding, V follows U with another line of padding:
+            self.offsets[1] = self.dst_strides[0] * (self.dst_height+1)
+            self.offsets[2] = self.offsets[1] + (self.dst_strides[1] * (self.dst_height//2+1))
+            #output buffer ends after V + 1 line of padding:
+            self.buffer_size = self.offsets[2] + (self.dst_strides[2] * (self.dst_height//2+1))
+            if src_format=="BGRX":
+                self.convert_image_function = self.BGRX_to_YUV420P
+            elif src_format=="RGBX":
+                self.convert_image_function = self.RGBX_to_YUV420P
+            elif src_format=="BGR":
+                self.convert_image_function = self.BGR_to_YUV420P
+            elif src_format=="RGB":
+                self.convert_image_function = self.RGB_to_YUV420P
+            else:
+                assert src_format=="r210"
+                self.convert_image_function = self.r210_to_YUV420P
+        elif src_format=="r210" and dst_format=="BGR48":
+            self.dst_strides[0] = roundup(self.dst_width*6, STRIDE_ROUNDUP)
+            self.dst_sizes[0] = self.dst_strides[0] * self.dst_height
+            self.buffer_size = self.dst_sizes[0]+self.dst_strides[0]
+            self.convert_image_function = self.r210_to_BGR48
+            assert src_width==dst_width
+            assert src_height==dst_height
+        elif src_format=="YUV420P" and dst_format in ("RGBX", "BGRX", "RGB", "BGR"):
+            #3 or 4 bytes per pixel, single plane at offset 0 (cleared above):
+            self.dst_strides[0] = roundup(self.dst_width*len(dst_format), STRIDE_ROUNDUP)
+            self.dst_sizes[0] = self.dst_strides[0] * self.dst_height
+            #output buffer ends after 1 line of padding:
+            self.buffer_size = self.dst_sizes[0] + roundup(dst_width*len(dst_format), STRIDE_ROUNDUP)
+
+            if dst_format=="RGBX":
+                self.convert_image_function = self.YUV420P_to_RGBX
+            elif dst_format=="BGRX":
+                self.convert_image_function = self.YUV420P_to_BGRX
+            elif dst_format=="RGB":
+                self.convert_image_function = self.YUV420P_to_RGB
+            else:
+                assert dst_format=="BGR"
+                self.convert_image_function = self.YUV420P_to_BGR
+        elif src_format=="GBRP" and dst_format in ("RGBX", "BGRX"):
+            #4 bytes per pixel, single plane at offset 0 (cleared above):
+            self.dst_strides[0] = roundup(self.dst_width*4, STRIDE_ROUNDUP)
+            self.dst_sizes[0] = self.dst_strides[0] * self.dst_height
+            #output buffer ends after 1 line of padding:
+            self.buffer_size = self.dst_sizes[0] + roundup(dst_width*4, STRIDE_ROUNDUP)
+
+            if dst_format=="RGBX":
+                self.convert_image_function = self.GBRP_to_RGBX
+            else:
+                assert dst_format=="BGRX"
+                self.convert_image_function = self.GBRP_to_BGRX
+        else:
+            raise Exception("BUG: src_format=%s, dst_format=%s" % (src_format, dst_format))
+
+    def clean(self):                        #@DuplicatedSignature
+        #overzealous clean is cheap!
+        cdef int i                          #
+        self.src_width = 0
+        self.src_height = 0
+        self.dst_width = 0
+        self.dst_height = 0
+        self.src_format = ""
+        self.dst_format = ""
+        self.time = 0
+        self.frames = 0
+        for i in range(3):
+            self.dst_strides[i] = 0
+            self.dst_sizes[i] = 0
+            self.offsets[i] = 0
+        self.convert_image_function = None
+        self.buffer_size = 0
+
+    def is_closed(self):
+        return self.convert_image_function is None
+
+    def get_info(self):      #@DuplicatedSignature
+        info = {
+                "frames"    : self.frames,
+                "src_width" : self.src_width,
+                "src_height": self.src_height,
+                "dst_width" : self.dst_width,
+                "dst_height": self.dst_height,
+                }
+        if self.src_format:
+            info["src_format"] = self.src_format
+        if self.dst_format:
+            info["dst_format"] = self.dst_format
+        if self.frames>0 and self.time>0:
+            pps = float(self.src_width) * float(self.src_height) * float(self.frames) / self.time
+            info["total_time_ms"] = int(self.time*1000.0)
+            info["pixels_per_second"] = int(pps)
+        return info
+
+    def __repr__(self):
+        return "csc_cython(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
+                                                 self.dst_format, self.dst_width, self.dst_height)
+
+    def __dealloc__(self):                  #@DuplicatedSignature
+        self.clean()
+
+    def get_src_width(self):
+        return self.src_width
+
+    def get_src_height(self):
+        return self.src_height
+
+    def get_src_format(self):
+        return self.src_format
+
+    def get_dst_width(self):
+        return self.dst_width
+
+    def get_dst_height(self):
+        return self.dst_height
+
+    def get_dst_format(self):
+        return self.dst_format
+
+    def get_type(self):                     #@DuplicatedSignature
+        return  "cython"
+
+
+    def convert_image(self, image):
+        return self.convert_image_function(image)
+
+
+    def r210_to_YUV420P(self, image):
+        return self.do_RGB_to_YUV420P(image, 4, 0, 0, 0)
+
+    def BGR_to_YUV420P(self, image):
+        return self.do_RGB_to_YUV420P(image, 3, BGR_R, BGR_G, BGR_B)
+
+    def RGB_to_YUV420P(self, image):
+        return self.do_RGB_to_YUV420P(image, 3, RGB_R, RGB_G, RGB_B)
+
+    def BGRX_to_YUV420P(self, image):
+        return self.do_RGB_to_YUV420P(image, 4, BGRX_R, BGRX_G, BGRX_B)
+
+    def RGBX_to_YUV420P(self, image):
+        return self.do_RGB_to_YUV420P(image, 4, RGBX_R, RGBX_G, RGBX_B)
+
+    cdef do_RGB_to_YUV420P(self, image, const uint8_t Bpp, const uint8_t Rindex, const uint8_t Gindex, const uint8_t Bindex):
+        """ Converts a packed RGB image (Bpp bytes per pixel, or "r210" when that is the source format) to YUV420P,
+            scaling to the destination size by nearest neighbour sampling. Rindex, Gindex and Bindex are the
+            channel byte offsets within a pixel. Returns a CythonImageWrapper owning the new output buffer. """
+        cdef Py_ssize_t pic_buf_len = 0
+        cdef const unsigned char *input_image
+        cdef const unsigned int *input_r210
+        cdef unsigned char *output_image
+        cdef unsigned int input_stride, workw, workh
+        cdef unsigned int x, y, o, sx, sy, ox, oy
+        cdef unsigned int Ystride, Ustride, Vstride
+        cdef unsigned int r210
+        cdef unsigned char R, G, B, sum, i, dx, dy
+        cdef unsigned short Rsum, Gsum, Bsum
+        cdef unsigned char *Y
+        cdef unsigned char *U
+        cdef unsigned char *V
+
+        start = time.time()
+        iplanes = image.get_planes()
+        assert iplanes==ImageWrapper.PACKED, "invalid input format: %s planes" % iplanes
+        assert image.get_width()>=self.src_width, "invalid image width: %s (minimum is %s)" % (image.get_width(), self.src_width)
+        assert image.get_height()>=self.src_height, "invalid image height: %s (minimum is %s)" % (image.get_height(), self.src_height)
+        pixels = image.get_pixels()
+        assert pixels, "failed to get pixels from %s" % image
+        input_stride = image.get_rowstride()
+        log("do_RGB_to_YUV420P(%s, %i, %i, %i, %i) input=%s, strides=%s", image, Bpp, Rindex, Gindex, Bindex, len(pixels), input_stride)
+        assert object_as_buffer(pixels, <const void**> &input_image, &pic_buf_len)==0
+
+        #copy to local variables (ensures C code will be optimized correctly)
+        Ystride = self.dst_strides[0]
+        Ustride = self.dst_strides[1]
+        Vstride = self.dst_strides[2]
+        cdef unsigned int src_width = self.src_width
+        cdef unsigned int src_height = self.src_height
+        cdef unsigned int dst_width = self.dst_width
+        cdef unsigned int dst_height = self.dst_height
+        #the loops below read up to src_width pixels on each of the first src_height rows, without any bounds checking:
+        assert pic_buf_len>=(src_height-1)*input_stride+src_width*Bpp, "input buffer is too small: %i bytes for %ix%i with rowstride %i" % (pic_buf_len, src_width, src_height, input_stride)
+        #allocate output buffer (only once the input has been validated, so a failed check cannot leak it):
+        output_image = <unsigned char*> memalign(self.buffer_size)
+        Y = output_image + self.offsets[0]
+        U = output_image + self.offsets[1]
+        V = output_image + self.offsets[2]
+        #we process 4 pixels (2x2) at a time - NOTE(review): sizes of the form 4n+1 leave the last column / row unconverted:
+        workw = roundup(dst_width//2, 2)
+        workh = roundup(dst_height//2, 2)
+        #from now on, we can release the gil:
+        if self.src_format=="r210":
+            assert Bpp==4
+            input_r210 = <unsigned int*> input_image
+            with nogil:
+                for y in range(workh):
+                    for x in range(workw):
+                        R = G = B = 0
+                        Rsum = Gsum = Bsum = 0
+                        sum = 0
+                        for dy in range(2):
+                            oy = y*2 + dy
+                            if oy>=dst_height:
+                                break
+                            sy = oy*src_height//dst_height
+                            for dx in range(2):
+                                ox = x*2 + dx
+                                if ox>=dst_width:
+                                    break
+                                sx = ox*src_width//dst_width
+                                o = sy*input_stride + sx*Bpp
+                                r210 = input_r210[o//4]
+                                B = (r210&0x3ff00000) >> 22
+                                G = (r210&0x000ffc00) >> 12
+                                R = (r210&0x000003ff) >> 2
+                                o = oy*Ystride + ox
+                                Y[o] = clamp(YR * R + YG * G + YB * B + YC)
+                                sum += 1
+                                Rsum += R
+                                Gsum += G
+                                Bsum += B
+                        #write 1U and 1V:
+                        if sum>0:
+                            Rsum //= sum
+                            Gsum //= sum
+                            Bsum //= sum
+                            U[y*Ustride + x] = clamp(UR * Rsum + UG * Gsum + UB * Bsum + UC)
+                            V[y*Vstride + x] = clamp(VR * Rsum + VG * Gsum + VB * Bsum + VC)
+        else:
+            with nogil:
+                for y in range(workh):
+                    for x in range(workw):
+                        R = G = B = 0
+                        Rsum = Gsum = Bsum = 0
+                        sum = 0
+                        for dy in range(2):
+                            oy = y*2 + dy
+                            if oy>=dst_height:
+                                break
+                            sy = oy*src_height//dst_height
+                            for dx in range(2):
+                                ox = x*2 + dx
+                                if ox>=dst_width:
+                                    break
+                                sx = ox*src_width//dst_width
+                                o = sy*input_stride + sx*Bpp
+                                R = input_image[o + Rindex]
+                                G = input_image[o + Gindex]
+                                B = input_image[o + Bindex]
+                                o = oy*Ystride + ox
+                                Y[o] = clamp(YR * R + YG * G + YB * B + YC)
+                                sum += 1
+                                Rsum += R
+                                Gsum += G
+                                Bsum += B
+                        #write 1U and 1V:
+                        if sum>0:
+                            Rsum //= sum
+                            Gsum //= sum
+                            Bsum //= sum
+                            U[y*Ustride + x] = clamp(UR * Rsum + UG * Gsum + UB * Bsum + UC)
+                            V[y*Vstride + x] = clamp(VR * Rsum + VG * Gsum + VB * Bsum + VC)
+
+        #create python buffer from each plane:
+        planes = []
+        strides = []
+        for i in range(3):
+            strides.append(self.dst_strides[i])
+            planes.append(memory_as_pybuffer(<void *> (<unsigned long> (output_image + self.offsets[i])), self.dst_sizes[i], True))
+        elapsed = time.time()-start
+        log("%s took %.1fms", self, 1000.0*elapsed)
+        self.time += elapsed
+        self.frames += 1
+        out_image = CythonImageWrapper(0, 0, dst_width, dst_height, planes, self.dst_format, 24, strides, ImageWrapper.PLANAR_3)
+        out_image.cython_buffer = <unsigned long> output_image
+        return out_image
+
+
+    def r210_to_BGR48(self, image):
+        """ Unpacks each 32-bit r210 pixel of a packed image into three 16-bit samples (the 10-bit values
+            are stored as-is, without scaling) and returns them as a new "BGR48" CythonImageWrapper.
+            No scaling is performed: source and destination dimensions are the same (see init_context). """
+        cdef double start = time.time()
+        assert image.get_planes()==ImageWrapper.PACKED, "invalid input format: %s planes" % image.get_planes()
+        assert image.get_width()>=self.src_width, "invalid image width: %s (minimum is %s)" % (image.get_width(), self.src_width)
+        assert image.get_height()>=self.src_height, "invalid image height: %s (minimum is %s)" % (image.get_height(), self.src_height)
+        pixels = image.get_pixels()
+        assert pixels, "failed to get pixels from %s" % image
+        cdef unsigned int src_stride = image.get_rowstride()
+        log("r210_to_BGR48(%s) input=%s, strides=%s", image, len(pixels), src_stride)
+
+        cdef Py_ssize_t pic_buf_len = 0
+        cdef const unsigned int *r210
+        assert object_as_buffer(pixels, <const void**> &r210, &pic_buf_len)==0
+        cdef unsigned int w = self.src_width
+        cdef unsigned int h = self.src_height
+        cdef unsigned int dst_stride = self.dst_strides[0]
+        #the loop reads w pixels on each of the h rows without bounds checking (the last row only needs w pixels):
+        assert pic_buf_len>=(h-1)*src_stride+w*4, "source buffer is %i bytes, which is too small for %ix%i" % (pic_buf_len, src_stride, h)
+
+        #allocate output buffer (only once the input has been validated, so a failed check cannot leak it):
+        cdef unsigned short *bgr48 = <unsigned short*> memalign(self.dst_sizes[0])
+
+        cdef unsigned int x, y, i, v
+        #TODO: this loop only touches C data, it could run "with nogil" if the image is thread safe
+        for y in range(h):
+            i = y*dst_stride//2
+            for x in range(w):
+                v = r210[x]
+                bgr48[i] = v&0x000003ff
+                bgr48[i+1] = (v&0x000ffc00) >> 10
+                bgr48[i+2] = (v&0x3ff00000) >> 20
+                i = i + 3
+            r210 = <unsigned int*> ((<uintptr_t> r210) + src_stride)
+
+        bgr48_buffer = memory_as_pybuffer(<void *> bgr48, self.dst_sizes[0], True)
+        cdef double elapsed = time.time()-start
+        log("%s took %.1fms", self, 1000.0*elapsed)
+        self.time += elapsed
+        self.frames += 1
+        out_image = CythonImageWrapper(0, 0, self.dst_width, self.dst_height, bgr48_buffer, "BGR48", 48, dst_stride, ImageWrapper.PACKED)
+        out_image.cython_buffer = <unsigned long> bgr48
+        return out_image
+
+
+    def YUV420P_to_RGBX(self, image):
+        return self.do_YUV420P_to_RGB(image, 4, RGBX_R, RGBX_G, RGBX_B, RGBX_X)
+
+    def YUV420P_to_RGB(self, image):
+        return self.do_YUV420P_to_RGB(image, 3, RGB_R, RGB_G, RGB_B, 0)
+
+    def YUV420P_to_BGRX(self, image):
+        return self.do_YUV420P_to_RGB(image, 4, BGRX_R, BGRX_G, BGRX_B, BGRX_X)
+
+    def YUV420P_to_BGR(self, image):
+        return self.do_YUV420P_to_RGB(image, 3, BGR_R, BGR_G, BGR_B, 0)
+
+    cdef do_YUV420P_to_RGB(self, image, const uint8_t Bpp, const uint8_t Rindex, const uint8_t Gindex, const uint8_t Bindex, const uint8_t Xindex):
+        """ Converts a 3-plane YUV420P image to a new packed RGB image (Bpp bytes per pixel), scaling to the destination size. """
+        cdef Py_ssize_t buf_len = 0
+        cdef unsigned char *output_image        #newly allocated, ownership is passed to the returned image
+        cdef unsigned int x,y,o                 #@DuplicatedSignature
+        cdef unsigned int sx, sy, ox, oy        #source (s) and output (o) pixel coordinates
+        cdef unsigned int workw, workh          #number of 2x2 output blocks to process
+        cdef unsigned int stride                #output rowstride
+        cdef unsigned char *Ybuf
+        cdef unsigned char *Ubuf
+        cdef unsigned char *Vbuf
+        cdef unsigned char dx, dy
+        cdef short Y, U, V                      #signed: the Yc / Uc / Vc offsets are subtracted from the samples
+        cdef unsigned int Ystride, Ustride, Vstride      #input rowstrides
+        cdef object rgb
+
+        start = time.time()
+        iplanes = image.get_planes()
+        assert iplanes==ImageWrapper.PLANAR_3, "invalid input format: %s planes" % iplanes
+        assert image.get_width()>=self.src_width, "invalid image width: %s (minimum is %s)" % (image.get_width(), self.src_width)
+        assert image.get_height()>=self.src_height, "invalid image height: %s (minimum is %s)" % (image.get_height(), self.src_height)
+        planes = image.get_pixels()
+        assert planes, "failed to get pixels from %s" % image
+        input_strides = image.get_rowstride()
+        log("do_YUV420P_to_RGB(%s) strides=%s", (image, Bpp, Rindex, Gindex, Bindex, Xindex), input_strides)
+
+        #copy to local variables:
+        stride = self.dst_strides[0]
+        Ystride = input_strides[0]
+        Ustride = input_strides[1]
+        Vstride = input_strides[2]
+        cdef unsigned int src_width = self.src_width
+        cdef unsigned int src_height = self.src_height
+        cdef unsigned int dst_width = self.dst_width
+        cdef unsigned int dst_height = self.dst_height
+
+        assert object_as_buffer(planes[0], <const void**> &Ybuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[0])
+        assert buf_len>=Ystride*image.get_height(), "buffer for Y plane is too small: %s bytes, expected at least %s" % (buf_len, Ystride*image.get_height())
+        assert object_as_buffer(planes[1], <const void**> &Ubuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[1])
+        assert buf_len>=Ustride*image.get_height()//2, "buffer for U plane is too small: %s bytes, expected at least %s" % (buf_len, Ustride*image.get_height()/2)
+        assert object_as_buffer(planes[2], <const void**> &Vbuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[2])
+        assert buf_len>=Vstride*image.get_height()//2, "buffer for V plane is too small: %s bytes, expected at least %s" % (buf_len, Vstride*image.get_height()/2)
+
+        #allocate output buffer (buffer_size includes one extra line of padding, see init_context):
+        output_image = <unsigned char*> memalign(self.buffer_size)
+
+        #we process 4 pixels (2x2) at a time - NOTE(review): sizes of the form 4n+1 leave the last column / row unwritten:
+        workw = roundup(dst_width//2, 2)
+        workh = roundup(dst_height//2, 2)
+        #from now on, we can release the gil:
+        with nogil:
+            for y in range(workh):
+                for x in range(workw):
+                    #read U and V for the next 4 pixels (x and y are chroma coordinates, scaled to the source size):
+                    sx = x*src_width//dst_width
+                    sy = y*src_height//dst_height
+                    U = Ubuf[sy*Ustride + sx] - Uc
+                    V = Vbuf[sy*Vstride + sx] - Vc
+                    #now read up to 4 Y values and write an RGBX pixel for each:
+                    for dy in range(2):
+                        oy = y*2 + dy
+                        if oy>=dst_height:      #the rounded up work area can exceed the image
+                            break
+                        sy = oy*src_height//dst_height
+                        for dx in range(2):
+                            ox = x*2 + dx
+                            if ox>=dst_width:
+                                break
+                            sx = ox*src_width//dst_width
+                            Y = Ybuf[sy*Ystride + sx] - Yc
+                            o = oy*stride + ox * Bpp
+                            output_image[o + Rindex] = clamp(RY * Y + RU * U + RV * V)
+                            output_image[o + Gindex] = clamp(GY * Y + GU * U + GV * V)
+                            output_image[o + Bindex] = clamp(BY * Y + BU * U + BV * V)
+                            if Bpp==4:
+                                output_image[o + Xindex] = 255      #padding / alpha byte is always opaque
+        rgb = memory_as_pybuffer(<void *> output_image, self.dst_sizes[0], True)
+        elapsed = time.time()-start
+        log("%s took %.1fms", self, 1000.0*elapsed)
+        self.time += elapsed
+        self.frames += 1
+        out_image = CythonImageWrapper(0, 0, dst_width, dst_height, rgb, self.dst_format, 24, stride, ImageWrapper.PACKED)
+        out_image.cython_buffer = <unsigned long> output_image     #freed by CythonImageWrapper.free()
+        return out_image
+
+
+    def GBRP_to_RGBX(self, image):      #input planes are ordered G, B, R (R=plane 2, G=plane 0, B=plane 1); output is packed RGBX
+        return self.do_RGBP_to_RGB(image, 2, 0, 1, RGBX_R, RGBX_G, RGBX_B, RGBX_X)
+
+    def GBRP_to_BGRX(self, image):      #same G, B, R input planes, but R and B destination offsets are swapped to give packed BGRX
+        return self.do_RGBP_to_RGB(image, 2, 0, 1, RGBX_B, RGBX_G, RGBX_R, RGBX_X)
+
+    cdef do_RGBP_to_RGB(self, image, const uint8_t Rsrc, const uint8_t Gsrc, const uint8_t Bsrc,
+                                     const uint8_t Rdst, const uint8_t Gdst, const uint8_t Bdst, const uint8_t Xdst):
+        #Convert a 3-plane image to packed 4 bytes-per-pixel output, scaling from
+        #src_width x src_height to dst_width x dst_height by sampling one source pixel per output pixel.
+        #Rsrc, Gsrc, Bsrc: index of each channel's plane in image.get_pixels() / image.get_rowstride()
+        #Rdst, Gdst, Bdst, Xdst: byte offset of each channel within an output pixel (the X byte is set to 255)
+        #Returns a CythonImageWrapper which holds the newly allocated output buffer.
+        cdef Py_ssize_t buf_len = 0
+        cdef unsigned char *output_image
+        cdef unsigned int x, y, o, sx, sy
+        cdef unsigned int stride, Rstride, Gstride, Bstride
+        cdef unsigned char *Rbuf, *Gbuf, *Bbuf      #start of each input plane
+        cdef unsigned char *Rptr, *Gptr, *Bptr      #start of the current source row in each plane
+        cdef object rgb
+
+        start = time.time()
+        iplanes = image.get_planes()
+        assert iplanes==ImageWrapper.PLANAR_3, "invalid input format: %s planes" % iplanes
+        assert image.get_width()>=self.src_width, "invalid image width: %s (minimum is %s)" % (image.get_width(), self.src_width)
+        assert image.get_height()>=self.src_height, "invalid image height: %s (minimum is %s)" % (image.get_height(), self.src_height)
+        planes = image.get_pixels()
+        assert planes, "failed to get pixels from %s" % image
+        input_strides = image.get_rowstride()
+        log("do_RGBP_to_RGB(%s) strides=%s", (image, Rsrc, Gsrc, Bsrc, Rdst, Gdst, Bdst, Xdst), input_strides)
+
+        #copy to local variables (no python attribute access is possible once the gil is released):
+        Rstride = input_strides[Rsrc]
+        Gstride = input_strides[Gsrc]
+        Bstride = input_strides[Bsrc]
+        stride = self.dst_strides[0]
+        cdef unsigned int src_width = self.src_width
+        cdef unsigned int src_height = self.src_height
+        cdef unsigned int dst_width = self.dst_width
+        cdef unsigned int dst_height = self.dst_height
+
+        assert object_as_buffer(planes[Rsrc], <const void**> &Rbuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[Rsrc])
+        assert buf_len>=Rstride*image.get_height(), "buffer for R plane is too small: %s bytes, expected at least %s" % (buf_len, Rstride*image.get_height())
+        assert object_as_buffer(planes[Gsrc], <const void**> &Gbuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[Gsrc])
+        assert buf_len>=Gstride*image.get_height(), "buffer for G plane is too small: %s bytes, expected at least %s" % (buf_len, Gstride*image.get_height())
+        assert object_as_buffer(planes[Bsrc], <const void**> &Bbuf, &buf_len)==0, "failed to convert %s to a buffer" % type(planes[Bsrc])
+        assert buf_len>=Bstride*image.get_height(), "buffer for B plane is too small: %s bytes, expected at least %s" % (buf_len, Bstride*image.get_height())
+
+        #allocate output buffer (and fail cleanly rather than writing through a NULL pointer):
+        output_image = <unsigned char*> memalign(self.buffer_size)
+        if output_image==NULL:
+            raise MemoryError("failed to allocate %s bytes for the output buffer" % self.buffer_size)
+
+        #from now on, we can release the gil:
+        with nogil:
+            for y in range(dst_height):
+                o = stride*y
+                sy = y*src_height//dst_height       #explicit integer division, as in do_YUV420P_to_RGB
+                Rptr  = Rbuf + (sy * Rstride)
+                Gptr  = Gbuf + (sy * Gstride)
+                Bptr  = Bbuf + (sy * Bstride)
+                for x in range(dst_width):
+                    sx = x*src_width//dst_width
+                    output_image[o+Rdst] = Rptr[sx]
+                    output_image[o+Gdst] = Gptr[sx]
+                    output_image[o+Bdst] = Bptr[sx]
+                    output_image[o+Xdst] = 255
+                    o += 4
+
+        rgb = memory_as_pybuffer(<void *> output_image, self.dst_sizes[0], True)
+        elapsed = time.time()-start
+        log("%s took %.1fms", self, 1000.0*elapsed)
+        self.time += elapsed
+        self.frames += 1
+        out_image = CythonImageWrapper(0, 0, dst_width, dst_height, rgb, self.dst_format, 24, stride, ImageWrapper.PACKED)
+        out_image.cython_buffer = <unsigned long> output_image
+        return out_image
+
+
+def selftest(full=False):               #runs codec_checks.testcsc() against this converter module, passing `full` through
+    from xpra.codecs.codec_checks import testcsc
+    from xpra.codecs.csc_cython import colorspace_converter
+    testcsc(colorspace_converter, full)
diff --git a/src/xpra/codecs/enc_x264/encoder.pyx b/src/xpra/codecs/enc_x264/encoder.pyx
index b586651f8d..a4c985987a 100644
--- a/src/xpra/codecs/enc_x264/encoder.pyx
+++ b/src/xpra/codecs/enc_x264/encoder.pyx
@@ -372,8 +372,8 @@ COLORSPACES = {
     }
 emit_ifdef_bitdepth()
 if SUPPORT_30BPP:
-    COLORSPACE_FORMATS["r210"] = (X264_CSP_BGR | X264_CSP_HIGH_DEPTH,    PROFILE_HIGH444_PREDICTIVE,    RGB_PROFILES)
-    COLORSPACES["r210"] = ("r210", )
+    COLORSPACE_FORMATS["BGR48"] = (X264_CSP_BGR | X264_CSP_HIGH_DEPTH,    PROFILE_HIGH444_PREDICTIVE,    RGB_PROFILES)
+    COLORSPACES["BGR48"] = ("BGR48", )
 emit_endif_bitdepth()
 if SUPPORT_24BPP:
     COLORSPACES.update({
@@ -841,20 +841,15 @@ cdef class Encoder:
 
         x264_picture_init(&pic_in)
 
-        if self.src_format.find("RGB")>=0 or self.src_format.find("BGR")>=0 or self.src_format=="r210":
+        if self.src_format.find("RGB")>=0 or self.src_format.find("BGR")>=0:
             assert len(pixels)>0
             assert istrides>0
-            emit_ifdef_bitdepth()
-            if self.src_format=="r210":
-                #CSC should be moved elsewhere!
-                from xpra.codecs.argb.argb import r210_to_bgr48
-                pixels = r210_to_bgr48(pixels, self.width, self.height, istrides, self.width*6)
-                istrides = self.width*6
-            emit_endif_bitdepth()
             assert object_as_buffer(pixels, <const void**> &pic_buf, &pic_buf_len)==0, "unable to convert %s to a buffer" % type(pixels)
-            for i in range(3):
-                pic_in.img.plane[i] = pic_buf
-                pic_in.img.i_stride[i] = istrides
+            pic_in.img.plane[0] = pic_buf
+            pic_in.img.i_stride[0] = istrides
+            for i in range(1, 3):
+                pic_in.img.plane[i] = NULL
+                pic_in.img.i_stride[i] = 0
             self.bytes_in += pic_buf_len
             pic_in.img.i_plane = 1
         else:
diff --git a/src/xpra/codecs/loader.py b/src/xpra/codecs/loader.py
index 7bb07e52c8..d22ac69abb 100755
--- a/src/xpra/codecs/loader.py
+++ b/src/xpra/codecs/loader.py
@@ -159,6 +159,7 @@ def xpra_codec_import(name, description, top_module, class_module, classname):
     #csc:
     "csc_swscale"   : ("swscale colorspace conversion", "csc_swscale", "colorspace_converter", "ColorspaceConverter"),
     "csc_libyuv"    : ("libyuv colorspace conversion", "csc_libyuv", "colorspace_converter", "ColorspaceConverter"),
+    "csc_cython"    : ("cython colorspace conversion", "csc_cython", "colorspace_converter", "ColorspaceConverter"),
     #decoders:
     "dec_pillow"    : ("Pillow decoder",    "pillow",       "decoder", "decompress"),
     "dec_webp"      : ("webp decoder",      "webp",         "decoder", "decompress"),
@@ -228,7 +229,7 @@ def has_codec(name) -> bool:
     return name in codecs
 
 
-CSC_CODECS = "csc_swscale", "csc_libyuv"
+CSC_CODECS = "csc_swscale", "csc_cython", "csc_libyuv"
 ENCODER_CODECS = "enc_pillow", "enc_webp", "enc_jpeg"
 ENCODER_VIDEO_CODECS = "enc_vpx", "enc_x264", "enc_x265", "nvenc", "enc_ffmpeg"
 DECODER_CODECS = "dec_pillow", "dec_webp", "dec_jpeg"
diff --git a/src/xpra/codecs/video_helper.py b/src/xpra/codecs/video_helper.py
index cc1e096e88..51957090ed 100755
--- a/src/xpra/codecs/video_helper.py
+++ b/src/xpra/codecs/video_helper.py
@@ -22,6 +22,7 @@
     "x265"       : "enc_x265",
     "nvenc"      : "nvenc",
     "swscale"    : "csc_swscale",
+    "cython"     : "csc_cython",
     "libyuv"     : "csc_libyuv",
     "avcodec2"   : "dec_avcodec2",
     "ffmpeg"     : "enc_ffmpeg",
@@ -49,7 +50,7 @@ def try_import_modules(*codec_names):
 #try to import the module that contains them (cheap check):
 ALL_VIDEO_ENCODER_OPTIONS = try_import_modules("x264", "vpx", "x265", "nvenc", "ffmpeg")
 HARDWARE_ENCODER_OPTIONS = try_import_modules("nvenc")
-ALL_CSC_MODULE_OPTIONS = try_import_modules("swscale", "libyuv")
+ALL_CSC_MODULE_OPTIONS = try_import_modules("swscale", "cython", "libyuv")
 NO_GFX_CSC_OPTIONS = []
 ALL_VIDEO_DECODER_OPTIONS = try_import_modules("avcodec2", "vpx")
 
@@ -405,16 +406,17 @@ def get_server_full_csc_modes(self, *client_supported_csc_modes):
             returns the CSC modes per encoding that the server can encode with.
             (taking into account the decoder's actual output colorspace for each encoding)
         """
-        log("get_client_full_csc_modes(%s) decoder encodings=%s",
+        log.warn("get_server_full_csc_modes(%s) decoder encodings=%s",
             client_supported_csc_modes, self._video_decoder_specs.keys())
         full_csc_modes = {}
         for encoding, encoding_specs in self._video_decoder_specs.items():
             assert encoding_specs is not None
             for colorspace, decoder_specs in sorted(encoding_specs.items()):
                 for decoder_name, decoder_module in decoder_specs:
-                    log("found decoder %12s for %5s with %7s mode", decoder_name, encoding, colorspace)
                     #figure out the actual output colorspace:
                     output_colorspace = decoder_module.get_output_colorspace(encoding, colorspace)
+                    log("found decoder %12s for %5s with %7s mode, outputs '%s'",
+                             decoder_name, encoding, colorspace, output_colorspace)
                     if output_colorspace in client_supported_csc_modes:
                         encoding_colorspaces = full_csc_modes.setdefault(encoding, [])
                         if colorspace not in encoding_colorspaces: