Skip to content

Commit

Permalink
#1317 / #365: stay on the GPU, Linux version:
Browse files Browse the repository at this point in the history
* add new kernels for nvenc to handle XRGB pixel data
* make it easier to test CUDA capture from the capture tool: XPRA_NVFBC_CUDA (defaults to 0)
* fix NvFBC capture on Linux: CUdeviceptr is an "unsigned long"
* prefer CUDA capture from the X11 shadow server so we can use the CUDA buffer directly in NVENC

git-svn-id: https://xpra.org/svn/Xpra/trunk@16492 3bb7dfac-3a0b-4e04-842a-767bc560f471
  • Loading branch information
totaam committed Jul 24, 2017
1 parent 273196d commit 41035ec
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 41 deletions.
7 changes: 3 additions & 4 deletions src/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,10 +935,9 @@ def pkgconfig(*pkgs_options, **ekw):
"xpra/codecs/vpx/encoder.c",
"xpra/codecs/vpx/decoder.c",
"xpra/codecs/nvenc/encoder.c",
"xpra/codecs/cuda_common/ARGB_to_NV12.fatbin",
"xpra/codecs/cuda_common/ARGB_to_YUV444.fatbin",
"xpra/codecs/cuda_common/BGRA_to_NV12.fatbin",
"xpra/codecs/cuda_common/BGRA_to_U.fatbin",
"xpra/codecs/cuda_common/BGRA_to_V.fatbin",
"xpra/codecs/cuda_common/BGRA_to_Y.fatbin",
"xpra/codecs/cuda_common/BGRA_to_YUV444.fatbin",
"xpra/codecs/enc_x264/encoder.c",
"xpra/codecs/enc_x265/encoder.c",
Expand Down Expand Up @@ -2024,7 +2023,7 @@ def which(cmd):
#TODO:
# * compile directly to output directory instead of using data files?
# * detect which arches we want to build for? (does it really matter much?)
kernels = ("BGRA_to_NV12", "BGRA_to_YUV444")
kernels = ("ARGB_to_NV12", "ARGB_to_YUV444", "BGRA_to_NV12", "BGRA_to_YUV444")
for kernel in kernels:
cuda_src = "xpra/codecs/cuda_common/%s.cu" % kernel
cuda_bin = "xpra/codecs/cuda_common/%s.fatbin" % kernel
Expand Down
2 changes: 2 additions & 0 deletions src/win32/PY27_MINGW_BUILD.sh
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ if [ "${DO_CUDA}" == "1" ]; then
echo "* Building CUDA kernels"
cmd.exe //c "win32\\BUILD_CUDA_KERNEL" BGRA_to_NV12 || exit 1
cmd.exe //c "win32\\BUILD_CUDA_KERNEL" BGRA_to_YUV444 || exit 1
cmd.exe //c "win32\\BUILD_CUDA_KERNEL" XRGB_to_NV12 || exit 1
cmd.exe //c "win32\\BUILD_CUDA_KERNEL" XRGB_to_YUV444 || exit 1
fi

echo "* Building Python 2.7 Cython modules"
Expand Down
75 changes: 75 additions & 0 deletions src/xpra/codecs/cuda_common/ARGB_to_NV12.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* This file is part of Xpra.
* Copyright (C) 2013-2017 Antoine Martin <[email protected]>
* Xpra is released under the terms of the GNU GPL v2, or, at your option, any
* later version. See the file COPYING for details.
*/

#include <stdint.h>

extern "C" __global__ void ARGB_to_NV12(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
int w, int h)
{
uint32_t gx, gy;
gx = blockIdx.x * blockDim.x + threadIdx.x;
gy = blockIdx.y * blockDim.y + threadIdx.y;

uint32_t src_y = gy*2 * src_h / dst_h;
uint32_t src_x = gx*2 * src_w / dst_w;

if ((src_x < w) & (src_y < h)) {
//4 bytes per pixel, and 2 pixels width/height at a time:
//byte index:
uint32_t si = (src_y * srcPitch) + src_x * 4;

//we may read up to 4 32-bit RGB pixels:
uint8_t R[4];
uint8_t G[4];
uint8_t B[4];
uint8_t j = 0;
R[0] = srcImage[si+1];
G[0] = srcImage[si+2];
B[0] = srcImage[si+3];
for (j=1; j<4; j++) {
R[j] = R[0];
G[j] = G[0];
B[j] = B[0];
}

//write up to 4 Y pixels:
uint32_t di = (gy * 2 * dstPitch) + gx * 2;
dstImage[di] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
if (gx*2 + 1 < src_w) {
R[1] = srcImage[si+5];
G[1] = srcImage[si+6];
B[1] = srcImage[si+7];
dstImage[di + 1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
}
if (gy*2 + 1 < src_h) {
si += srcPitch;
di += dstPitch;
R[2] = srcImage[si+1];
G[2] = srcImage[si+2];
B[2] = srcImage[si+3];
dstImage[di] = __float2int_rn(0.257 * R[2] + 0.504 * G[2] + 0.098 * B[2] + 16);
if (gx*2 + 1 < src_w) {
R[3] = srcImage[si+5];
G[3] = srcImage[si+6];
B[3] = srcImage[si+7];
dstImage[di + 1] = __float2int_rn(0.257 * R[3] + 0.504 * G[3] + 0.098 * B[3] + 16);
}
}

//write 1 U and 1 V pixel:
float u = 0;
float v = 0;
for (j=0; j<4; j++) {
u += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
v += 0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
}
di = (dst_h + gy) * dstPitch + gx * 2;
dstImage[di] = __float2int_rn(u / 4.0);
dstImage[di + 1] = __float2int_rn(v / 4.0);
}
}
39 changes: 39 additions & 0 deletions src/xpra/codecs/cuda_common/ARGB_to_YUV444.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* This file is part of Xpra.
* Copyright (C) 2013-2017 Antoine Martin <[email protected]>
* Xpra is released under the terms of the GNU GPL v2, or, at your option, any
* later version. See the file COPYING for details.
*/

#include <stdint.h>

extern "C" __global__ void ARGB_to_YUV444(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
int w, int h)
{
uint32_t gx, gy;
gx = blockIdx.x * blockDim.x + threadIdx.x;
gy = blockIdx.y * blockDim.y + threadIdx.y;

uint32_t src_y = gy * src_h / dst_h;
uint32_t src_x = gx * src_w / dst_w;

if ((src_x < w) & (src_y < h)) {
uint8_t R;
uint8_t G;
uint8_t B;
//one 32-bit RGB pixel at a time:
uint32_t si = (src_y * srcPitch) + src_x * 4;
R = srcImage[si+1];
G = srcImage[si+2];
B = srcImage[si+3];

uint32_t di;
di = (gy * dstPitch) + gx;
dstImage[di] = __float2int_rn(0.257 * R + 0.504 * G + 0.098 * B + 16);
di += dstPitch*dst_h;
dstImage[di] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128);
di += dstPitch*dst_h;
dstImage[di] = __float2int_rn(0.439 * R - 0.368 * G - 0.071 * B + 128);
}
}
26 changes: 10 additions & 16 deletions src/xpra/codecs/nvenc/encoder.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,8 @@ def get_COLORSPACES(encoding):
COLORSPACES = {
"BGRX" : out_cs,
"BGRA" : out_cs,
"XRGB" : out_cs,
"ARGB" : out_cs,
}
return COLORSPACES

Expand Down Expand Up @@ -1325,7 +1327,7 @@ def get_spec(encoding, colorspace):
#FIXME: we should probe this using WIDTH_MAX, HEIGHT_MAX!
global MAX_SIZE
max_w, max_h = MAX_SIZE.get(encoding, (4096, 4096))
has_lossless_mode = colorspace in ("BGRX", "BGRA" ) and encoding=="h264"
has_lossless_mode = colorspace in ("XRGB", "ARGB", "BGRX", "BGRA" ) and encoding=="h264"
cs = video_spec(encoding=encoding, output_colorspaces=get_COLORSPACES(encoding)[colorspace], has_lossless_mode=LOSSLESS_CODEC_SUPPORT.get(encoding, LOSSLESS_ENABLED),
codec_class=Encoder, codec_type=get_type(),
quality=60+has_lossless_mode*40, speed=100, setup_cost=80, cpu_cost=10, gpu_cost=100,
Expand Down Expand Up @@ -1402,16 +1404,6 @@ cdef inline raiseNVENC(NVENCSTATUS ret, msg):
if ret!=0:
raise NVENCException(ret, msg)

cpdef get_CUDA_CSC_function(int device_id, function_name):
return function_name, get_CUDA_function(device_id, function_name)


cpdef get_BGRA2NV12(int device_id):
return get_CUDA_CSC_function(device_id, "BGRA_to_NV12")

cpdef get_BGRA2YUV444(int device_id):
return get_CUDA_CSC_function(device_id, "BGRA_to_YUV444")


cdef class Encoder:
cdef unsigned int width
Expand Down Expand Up @@ -1519,6 +1511,7 @@ cdef class Encoder:
def init_context(self, int width, int height, src_format, dst_formats, encoding, int quality, int speed, scaling, options={}): #@DuplicatedSignature
assert NvEncodeAPICreateInstance is not None, "encoder module is not initialized"
log("init_context%s", (width, height, src_format, dst_formats, encoding, quality, speed, scaling, options))
assert src_format in ("ARGB", "XRGB", "BGRA", "BGRX"), "invalid source format %s" % src_format
self.width = width
self.height = height
self.speed = speed
Expand Down Expand Up @@ -1656,23 +1649,23 @@ cdef class Encoder:
da = driver.device_attribute
if self.pixel_format=="BGRX":
assert NATIVE_RGB
kernel_gen = None
kernel_name = None
self.bufferFmt = NV_ENC_BUFFER_FORMAT_ARGB
plane_size_div= 1
wmult = 4
hmult = 1
#if supported (separate plane flag), use YUV444P:
elif self.pixel_format=="YUV444P":
assert YUV444_CODEC_SUPPORT.get(self.encoding, YUV444_ENABLED), "YUV444 is not enabled for %s" % self.encoding
kernel_gen = get_BGRA2YUV444
kernel_name = "%s_to_YUV444" % (self.src_format.replace("X", "A")) #ie: ARGB_to_YUV444
self.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444
#3 full planes:
plane_size_div = 1
wmult = 1
hmult = 3
elif self.pixel_format=="NV12":
assert YUV420_ENABLED
kernel_gen = get_BGRA2NV12
kernel_name = "%s_to_NV12" % (self.src_format.replace("X", "A")) #ie: BGRA_to_NV12
self.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12
#1 full Y plane and 2 U+V planes subsampled by 4:
plane_size_div = 2
Expand All @@ -1688,9 +1681,10 @@ cdef class Encoder:
self.cudaOutputBuffer, self.outputPitch = driver.mem_alloc_pitch(self.encoder_width*wmult, self.encoder_height*hmult//plane_size_div, 16)
log("CUDA Output Buffer=%#x, pitch=%s", int(self.cudaOutputBuffer), self.outputPitch)

if kernel_gen:
if kernel_name:
#load the kernel:
self.kernel_name, self.kernel = kernel_gen(self.cuda_device_id)
self.kernel = get_CUDA_function(self.cuda_device_id, kernel_name)
self.kernel_name = kernel_name
assert self.kernel, "failed to load %s for device %i" % (self.kernel_name, self.cuda_device_id)
#allocate CUDA input buffer (on device) 32-bit RGBX
#(and make it bigger just in case - subregions from XShm can have a huge rowstride)
Expand Down
9 changes: 8 additions & 1 deletion src/xpra/codecs/nvfbc/capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
import time
import os.path

from xpra.util import envbool
from xpra.log import Logger, add_debug_category
log = Logger("encoder", "nvfbc")

USE_NVFBC_CUDA = envbool("XPRA_NVFBC_CUDA", False)


def main():
if "-v" in sys.argv or "--verbose" in sys.argv:
log.enable_debug()
Expand All @@ -34,7 +38,10 @@ def main():
print_nested_dict(fbc_capture.get_status(), print_fn=log.info)
try:
log("creating test capture class")
c = fbc_capture.NvFBC_SysCapture()
if USE_NVFBC_CUDA:
c = fbc_capture.NvFBC_CUDACapture() #@UndefinedVariable
else:
c = fbc_capture.NvFBC_SysCapture() #@UndefinedVariable
log("Capture=%s", c)
c.init_context()
except Exception as e:
Expand Down
27 changes: 8 additions & 19 deletions src/xpra/codecs/nvfbc/fbc_capture_linux.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ DEFAULT_PIXEL_FORMAT = os.environ.get("XPRA_NVFBC_DEFAULT_PIXEL_FORMAT", "RGB")

ctypedef unsigned long DWORD
ctypedef int BOOL
ctypedef unsigned int CUdeviceptr
ctypedef unsigned long CUdeviceptr

cdef extern from "string.h":
void* memset(void * ptr, int value, size_t num)
Expand Down Expand Up @@ -444,7 +444,7 @@ def get_info():


PIXEL_FORMAT_CONST = {
"BGRX" : NVFBC_BUFFER_FORMAT_ARGB,
"XRGB" : NVFBC_BUFFER_FORMAT_ARGB,
"RGB" : NVFBC_BUFFER_FORMAT_RGB,
"YUV420P" : NVFBC_BUFFER_FORMAT_YUV420P,
"YUV444P" : NVFBC_BUFFER_FORMAT_YUV444P,
Expand Down Expand Up @@ -553,35 +553,27 @@ cdef class NvFBC_CUDACapture:

cdef object __weakref__

def init_context(self, int width=-1, int height=-1, pixel_format="BGRX"):
def init_context(self, int width=-1, int height=-1, pixel_format="XRGB"):
log("init_context(%i, %i, %s)", width, height, pixel_format)
if pixel_format not in ("BGRA", "BGRX", "YUV420P", "YUV444P"):
if pixel_format not in PIXEL_FORMAT_CONST:
raise Exception("unsupported pixel format '%s'" % pixel_format)
cdef NVFBC_BUFFER_FORMAT buffer_format = PIXEL_FORMAT_CONST[pixel_format]
self.pixel_format = pixel_format
#CUDA init:
self.cuda_device_id, self.cuda_device = select_device()
if not self.cuda_device:
raise Exception("no valid CUDA device")
d = self.cuda_device
cf = driver.ctx_flags
self.cuda_context = d.make_context(flags=cf.SCHED_AUTO | cf.MAP_HOST)
self.cuda_context = d.make_context(flags=cf.SCHED_AUTO)
assert self.cuda_context, "failed to create a CUDA context for device %s" % device_info(d)
self.cuda_context.pop()
self.cuda_context.push()
self.context = create_context()
get_context_status(self.context)
create_capture_session(self.context, NVFBC_CAPTURE_SHARED_CUDA)
cdef NVFBC_TOCUDA_SETUP_PARAMS params
memset(&params, 0, sizeof(NVFBC_TOCUDA_SETUP_PARAMS))
params.dwVersion = NVFBC_TOCUDA_SETUP_PARAMS_VER
if pixel_format in ("BGRX", "BGRA"):
params.eBufferFormat = NVFBC_BUFFER_FORMAT_ARGB
elif pixel_format=="YUV420P":
params.eBufferFormat = NVFBC_BUFFER_FORMAT_YUV420P
elif pixel_format=="YUV444P":
params.eBufferFormat = NVFBC_BUFFER_FORMAT_YUV444P
else:
raise Exception("invalid pixel format %s" % pixel_format)
params.eBufferFormat = buffer_format
cdef NVFBCSTATUS res = <NVFBCSTATUS> function_list.nvFBCToCudaSetUp(self.context, &params)
self.raiseNvFBC(res, "NvFBCCudaSetup")
log("nvFBCToCudaSetUp()=%i", res)
Expand All @@ -606,7 +598,6 @@ cdef class NvFBC_CUDACapture:

def get_image(self, x=0, y=0, width=0, height=0):
log("get_image%s", (x, y, width, height))
#self.cuda_context.push()
cdef double start = monotonic_time()
cdef CUdeviceptr cuDevicePtr = 0
cdef NVFBC_FRAME_GRAB_INFO grab_info
Expand All @@ -630,15 +621,13 @@ cdef class NvFBC_CUDACapture:
cdef double end = monotonic_time()
log("NvFBCCudaGrabFrame: size=%#x, elapsed=%ims", grab_info.dwHeight*grab_info.dwWidth, int((end-start)*1000))
log("NvFBCCudaGrabFrame: info=%s", info)
#allocate CUDA device memory:
if not self.cuda_device_buffer or self.buffer_size!=grab_info.dwByteSize:
#allocate CUDA device memory:
self.buffer_size = grab_info.dwByteSize
self.cuda_device_buffer = driver.mem_alloc(self.buffer_size)
log("buffer_size=%#x, cuda device buffer=%s", self.buffer_size, self.cuda_device_buffer)
#copy to the buffer we own:
log("memcpy_dtod(%#x, %#x, %#x)", int(self.cuda_device_buffer), int(cuDevicePtr), self.buffer_size)
driver.memcpy_dtod(int(self.cuda_device_buffer), int(cuDevicePtr), self.buffer_size)
#self.cuda_context.pop()
Bpp = len(self.pixel_format) # ie: "BGR" -> 3
image = CUDAImageWrapper(0, 0, int(grab_info.dwWidth), int(grab_info.dwHeight), None, self.pixel_format, Bpp*8, int(grab_info.dwWidth*Bpp), Bpp)
image.cuda_device_buffer = self.cuda_device_buffer
Expand Down
2 changes: 1 addition & 1 deletion src/xpra/x11/shadow_x11_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
USE_XSHM = envbool("XPRA_XSHM", True)
POLL_CURSOR = envint("XPRA_POLL_CURSOR", 20)
USE_NVFBC = envbool("XPRA_NVFBC", True)
USE_NVFBC_CUDA = envbool("XPRA_NVFBC_CUDA", False)
USE_NVFBC_CUDA = envbool("XPRA_NVFBC_CUDA", True)
if USE_NVFBC:
try:
from xpra.codecs.nvfbc.fbc_capture_linux import init_module, NvFBC_SysCapture, NvFBC_CUDACapture #@UnresolvedImport
Expand Down

0 comments on commit 41035ec

Please sign in to comment.