Skip to content

Commit

Permalink
kernel: Bump to 3.18.9
Browse files Browse the repository at this point in the history
kernel: Align pcm512x driver with upstream
See: raspberrypi/linux#873

kernel: dts: overlay: add mz61581, fix piscreen and rpi-display
See: raspberrypi/linux#874

kernel: bcm2709: Increase the spare/free IRQs to match bcm2708
See: raspberrypi/linux#871

kernel: add support for Adafruit PiTFT
See: raspberrypi/linux#858

firmware: dispserve: Allow vsync requests from multiple clients
See: raspberrypi/userland#218

firmware: hello_fft: Update to version 3
See: http://www.aholme.co.uk/GPU_FFT/Main.htm
  • Loading branch information
popcornmix committed Mar 7, 2015
1 parent cdcb506 commit 2aad6d8
Show file tree
Hide file tree
Showing 5,635 changed files with 151,174 additions and 143,215 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
Binary file modified boot/fixup.dat
Binary file not shown.
Binary file modified boot/fixup_x.dat
Binary file not shown.
Binary file modified boot/kernel.img
Binary file not shown.
Binary file modified boot/kernel7.img
Binary file not shown.
Binary file added boot/overlays/mz61581-overlay.dtb
Binary file not shown.
Binary file modified boot/overlays/piscreen-overlay.dtb
Binary file not shown.
Binary file added boot/overlays/pitft28-resistive-overlay.dtb
Binary file not shown.
Binary file modified boot/overlays/rpi-display-overlay.dtb
Binary file not shown.
Binary file modified boot/start.elf
Binary file not shown.
Binary file modified boot/start_cd.elf
Binary file not shown.
Binary file modified boot/start_x.elf
Binary file not shown.
11,924 changes: 5,967 additions & 5,957 deletions extra/Module.symvers

Large diffs are not rendered by default.

12,142 changes: 6,076 additions & 6,066 deletions extra/Module7.symvers

Large diffs are not rendered by default.

103,441 changes: 51,798 additions & 51,643 deletions extra/System.map

Large diffs are not rendered by default.

105,070 changes: 52,612 additions & 52,458 deletions extra/System7.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion extra/git_hash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
91a03559bbc1b5cb2b1157150c1c698b22716d9a
780e68130fba82a525b89e85f051c91b7a508e52
2 changes: 1 addition & 1 deletion extra/uname_string
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Linux version 3.18.8+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #765 PREEMPT Thu Mar 5 15:41:59 GMT 2015
Linux version 3.18.9+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #767 PREEMPT Sat Mar 7 21:41:13 GMT 2015
2 changes: 1 addition & 1 deletion extra/uname_string7
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Linux version 3.18.8-v7+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #765 SMP PREEMPT Thu Mar 5 15:47:06 GMT 2015
Linux version 3.18.9-v7+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #767 SMP PREEMPT Sat Mar 7 21:52:35 GMT 2015
1 change: 1 addition & 0 deletions hardfp/opt/vc/include/interface/mmal/mmal_encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/** \name Pre-defined video encodings */
/* @{ */
#define MMAL_ENCODING_H264 MMAL_FOURCC('H','2','6','4')
#define MMAL_ENCODING_MVC MMAL_FOURCC('M','V','C',' ')
#define MMAL_ENCODING_H263 MMAL_FOURCC('H','2','6','3')
#define MMAL_ENCODING_MP4V MMAL_FOURCC('M','P','4','V')
#define MMAL_ENCODING_MP2V MMAL_FOURCC('M','P','2','V')
Expand Down
Binary file modified hardfp/opt/vc/lib/libEGL_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libGLESv2_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libdebug_sym_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libkhrn_client.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libkhrn_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libmmal_util.so
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvcfiled_check.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvchostif.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvcilcs.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvmcs_rpc_client.a
Binary file not shown.
6 changes: 3 additions & 3 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BCM2835 "GPU_FFT" release 2.0
Copyright (c) 2014, Andrew Holme.
BCM2835 "GPU_FFT" release 3.0
Copyright (c) 2015, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -36,7 +36,7 @@ typedef struct GPU_FFT_COMPLEX COMPLEX;

int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...20
int log2_N, // log2(FFT_length) = 8...22
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft) {
Expand Down
8 changes: 4 additions & 4 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BCM2835 "GPU_FFT" release 2.0
Copyright (c) 2014, Andrew Holme.
BCM2835 "GPU_FFT" release 3.0
Copyright (c) 2015, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -51,7 +51,7 @@ struct GPU_FFT_PTR {

struct GPU_FFT_BASE {
int mb;
unsigned handle, size, vc_msg, vc_code, vc_unifs[GPU_FFT_QPUS];
unsigned handle, size, vc_msg, vc_code, vc_unifs[GPU_FFT_QPUS], peri_size;
volatile unsigned *peri;
};

Expand All @@ -63,7 +63,7 @@ struct GPU_FFT {

int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...20
int log2_N, // log2(FFT_length) = 8...22
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft);
Expand Down
50 changes: 26 additions & 24 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
BCM2835 "GPU_FFT" release 2.0 by Andrew Holme, 2014.
BCM2835 "GPU_FFT" release 3.0 by Andrew Holme, 2015.

GPU_FFT is an FFT library for the Raspberry Pi which exploits the BCM2835 SoC
3D hardware to deliver ten times more data throughput than is possible on the
700 MHz ARM. Kernels are provided for all power-of-2 FFT lengths between 256
and 2,097,152 points inclusive. A transpose function, which also uses the 3D
hardware, is provided to support 2-dimensional transforms.
700 MHz ARM of the Pi 1. Kernels are provided for all power-of-2 FFT lengths
between 256 and 4,194,304 points inclusive. A transpose function, which also
uses the 3D hardware, is provided to support 2-dimensional transforms.


*** Accuracy ***
Expand All @@ -13,32 +13,36 @@ GPU_FFT uses single-precision floats for data and twiddle factors. The output
is not scaled. The relative root-mean-square (rms) error in parts-per-million
(ppm) for different transform lengths (N) is typically:

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17
ppm rms | 0.27 | 0.42 | 0.50 | 0.70 | 2.3 | 4.4 | 7.6 | 9.2 | 18 | 70
log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15
ppm rms | 0.33 | 0.46 | 0.52 | 0.59 | 0.78 | 0.83 | 0.92 | 0.98

log2(N) | 18 | 19 | 20 | 21 | 8...17 batch of 10
ppm rms | 100 | 180 | 360 | 720 | 18...21 batch of 1
log2(N) | 16 | 17 | 18 | 19 | 20 | 21 | 22
ppm rms | 1.0 | 1.3 | 1.3 | 1.4 | 1.5 | 1.5 | 1.5

Accuracy has improved significantly over previous releases at the expense of a
small (2%) performance hit; however, FFTW is still one order of magnitude more
accurate than GPU_FFT.


*** Throughput ***

GPU_FFT 1.0 had to be invoked through a "mailbox" which added a 100us overhead
on every call. To mitigate this, batches of transforms could be submitted via
a single call. GPU_FFT 2.0 avoids this 100us overhead by poking GPU registers
a single call. GPU_FFT now avoids this 100us overhead by poking GPU registers
directly from the ARM if total batch runtime will be short; but still uses the
mailbox for longer jobs to avoid busy waiting at 100% CPU for too long.

Typical per-transform runtimes for batch sizes of 1 and 10; and comparative
figures for FFTW (FFTW_MEASURE mode) are:
figures for FFTW (FFTW_MEASURE mode) on a Pi 1 with L2 cache enabled are:

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
1 | 0.036 | 0.051 | 0.070 | 0.11 | 0.24 | 0.58 | 1.2 | 3.3 |
10 | 0.016 | 0.027 | 0.045 | 0.095 | 0.25 | 0.61 | 1.2 | 3.2 |
FFTW | 0.092 | 0.22 | 0.48 | 0.95 | 3.0 | 5.1 | 12 | 31 |
log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15
1 | 0.033 | 0.049 | 0.070 | 0.12 | 0.25 | 0.61 | 1.2 | 3.5
10 | 0.017 | 0.029 | 0.049 | 0.11 | 0.27 | 0.66 | 1.2 | 3.3
FFTW | 0.092 | 0.22 | 0.48 | 0.95 | 3.0 | 5.1 | 12 | 31

log2(N) | 16 | 17 | 18 | 19 | 20 | 21 | All times in
1 | 6.8 | 16 | 42 | 95 | 190 | 380 | milliseconds
FFTW | 83 | 180 | 560 | 670 | 1600 | 3400 | 2 sig. figs.
log2(N) | 16 | 17 | 18 | 19 | 20 | 21 | 22 | All times in
1 | 7.0 | 17 | 43 | 97 | 194 | 388 | 786 | milliseconds
FFTW | 83 | 180 | 560 | 670 | 1600 | 3400 | 8800 | 2 sig. figs.


*** API functions ***
Expand All @@ -57,7 +61,7 @@ log2(N) | 16 | 17 | 18 | 19 | 20 | 21 | All times in

int mb Mailbox file descriptor obtained by calling mbox_open()

int log2_N log2(FFT length) = 8 to 21
int log2_N log2(FFT length) = 8 to 22

int direction FFT direction: GPU_FFT_FWD for forward FFT
GPU_FFT_REV for inverse FFT
Expand Down Expand Up @@ -105,7 +109,6 @@ as a demo with the latest Raspbian distro. Build and run it as follows:

cd /opt/vc/src/hello_pi/hello_fft
make
sudo mknod char_dev c 100 0
sudo ./hello_fft.bin 12

It accepts three optional command-line arguments: <log2_N> <batch> <loops>
Expand All @@ -114,13 +117,12 @@ The special character device is required for the ioctl mailbox through which
the ARM communicates with the VideoCore GPU.


*** With Open GL ***
*** With Open GL on Pi 1 ***

GPU_FFT and Open GL will run concurrently if the GPU_FFT_MEM_* defines in
file gpu_fft.c are changed as follows:
GPU_FFT and Open GL will run concurrently on Pi 1 if GPU_FFT is configured not
to use VC4 L2 cache by zeroing a define in file gpu_fft_base.c as follows:

#define GPU_FFT_MEM_FLG 0x4 // cached=0xC; direct=0x4
#define GPU_FFT_MEM_MAP 0x20000000 // cached=0x0; direct=0x20000000
#define GPU_FFT_USE_VC4_L2_CACHE 0 // Pi 1 only: cached=1; direct=0

Overall performance will probably be higher if GPU_FFT and Open GL take turns
at using the 3D hardware. Since eglSwapBuffers() returns immediately without
Expand Down
69 changes: 55 additions & 14 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft_base.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BCM2835 "GPU_FFT" release 2.0
Copyright (c) 2014, Andrew Holme.
BCM2835 "GPU_FFT" release 3.0
Copyright (c) 2015, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand All @@ -26,7 +26,8 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "bcm_host.h"
#include <dlfcn.h>

#include "gpu_fft.h"
#include "mailbox.h"

Expand All @@ -42,11 +43,48 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define V3D_DBQITE (0xC00e2c>>2)
#define V3D_DBQITC (0xC00e30>>2)

#define GPU_FFT_MEM_MAP 0x0 // cached=0x0; direct=0x20000000
// Setting this define to zero on Pi 1 allows GPU_FFT and Open GL
// to co-exist and also improves performance of longer transforms:
#define GPU_FFT_USE_VC4_L2_CACHE 1 // Pi 1 only: cached=1; direct=0

#define GPU_FFT_NO_FLUSH 1
#define GPU_FFT_TIMEOUT 2000 // ms

// Host-dependent settings filled in by gpu_fft_get_host_info().
struct GPU_FFT_HOST {
    unsigned mem_flg;    // flags passed to mem_alloc() for the shared buffer
    unsigned mem_map;    // offset added to the VC address before BUS_TO_PHYS()
    unsigned peri_addr;  // address passed to mapmem() for the peripheral window
    unsigned peri_size;  // length passed to mapmem() for the peripheral window
};

/*
 * Work out the shared-memory flags, ARM mapping offset and peripheral
 * window for the board we are running on.
 *
 * *info is first loaded with Pi 1 defaults.  libbcm_host.so is then opened
 * at run time and each bcm_host_get_* query that can be resolved is used to
 * refine those values; a query that cannot be resolved leaves the matching
 * default untouched.
 *
 * Returns 0 on success, or -1 if libbcm_host.so cannot be opened (in which
 * case *info still holds the Pi 1 defaults).
 */
int gpu_fft_get_host_info(struct GPU_FFT_HOST *info) {
    unsigned (*query_sdram_addr)(void);
    unsigned (*query_peri_addr) (void);
    unsigned (*query_peri_size) (void);
    void *lib;

    // Pi 1 defaults
    info->peri_addr = 0x20000000;
    info->peri_size = 0x01000000;
    info->mem_flg   = GPU_FFT_USE_VC4_L2_CACHE? 0xC : 0x4;
    info->mem_map   = GPU_FFT_USE_VC4_L2_CACHE? 0x0 : 0x20000000; // Pi 1 only

    lib = dlopen("libbcm_host.so", RTLD_LAZY);
    if (!lib) {
        return -1;
    }

    // dlsym() returns void *, so store it through a void ** view of each
    // function pointer rather than casting the result directly.
    *(void **) (&query_sdram_addr) = dlsym(lib, "bcm_host_get_sdram_address");
    *(void **) (&query_peri_addr)  = dlsym(lib, "bcm_host_get_peripheral_address");
    *(void **) (&query_peri_size)  = dlsym(lib, "bcm_host_get_peripheral_size");

    if (query_sdram_addr) {
        if (query_sdram_addr() != 0x40000000) { // Pi 2?
            info->mem_flg = 0x4; // ARM cannot see VC4 L2 on Pi 2
            info->mem_map = 0x0;
        }
    }

    if (query_peri_addr) {
        info->peri_addr = query_peri_addr();
    }
    if (query_peri_size) {
        info->peri_size = query_peri_size();
    }

    dlclose(lib);
    return 0;
}

unsigned gpu_fft_base_exec_direct (
struct GPU_FFT_BASE *base,
int num_qpus) {
Expand Down Expand Up @@ -95,43 +133,46 @@ int gpu_fft_alloc (
unsigned size,
struct GPU_FFT_PTR *ptr) {

struct GPU_FFT_HOST host;
struct GPU_FFT_BASE *base;
volatile unsigned *peri;
unsigned handle;

if (gpu_fft_get_host_info(&host)) return -5;

if (qpu_enable(mb, 1)) return -1;

// Shared memory : cached=0xC; direct=0x4
unsigned mem_flg = bcm_host_get_sdram_address() == 0x40000000 ? 0xC : 0x4;
handle = mem_alloc(mb, size, 4096, mem_flg);
// Shared memory
handle = mem_alloc(mb, size, 4096, host.mem_flg);
if (!handle) {
qpu_enable(mb, 0);
return -3;
}

peri = (volatile unsigned *) mapmem(bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size());
peri = (volatile unsigned *) mapmem(host.peri_addr, host.peri_size);
if (!peri) {
mem_free(mb, handle);
qpu_enable(mb, 0);
return -4;
}

ptr->vc = mem_lock(mb, handle);
ptr->arm.vptr = mapmem(BUS_TO_PHYS(ptr->vc+GPU_FFT_MEM_MAP), size);
ptr->arm.vptr = mapmem(BUS_TO_PHYS(ptr->vc+host.mem_map), size);

base = (struct GPU_FFT_BASE *) ptr->arm.vptr;
base->peri = peri;
base->mb = mb;
base->handle = handle;
base->size = size;
base->peri = peri;
base->peri_size = host.peri_size;
base->mb = mb;
base->handle = handle;
base->size = size;

return 0;
}

void gpu_fft_base_release(struct GPU_FFT_BASE *base) {
int mb = base->mb;
unsigned handle = base->handle, size = base->size;
unmapmem((void*)base->peri, bcm_host_get_peripheral_size());
unmapmem((void*)base->peri, base->peri_size);
unmapmem((void*)base, size);
mem_unlock(mb, handle);
mem_free(mb, handle);
Expand Down
10 changes: 7 additions & 3 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft_shaders.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BCM2835 "GPU_FFT" release 2.0
Copyright (c) 2014, Andrew Holme.
BCM2835 "GPU_FFT" release 3.0
Copyright (c) 2015, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -68,6 +68,9 @@ static unsigned int shader_1024k[] = {
// Contents are pulled in at compile time from hex/shader_2048k.hex
// (presumably the GPU code for the 2048k-point transform - confirm).
static unsigned int shader_2048k[] = {
#include "hex/shader_2048k.hex"
};
// Contents are pulled in at compile time from hex/shader_4096k.hex
// (presumably the GPU code for the 4096k-point transform - confirm).
static unsigned int shader_4096k[] = {
#include "hex/shader_4096k.hex"
};

static struct {
unsigned int size, *code;
Expand All @@ -86,7 +89,8 @@ shaders[] = {
{sizeof(shader_256k), shader_256k},
{sizeof(shader_512k), shader_512k},
{sizeof(shader_1024k), shader_1024k},
{sizeof(shader_2048k), shader_2048k}
{sizeof(shader_2048k), shader_2048k},
{sizeof(shader_4096k), shader_4096k}
};

unsigned int gpu_fft_shader_size(int log2_N) {
Expand Down
31 changes: 27 additions & 4 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft_twiddles.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BCM2835 "GPU_FFT" release 2.0
Copyright (c) 2014, Andrew Holme.
BCM2835 "GPU_FFT" release 3.0
Copyright (c) 2015, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -85,6 +85,15 @@ static float *twiddles_step_32(double two_pi, float *out, double theta) {
return twiddles_step_16(two_pi, out, 2*theta);
}

// Write 32 identical (ALPHA(theta), BETA(theta)) pairs at out, then hand the
// advanced pointer to twiddles_step_32() with the angle doubled.
// Returns whatever twiddles_step_32() returns.
static float *twiddles_step_64(double two_pi, float *out, double theta) {
    float *const stop = out + 2*32; // 32 pairs, two floats per pair

    while (out != stop) {
        out[0] = ALPHA(theta);
        out[1] = BETA(theta);
        out += 2;
    }
    return twiddles_step_32(two_pi, out, 2*theta);
}

/****************************************************************************/

static void twiddles_256(double two_pi, float *out) {
Expand Down Expand Up @@ -256,6 +265,19 @@ static void twiddles_2048k(double two_pi, float *out) {
out = twiddles_base_32(two_pi, out, two_pi/N*q);
}

// Fill out with the twiddle tables for N = 4096*1024 points: one base-64
// table, then a step-64 table and two step-32 tables, and finally one
// base-32 table per QPU, each at angle two_pi/N*q for QPU index q.
static void twiddles_4096k(double two_pi, float *out) {
    const double points = 4096*1024;
    int qpu = 0;

    out = twiddles_base_64(two_pi, out);
    out = twiddles_step_64(two_pi, out, two_pi/points * 32*32);
    out = twiddles_step_32(two_pi, out, two_pi/points * 32);
    out = twiddles_step_32(two_pi, out, two_pi/points * GPU_FFT_QPUS);

    while (qpu < GPU_FFT_QPUS) {
        out = twiddles_base_32(two_pi, out, two_pi/points*qpu);
        qpu++;
    }
}

/****************************************************************************/

static struct {
Expand All @@ -276,11 +298,12 @@ shaders[] = {
{4, 6, 2, twiddles_256k},
{4, 7, 2, twiddles_512k},
{4, 8, 2, twiddles_1024k},
{4,10, 2, twiddles_2048k}
{4,10, 2, twiddles_2048k},
{4,12, 2, twiddles_4096k}
};

int gpu_fft_twiddle_size(int log2_N, int *shared, int *unique, int *passes) {
if (log2_N<8 || log2_N>21) return -1;
if (log2_N<8 || log2_N>22) return -1;
*shared = shaders[log2_N-8].shared;
*unique = shaders[log2_N-8].unique;
*passes = shaders[log2_N-8].passes;
Expand Down
Loading

0 comments on commit 2aad6d8

Please sign in to comment.