Examples of auto vectorizable codes

These codes are provided for checking how compilers vectorize source codes. These codes are not meant for practical use.

All source codes in this page are in public domain unless otherwise stated.

SRGB to linear image conversion

Just a simple example.

#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) float in[N][N], out[N][N];

static float srgb2linear_pix(float c) {
  float r = pow((c + 0.055) / (1 + 0.055), 2.4);
  return c < 0.04045 ? (c * (1.0 / 12.92)) : r;
}

void srgb2linear(void) {
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      out[y][x] = srgb2linear_pix(in[y][x]);
    }
  }
}

Finding roots of cubic equations

More complicated than the previous one. Conditional selection from two values.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[4][N], out[3][N];

typedef struct { double x, y, z; } double3;

static double3 execute(double a, double b, double c, double d) {
  double s1 = b*(1/a), s2 = c*(1/a), s3 = d*(1/a);
  double p = s2 - 1/3.0*s1*s1;
  double q = s3 - 1/3.0*s1*s2 + 2/27.0*s1*s1*s1;
  double z = q*q + 4/27.0*p*p*p;

  double w = cbrt((-q + sqrt(z)) * 0.5) + cbrt((-q - sqrt(z)) * 0.5) - 1/3.0*s1;

  double th = acos(0.5*(3.0/p)*q*sqrt(-(3.0/p)));
  double w0 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*0/3)-1/3.0*s1;
  double w1 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*1/3)-1/3.0*s1;
  double w2 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*2/3)-1/3.0*s1;

  double3 ret = { NAN, NAN, NAN };
  if (z >= 0) {
    ret.x = w;
  } else {
    ret.x = w0; ret.y = w1; ret.z = w2;
  }

  return ret;
}

void cardanoN(void) {
  for (int i = 0; i < N; i++) {
    double3 r = execute(in[3][i], in[2][i], in[1][i], in[0][i]);
    out[0][i] = r.x; out[1][i] = r.y; out[2][i] = r.z;
  }
}

int main(int argc, char **argv) {
  double r[N][3];

  for(int i=0;i<N;i++) {
    for(int j=0;j<3;j++)
      r[i][j] = (2.0 * rand() / RAND_MAX - 1) * 10;

    in[3][i] = 1;
    in[2][i] = - r[i][0] - r[i][1] - r[i][2];
    in[1][i] = + r[i][0] * r[i][1] + r[i][1] * r[i][2] + r[i][0] * r[i][2];
    in[0][i] = - r[i][0] * r[i][1] * r[i][2];
  }

  cardanoN();

  for(int i=0;i<N;i++)
    printf("%g, %g, %g : %g, %g, %g\n", out[0][i], out[1][i], out[2][i], r[i][0], r[i][1], r[i][2]);
}

Generating Dini's surface

The compiled code may call sincos.

#include <math.h>

typedef struct { double x, y, z; } double3;

#define N 256
__attribute__ ((__aligned__(64))) double3 out[N][N];

static double3 dini(double a, double b, double u, double v) {
  double3 ret;
  ret.x = a * cos(u) * sin(v);
  ret.y = a * sin(u) * sin(v);
  ret.z = a * (cos(v) + log(tan(v * 0.5))) + b * u;
  return ret;
}

void diniSurface(double a, double b) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      double u = 4.0 * M_PI * i / N;
      double v = 2.0 * j / N;
      out[i][j] = dini(a, b, u, v);
    }
  }
}

Factorial approximation formula by Peter Luschny

Calls to pow may be removed.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];

// Factorial approximation formula by Peter Luschny
#define c0 (1.0 / 24.0)
#define c1 (3.0 / 80.0)
#define c2 (18029.0 / 45360.0)
#define c3 (6272051.0 / 14869008.0)

static double lus(double x) {
  x += 0.5;
  double p = (pow(x, 5)+(c3+c2+c1)*pow(x, 3)+c1*c3*x) /
    (pow(x,4)+(c3+c2+c1+c0)*pow(x,2)+(c1+c0)*c3+c0*c2);
  return 0.5*log(2*M_PI) + x * (log(p)-1);
}

void factorialN() {
  for (int i = 0; i < N; i++) {
    out[i] = lus(in[i]);
  }
}

int main(int argc, char **argv) {
  for(int i=0;i<N;i++)
    in[i] = (rand() / (double)RAND_MAX) * 10;

  factorialN();

  for(int i=0;i<N;i++)
    printf("%.20g, %.20g\n", out[i], gamma(in[i]+1));
}

Evaluate Si(x) with Runge-Kutta method

I couldn't make gcc or clang vectorize this code, while Intel Compiler does.

// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.

#include <math.h>

static double F(double x, double y) { return sin(x)/x; }

#define M 1024

/* Runge-Kutta method */
static double runge4(double x0, double y0, double xn) {
    double x, y, h, h2, f1, f2, f3, f4;
    x = x0;  y = y0;  h = (xn - x0) / M;  h2 = h / 2;
    for (int i = 0; i < M; i++) {
        f1 = h * F(x, y);
        f2 = h * F(x + h2, y + f1 / 2);
        f3 = h * F(x + h2, y + f2 / 2);
        f4 = h * F(x + h, y + f3);
        x = x0 + i * h;
        y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
    }
    return y;
}

#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];

void runge4N() {
  for (int i = 0; i < N; i++)
    out[i] = runge4(1, 0.9460830703671830149413, in[i]);
}

icc-19 -xMIC-AVX512 -O3 -ffast-math

Evaluate tanh(x) with Runge-Kutta method

It seems still hard without a mathlib call.

// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 256
#define M 1024

static double F(double x, double y) { return 1 - y*y; }

static double runge4(double x0, double y0, double xn) {
    double x, y, h, h2, f1, f2, f3, f4;
    x = x0;  y = y0;  h = (xn - x0) / M;  h2 = h / 2;
    for (int i = 0; i < M; i++) {
        f1 = h * F(x, y);
        f2 = h * F(x + h2, y + f1 / 2);
        f3 = h * F(x + h2, y + f2 / 2);
        f4 = h * F(x + h, y + f3);
        x = x0 + i * h;
        y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
    }
    return y;
}

__attribute__ ((__aligned__(64))) double in[N], out[N];

void runge4N() {
  for (int i = 0; i < N; i++) out[i] = runge4(0, 0, in[i]);
}

int main(int argc, char **argv)
{
  for(int i=0;i<N;i++) in[i] = (rand() / (double)RAND_MAX) * 10;
  runge4N();
  for(int i = 0; i < N; i++) printf("%.20g %.20g\n", tanh(in[i]), out[i]);
}

icc-19 -xMIC-AVX512 -O3 -ffast-math

Unrolled 4x4 matrix inversion

This is for testing the SLP vectorizers. The results are disappointing.

double inv4(double * __restrict__ r, double * __restrict__ m) {
    const int N = 4;
    const double d12_01 = +m[1*N+0]*m[2*N+1]-m[1*N+1]*m[2*N+0];
    const double d13_01 = +m[1*N+0]*m[3*N+1]-m[1*N+1]*m[3*N+0];
    const double d23_01 = +m[2*N+0]*m[3*N+1]-m[2*N+1]*m[3*N+0];
    const double d12_02 = +m[1*N+0]*m[2*N+2]-m[1*N+2]*m[2*N+0];
    const double d12_03 = +m[1*N+0]*m[2*N+3]-m[1*N+3]*m[2*N+0];
    const double d13_02 = +m[1*N+0]*m[3*N+2]-m[1*N+2]*m[3*N+0];
    const double d13_03 = +m[1*N+0]*m[3*N+3]-m[1*N+3]*m[3*N+0];
    const double d23_02 = +m[2*N+0]*m[3*N+2]-m[2*N+2]*m[3*N+0];
    const double d23_03 = +m[2*N+0]*m[3*N+3]-m[2*N+3]*m[3*N+0];
    const double d12_12 = +m[1*N+1]*m[2*N+2]-m[1*N+2]*m[2*N+1];
    const double d12_13 = +m[1*N+1]*m[2*N+3]-m[1*N+3]*m[2*N+1];
    const double d12_23 = +m[1*N+2]*m[2*N+3]-m[1*N+3]*m[2*N+2];
    const double d13_12 = +m[1*N+1]*m[3*N+2]-m[1*N+2]*m[3*N+1];
    const double d13_13 = +m[1*N+1]*m[3*N+3]-m[1*N+3]*m[3*N+1];
    const double d13_23 = +m[1*N+2]*m[3*N+3]-m[1*N+3]*m[3*N+2];
    const double d23_12 = +m[2*N+1]*m[3*N+2]-m[2*N+2]*m[3*N+1];
    const double d23_13 = +m[2*N+1]*m[3*N+3]-m[2*N+3]*m[3*N+1];
    const double d23_23 = +m[2*N+2]*m[3*N+3]-m[2*N+3]*m[3*N+2];
    const double d012_012 = +m[0*N+0]*d12_12-m[0*N+1]*d12_02+m[0*N+2]*d12_01;
    const double d013_012 = +m[0*N+0]*d13_12-m[0*N+1]*d13_02+m[0*N+2]*d13_01;
    const double d023_012 = +m[0*N+0]*d23_12-m[0*N+1]*d23_02+m[0*N+2]*d23_01;
    const double d123_012 = +m[1*N+0]*d23_12-m[1*N+1]*d23_02+m[1*N+2]*d23_01;
    const double d012_013 = +m[0*N+0]*d12_13-m[0*N+1]*d12_03+m[0*N+3]*d12_01;
    const double d013_013 = +m[0*N+0]*d13_13-m[0*N+1]*d13_03+m[0*N+3]*d13_01;
    const double d023_013 = +m[0*N+0]*d23_13-m[0*N+1]*d23_03+m[0*N+3]*d23_01;
    const double d123_013 = +m[1*N+0]*d23_13-m[1*N+1]*d23_03+m[1*N+3]*d23_01;
    const double d012_023 = +m[0*N+0]*d12_23-m[0*N+2]*d12_03+m[0*N+3]*d12_02;
    const double d013_023 = +m[0*N+0]*d13_23-m[0*N+2]*d13_03+m[0*N+3]*d13_02;
    const double d023_023 = +m[0*N+0]*d23_23-m[0*N+2]*d23_03+m[0*N+3]*d23_02;
    const double d123_023 = +m[1*N+0]*d23_23-m[1*N+2]*d23_03+m[1*N+3]*d23_02;
    const double d012_123 = +m[0*N+1]*d12_23-m[0*N+2]*d12_13+m[0*N+3]*d12_12;
    const double d013_123 = +m[0*N+1]*d13_23-m[0*N+2]*d13_13+m[0*N+3]*d13_12;
    const double d023_123 = +m[0*N+1]*d23_23-m[0*N+2]*d23_13+m[0*N+3]*d23_12;
    const double d123_123 = +m[1*N+1]*d23_23-m[1*N+2]*d23_13+m[1*N+3]*d23_12;
    const double d0123_0123 = +m[0*N+0]*d123_123-m[0*N+1]*d123_023+m[0*N+2]*d123_013-m[0*N+3]*d123_012;
    r[0*N+0] = +d123_123 * (1.0/d0123_0123);
    r[0*N+1] = -d023_123 * (1.0/d0123_0123);
    r[0*N+2] = +d013_123 * (1.0/d0123_0123);
    r[0*N+3] = -d012_123 * (1.0/d0123_0123);
    r[1*N+0] = -d123_023 * (1.0/d0123_0123);
    r[1*N+1] = +d023_023 * (1.0/d0123_0123);
    r[1*N+2] = -d013_023 * (1.0/d0123_0123);
    r[1*N+3] = +d012_023 * (1.0/d0123_0123);
    r[2*N+0] = +d123_013 * (1.0/d0123_0123);
    r[2*N+1] = -d023_013 * (1.0/d0123_0123);
    r[2*N+2] = +d013_013 * (1.0/d0123_0123);
    r[2*N+3] = -d012_013 * (1.0/d0123_0123);
    r[3*N+0] = -d123_012 * (1.0/d0123_0123);
    r[3*N+1] = +d023_012 * (1.0/d0123_0123);
    r[3*N+2] = -d013_012 * (1.0/d0123_0123);
    r[3*N+3] = +d012_012 * (1.0/d0123_0123);
    return d0123_0123;
}

Auto-vectorizing calls to extern functions

clang-10 does not support vectorizing calls to extern functions. gcc-10 vectorizes calls to func0, but not func1. Adding an inline attribute to func2 does not change the result. icc-19 vectorizes calls to both functions.

#pragma omp declare simd notinbranch
double func0(double x, double y);

#pragma omp declare simd linear(r:1) notinbranch
void func1(double *r, double x, double y);

static double func2(double x, double y) {
  double t;
  func1(&t, x, y);
  return t;
}

//

#define N 1024
__attribute__ ((__aligned__(256))) double a[N], b[N], c[N];

void foo(void) {
  int i;

#pragma omp parallel for simd
  for (i = 0; i < N; i++)
    a[i] = func0(b[i], c[i]);

#pragma omp parallel for simd
  for (i = 0; i < N; i++)
    c[i] = func2(a[i], b[i]);
}

Interesting reads

Significant performance improvements in GCC 10 through Vectorization and In-lining

Provide feedback

Saved searches

Use saved searches to filter your results more quickly