Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3451 doubleword #3472

Merged
merged 3 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/silx/opencl/test/test_doubleword.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
__contact__ = "[email protected]"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "18/05/2021"
__date__ = "31/05/2021"

import unittest
import numpy
Expand Down Expand Up @@ -99,7 +99,7 @@ def tearDownClass(cls):
def test_fast_sum2(self):
test_kernel = ElementwiseKernel(self.ctx,
"float *a, float *b, float *res_h, float *res_l",
"float2 tmp = fast_sum2(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
"float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
preamble=self.doubleword)
a_g = pyopencl.array.to_device(self.queue, self.ah)
b_g = pyopencl.array.to_device(self.queue, self.bl)
Expand All @@ -113,7 +113,7 @@ def test_fast_sum2(self):
def test_sum2(self):
test_kernel = ElementwiseKernel(self.ctx,
"float *a, float *b, float *res_h, float *res_l",
"float2 tmp = sum2(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
"float2 tmp = fp_plus_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
preamble=self.doubleword)
a_g = pyopencl.array.to_device(self.queue, self.ah)
b_g = pyopencl.array.to_device(self.queue, self.bh)
Expand All @@ -127,7 +127,7 @@ def test_sum2(self):
def test_prod2(self):
test_kernel = ElementwiseKernel(self.ctx,
"float *a, float *b, float *res_h, float *res_l",
"float2 tmp = prod2(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
"float2 tmp = fp_times_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
preamble=self.doubleword)
a_g = pyopencl.array.to_device(self.queue, self.ah)
b_g = pyopencl.array.to_device(self.queue, self.bh)
Expand Down
34 changes: 17 additions & 17 deletions src/silx/resources/opencl/doubleword.cl
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@
#endif

//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y|
inline fp2 fast_sum2(fp x, fp y){
inline fp2 fast_fp_plus_fp(fp x, fp y){
X87_VOLATILE fp s = x + y;
X87_VOLATILE fp z = s - x;
fp e = y - z;
return (fp2)(s, e);
}

//Algorithm 2, p24, same as fast_sum2 without the condition on e_x and e_y
inline fp2 sum2(fp x, fp y){
//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
inline fp2 fp_plus_fp(fp x, fp y){
X87_VOLATILE fp s = x + y;
X87_VOLATILE fp xp = s - y;
X87_VOLATILE fp yp = s - xp;
Expand All @@ -54,56 +54,56 @@ inline fp2 sum2(fp x, fp y){
}

//Algorithm 3, p24: multiplication with a FMA
inline fp2 prod2(fp x, fp y){
inline fp2 fp_times_fp(fp x, fp y){
fp p = x * y;
fp e = fma(x, y, -p);
return (fp2)(p, e);
}

//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
inline fp2 dw_plus_fp(fp2 x, fp y){
fp2 s = sum2(x.s0, y);
fp2 s = fp_plus_fp(x.s0, y);
X87_VOLATILE fp v = x.s1 + s.s1;
return fast_sum2(s.s0, v);
return fast_fp_plus_fp(s.s0, v);
}

//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³
inline fp2 dw_plus_dw(fp2 x, fp2 y){
fp2 s = sum2(x.s0, y.s0);
fp2 t = sum2(x.s1, y.s1);
fp2 v = fast_sum2(s.s0, s.s1 + t.s0);
return fast_sum2(v.s0, t.s1 + v.s1);
fp2 s = fp_plus_fp(x.s0, y.s0);
fp2 t = fp_plus_fp(x.s1, y.s1);
fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0);
return fast_fp_plus_fp(v.s0, t.s1 + v.s1);
}

//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u²
inline fp2 dw_times_fp(fp2 x, fp y){
fp2 c = prod2(x.s0, y);
return fast_sum2(c.s0, fma(x.s1, y, c.s1));
fp2 c = fp_times_fp(x.s0, y);
return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1));
}

//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
inline fp2 dw_times_dw(fp2 x, fp2 y){
fp2 c = prod2(x.s0, y.s0);
fp2 c = fp_times_fp(x.s0, y.s0);
X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
return fast_sum2(c.s0, c.s1 + l);
return fast_fp_plus_fp(c.s0, c.s1 + l);
}

//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
inline fp2 dw_div_fp(fp2 x, fp y){
X87_VOLATILE fp th = x.s0 / y;
fp2 pi = prod2(th, y);
fp2 pi = fp_times_fp(th, y);
fp2 d = x - pi;
X87_VOLATILE fp delta = d.s0 + d.s1;
X87_VOLATILE fp tl = delta/y;
return fast_sum2(th, tl);
return fast_fp_plus_fp(th, tl);
}

//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
inline fp2 inv_dw(fp2 y){
X87_VOLATILE fp th = one/y.s0;
X87_VOLATILE fp rh = fma(-y.s0, th, one);
X87_VOLATILE fp rl = -y.s1 * th;
fp2 e = fast_sum2(rh, rl);
fp2 e = fast_fp_plus_fp(rh, rl);
fp2 delta = dw_times_fp(e, th);
return dw_plus_fp(delta, th);
}
Expand Down