-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize the PyTorch CUDA implementation for Criss Cross Attention #1088
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,24 +14,22 @@ __global__ void ca_forward_kernel(const T *t, const T *f, T *weight, int num, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int z = blockIdx.z; | ||
|
||
if (x < width && y < height && z < height + width - 1) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int plane = 0; plane < chn; ++plane) { | ||
T _t = t[(batch * chn + plane) * sp + y * width + x]; | ||
|
||
if (z < width) { | ||
int i = z; | ||
T _f = f[(batch * chn + plane) * sp + y * width + i]; | ||
weight[(batch * len + i) * sp + y * width + x] += _t * _f; | ||
} else { | ||
int i = z - width; | ||
int j = i < y ? i : i + 1; | ||
|
||
T _f = f[(batch * chn + plane) * sp + j * width + x]; | ||
weight[(batch * len + width + i) * sp + y * width + x] += _t * _f; | ||
} | ||
int z = blockIdx.z % len; | ||
int batch = blockIdx.z / len; | ||
|
||
if (x < width && y < height) { | ||
for (int plane = 0; plane < chn; ++plane) { | ||
T _t = t[(batch * chn + plane) * sp + y*width + x]; | ||
|
||
if (z < width) { | ||
int i = z; | ||
T _f = f[(batch * chn + plane) * sp + y*width + i]; | ||
weight[(batch * len + i) * sp + y*width + x] += _t*_f; | ||
} else { | ||
int i = z - width; | ||
int j = i<y ? i : i+1; | ||
T _f = f[(batch * chn + plane) * sp + j*width + x]; | ||
weight[(batch * len + width + i) * sp + y*width + x] += _t*_f; | ||
} | ||
} | ||
} | ||
|
@@ -44,23 +42,22 @@ __global__ void ca_backward_kernel_t(const T *dw, const T *t, const T *f, T *dt, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int plane = blockIdx.z; | ||
|
||
if (x < width && y < height && plane < chn) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int i = 0; i < width; ++i) { | ||
T _dw = dw[(batch * len + i) * sp + y * width + x]; | ||
T _f = f[(batch * chn + plane) * sp + y * width + i]; | ||
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int j = i < y ? i : i - 1; | ||
int plane = blockIdx.z % chn; | ||
int batch = blockIdx.z / chn; | ||
|
||
if (x < width && y < height ) { | ||
for (int i = 0; i < width; ++i) { | ||
float _dw = dw[(batch * len + i) * sp + y*width + x]; | ||
float _f = f[(batch * chn + plane) * sp + y*width + i]; | ||
dt[(batch * chn + plane) * sp + y*width + x] += _dw * _f; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int j = i<y ? i : i-1; | ||
|
||
T _dw = dw[(batch * len + width + j) * sp + y * width + x]; | ||
T _f = f[(batch * chn + plane) * sp + i * width + x]; | ||
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f; | ||
} | ||
float _dw = dw[(batch * len + width + j) * sp + y*width + x]; | ||
float _f = f[(batch * chn + plane) * sp + i*width + x]; | ||
Comment on lines
+50
to
+59
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reason for using `float` instead of `T` here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I'll replace `float` with `T`. |
||
dt[(batch * chn + plane) * sp + y*width + x] += _dw * _f; | ||
} | ||
} | ||
} | ||
|
@@ -72,23 +69,22 @@ __global__ void ca_backward_kernel_f(const T *dw, const T *t, const T *f, T *df, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int plane = blockIdx.z; | ||
|
||
if (x < width && y < height && plane < chn) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int i = 0; i < width; ++i) { | ||
T _dw = dw[(batch * len + x) * sp + y * width + i]; | ||
T _t = t[(batch * chn + plane) * sp + y * width + i]; | ||
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int j = i > y ? y : y - 1; | ||
int plane = blockIdx.z % chn; | ||
int batch = blockIdx.z / chn; | ||
|
||
if (x < width && y < height) { | ||
for (int i = 0; i < width; ++i) { | ||
T _dw = dw[(batch * len + x) * sp + y*width + i]; | ||
T _t = t[(batch * chn + plane) * sp + y*width + i]; | ||
df[(batch * chn + plane) * sp + y*width + x] += _dw * _t; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int j = i>y ? y : y-1; | ||
|
||
T _dw = dw[(batch * len + width + j) * sp + i * width + x]; | ||
T _t = t[(batch * chn + plane) * sp + i * width + x]; | ||
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t; | ||
} | ||
T _dw = dw[(batch * len + width + j) * sp + i*width + x]; | ||
T _t = t[(batch * chn + plane) * sp + i*width + x]; | ||
df[(batch * chn + plane) * sp + y*width + x] += _dw * _t; | ||
} | ||
} | ||
} | ||
|
@@ -100,24 +96,23 @@ __global__ void ca_map_forward_kernel(const T *weight, const T *g, T *out, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int plane = blockIdx.z; | ||
|
||
if (x < width && y < height && plane < chn) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int i = 0; i < width; ++i) { | ||
T _g = g[(batch * chn + plane) * sp + y * width + i]; | ||
T _w = weight[(batch * len + i) * sp + y * width + x]; | ||
out[(batch * chn + plane) * sp + y * width + x] += _g * _w; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int plane = blockIdx.z % chn; | ||
int batch = blockIdx.z / chn; | ||
T res = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Has this `res` variable been used anywhere? |
||
if (x < width && y < height) { | ||
for (int i = 0; i < width; ++i) { | ||
T _g = g[(batch * chn + plane) * sp + y * width + i]; | ||
T _w = weight[(batch * len + i) * sp + y * width + x]; | ||
out[(batch * chn + plane) * sp + y * width + x] += _g * _w; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
|
||
int j = i < y ? i : i - 1; | ||
int j = i < y ? i : i - 1; | ||
|
||
T _g = g[(batch * chn + plane) * sp + i * width + x]; | ||
T _w = weight[(batch * len + width + j) * sp + y * width + x]; | ||
out[(batch * chn + plane) * sp + y * width + x] += _g * _w; | ||
} | ||
T _g = g[(batch * chn + plane) * sp + i * width + x]; | ||
T _w = weight[(batch * len + width + j) * sp + y * width + x]; | ||
out[(batch * chn + plane) * sp + y * width + x] += _g * _w; | ||
} | ||
} | ||
} | ||
|
@@ -130,25 +125,23 @@ __global__ void ca_map_backward_kernel_w(const T *dout, const T *weight, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int z = blockIdx.z; | ||
|
||
if (x < width && y < height && z < height + width - 1) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int plane = 0; plane < chn; ++plane) { | ||
T _dout = dout[(batch * chn + plane) * sp + y * width + x]; | ||
|
||
if (z < width) { | ||
int i = z; | ||
T _g = g[(batch * chn + plane) * sp + y * width + i]; | ||
dw[(batch * len + i) * sp + y * width + x] += _dout * _g; | ||
} else { | ||
int i = z - width; | ||
int j = i < y ? i : i + 1; | ||
|
||
T _g = g[(batch * chn + plane) * sp + j * width + x]; | ||
dw[(batch * len + width + i) * sp + y * width + x] += _dout * _g; | ||
} | ||
} | ||
int z = blockIdx.z % len; | ||
int batch = blockIdx.z / len; | ||
|
||
if (x < width && y < height) { | ||
int widx = (batch * len + z) * sp + y*width + x; | ||
int dout_idx = batch * chn * sp + y * width + x; | ||
int gidx = batch * chn * sp; | ||
if (z < width) { | ||
gidx += y * width + z; | ||
} else { | ||
int j = z - width; | ||
j = j < y ? j : j + 1; | ||
gidx += j * width + x; | ||
} | ||
for(int plane = 0; plane < chn; plane ++){ | ||
dw[widx] += dout[dout_idx + plane * sp] * g[gidx+plane*sp]; | ||
Comment on lines
+133
to
+144
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This part looks great! Can we do the very same in the other kernels (e.g. `ca_forward_kernel`)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, thanks for the good advice! |
||
} | ||
} | ||
} | ||
|
@@ -161,25 +154,20 @@ __global__ void ca_map_backward_kernel_g(const T *dout, const T *weight, | |
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
int sp = height * width; | ||
int len = height + width - 1; | ||
int plane = blockIdx.z; | ||
|
||
if (x < width && y < height && plane < chn) { | ||
for (int batch = 0; batch < num; ++batch) { | ||
for (int i = 0; i < width; ++i) { | ||
T _dout = dout[(batch * chn + plane) * sp + y * width + i]; | ||
T _w = weight[(batch * len + x) * sp + y * width + i]; | ||
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w; | ||
} | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
int j = i > y ? y : y - 1; | ||
int plane = blockIdx.z % chn; | ||
int batch = blockIdx.z / chn; | ||
int index = (batch * chn + plane) * sp + y*width + x; | ||
|
||
T _dout = dout[(batch * chn + plane) * sp + i * width + x]; | ||
T _w = weight[(batch * len + width + j) * sp + i * width + x]; | ||
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w; | ||
} | ||
if (x < width && y < height) { | ||
for (int i = 0; i < width; ++i) { | ||
dg[index] += dout[(batch * chn + plane) * sp + y*width + i] * weight[(batch * len + x) * sp + y*width + i]; | ||
} | ||
int j = 0; | ||
for (int i = 0; i < height; ++i) { | ||
if (i == y) continue; | ||
j = i > y ? y : y - 1; | ||
Comment on lines
+165
to
+168
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
dg[index] += dout[(batch * chn + plane) * sp + i * width + x] * weight[(batch * len + width + j) * sp + i * width + x]; | ||
} | ||
} | ||
} | ||
|
||
#endif // CC_ATTENTION_CUDA_KERNEL_CUH |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add spaces between variables (y, width, ...) and operators (*, +, ...).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I'll fix it.