-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathderived_atomic_functions.h
272 lines (243 loc) · 9.61 KB
/
derived_atomic_functions.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*
* Copyright 2010 by NVIDIA Corporation. All rights reserved. All
* information contained herein is proprietary and confidential to NVIDIA
* Corporation. Any use, reproduction, or disclosure without the written
* permission of NVIDIA Corporation is prohibited.
*/
//
// DERIVED_ATOMIC_FUNCTIONS.H
//
// Certain 64-bit atomic functions are not available, so this defines
// intrinsics which implement them as efficiently as possible via CAS.
//
// NOTE: *** These do NOT work for shared-memory atomics at this time! ***
//
// Sorry that it's a mess - supporting all architectures is a huge
// pile of code spaghetti.
//
// Functions added in this package are:
// +------------------------------------------+
// | Function | int64 | uint64 | fp32 | fp64 |
// +-----------+-------+--------+------+------+
// | atomicOr | | X | | |
// | atomicAnd | | X | | |
// | atomicXor | | X | | |
// | atomicMin | X | X | X | X |
// | atomicMax | X | X | X | X |
// | atomicAdd*| | | X | X |
// +-----------+-------+--------+------+------+
// *note for atomicAdd: int64/uint64 already available on sm_13
// fp32 already available on sm_20
// int64/uint64 atomic min/max already on sm_35
// uint64 atomic and/or/xor already on sm_35
//
// NOTE: Architectural limits still apply. i.e.:
// sm_10 - supports no atomics
// sm_11 - supports only 32-bit atomics, and no doubles
// sm_12 - supports 64-bit integer atomics, but not doubles
// sm_13 - supports everything
// sm_20 - supports everything
//
// TODO: Make these work with shared memory atomics by separating
// out warp contention
//
#ifndef DERIVED_ATOMIC_FUNCTIONS_H
#define DERIVED_ATOMIC_FUNCTIONS_H
// Dummy functions for unsupported architecture
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 100)
//__device__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) { return 0; }//
//__device__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) { return 0; }
//__device__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) { return 0; }
//__device__ long long atomicMin(long long *address, long long val) { return 0; }
//__device__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) { return 0; }
//__device__ long long atomicMax(long long *address, long long val) { return 0; }
//__device__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) { return 0; }
__device__ float atomicMin(float *address, float val) { return 0; }
__device__ float atomicMax(float *address, float val) { return 0; }
__device__ double atomicMin(double *address, double val) { return 0; }
__device__ double atomicMax(double *address, double val) { return 0; }
__device__ double atomicAdd(double *address, double val) { return 0; }
#else
/**** Prototypes ****/
// longlong versions of int32 functions
#if (__CUDA_ARCH__ >= 120) && (__CUDA_ARCH__ < 350)
__device__ __forceinline__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val);
__device__ __forceinline__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val);
__device__ __forceinline__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val);
__device__ __forceinline__ long long atomicMin(long long *address, long long val);
__device__ __forceinline__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val);
__device__ __forceinline__ long long atomicMax(long long *address, long long val);
__device__ __forceinline__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val);
#endif
// Floating-point versions of int32 functions
__device__ __forceinline__ float atomicMin(float *address, float val);
__device__ __forceinline__ float atomicMax(float *address, float val);
#if __CUDA_ARCH__ >= 130
__device__ __forceinline__ double atomicMin(double *address, double val);
__device__ __forceinline__ double atomicMax(double *address, double val);
// Double-precision version of float functions
__device__ __forceinline__ double atomicAdd(double *address, double val);
#endif
// arch < sm_20 needs single precision atomicAdd as well
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ <= 130
__device__ __forceinline__ float atomicAdd(float *address, float val);
#endif
/**** Implementation ****/
#if (__CUDA_ARCH__ >= 120) && (__CUDA_ARCH__ < 350) // Gives 64-bit atomic CAS
// uint64 atomicOr
__device__ __forceinline__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
{
unsigned long long old, ret = *address;
do {
old = ret;
} while((ret = atomicCAS(address, old, old | val)) != old);
return ret;
}
// uint64 atomicAnd
__device__ __forceinline__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
{
unsigned long long old, ret = *address;
do {
old = ret;
} while((ret = atomicCAS(address, old, old & val)) != old);
return ret;
}
// uint64 atomicXor
__device__ __forceinline__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
{
unsigned long long old, ret = *address;
do {
old = ret;
} while((ret = atomicCAS(address, old, old ^ val)) != old);
return ret;
}
// int64 atomicMin
__device__ __forceinline__ long long atomicMin(long long *address, long long val)
{
long long ret = *address;
while(val < ret)
{
long long old = ret;
if((ret = atomicCAS((unsigned long long *)address, (unsigned long long)old, (unsigned long long)val)) == old)
break;
}
return ret;
}
// uint64 atomicMin
__device__ __forceinline__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
{
unsigned long long ret = *address;
while(val < ret)
{
unsigned long long old = ret;
if((ret = atomicCAS(address, old, val)) == old)
break;
}
return ret;
}
// int64 atomicMax
__device__ __forceinline__ long long atomicMax(long long *address, long long val)
{
long long ret = *address;
while(val > ret)
{
long long old = ret;
if((ret = (long long)atomicCAS((unsigned long long *)address, (unsigned long long)old, (unsigned long long)val)) == old)
break;
}
return ret;
}
// uint64 atomicMax
__device__ __forceinline__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
{
unsigned long long ret = *address;
while(val > ret)
{
unsigned long long old = ret;
if((ret = atomicCAS(address, old, val)) == old)
break;
}
return ret;
}
#endif // (__CUDA_ARCH__ >= 120) && (__CUDA_ARCH__ < 350)
// For all float & double atomics:
// Must do the compare with integers, not floating point,
// since NaN is never equal to any other NaN
// float atomicMin
__device__ __forceinline__ float atomicMin(float *address, float val)
{
int ret = __float_as_int(*address);
while(val < __int_as_float(ret))
{
int old = ret;
if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
break;
}
return __int_as_float(ret);
}
// float atomicMax
__device__ __forceinline__ float atomicMax(float *address, float val)
{
int ret = __float_as_int(*address);
while(val > __int_as_float(ret))
{
int old = ret;
if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
break;
}
return __int_as_float(ret);
}
#if __CUDA_ARCH__ >= 130
// double atomicMin
__device__ __forceinline__ double atomicMin(double *address, double val)
{
unsigned long long ret = __double_as_longlong(*address);
while(val < __longlong_as_double(ret))
{
unsigned long long old = ret;
if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
break;
}
return __longlong_as_double(ret);
}
// double atomicMax
__device__ __forceinline__ double atomicMax(double *address, double val)
{
unsigned long long ret = __double_as_longlong(*address);
while(val > __longlong_as_double(ret))
{
unsigned long long old = ret;
if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
break;
}
return __longlong_as_double(ret);
}
// Double-precision floating point atomic add
__device__ __forceinline__ double atomicAdd(double *address, double val)
{
// Doing it all as longlongs cuts one __longlong_as_double from the inner loop
unsigned long long *ptr = (unsigned long long *)address;
unsigned long long old, newdbl, ret = *ptr;
do {
old = ret;
newdbl = __double_as_longlong(__longlong_as_double(old)+val);
} while((ret = atomicCAS(ptr, old, newdbl)) != old);
return __longlong_as_double(ret);
}
#endif
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 130)
// Single-precision floating point atomic add
__device__ __forceinline__ float atomicAdd(float *address, float val)
{
// Doing it all as longlongs cuts one __longlong_as_double from the inner loop
unsigned int *ptr = (unsigned int *)address;
unsigned int old, newint, ret = *ptr;
do {
old = ret;
newint = __float_as_int(__int_as_float(old)+val);
} while((ret = atomicCAS(ptr, old, newint)) != old);
return __int_as_float(ret);
}
#endif
#endif // DERIVED_ATOMIC_FUNCTIONS_H
#endif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 100)