(mip-splatting) root@autodl-container-2a5049addd-1b9dbf2b:~/mip-splatting# OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=0 python train.py -s 360_v2/bonsai -m output --eval -r 8 --kernel_size 0.1
Optimizing output
Output folder: output [09/04 12:43:50]
Tensorboard not available: not logging progress [09/04 12:43:50]
Reading camera 292/292 [09/04 12:43:51]
Loading Training Cameras [09/04 12:43:51]
Loading Test Cameras [09/04 12:44:19]
Number of points at initialisation : 206613 [09/04 12:44:23]
Computing 3D filter [09/04 12:44:23]
Training progress: 0%| | 0/30000 [00:00<?, ?it/s]Traceback (most recent call last):
File "train.py", line 268, in
training(lp.extract(args), op.extract(args), pp.extract(args), args.test_iterations, args.save_iterations, args.checkpoint_iterations, args.start_checkpoint, args.debug_from)
File "train.py", line 125, in training
render_pkg = render(viewpoint_cam, gaussians, pipe, background, kernel_size=dataset.kernel_size, subpixel_offset=subpixel_offset)
File "/root/mip-splatting/gaussian_renderer/init.py", line 60, in render
opacity = pc.get_opacity_with_3D_filter
File "/root/mip-splatting/scene/gaussian_model.py", line 132, in get_opacity_with_3D_filter
det1 = scales_square.prod(dim=1)
RuntimeError:
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);
namespace std {
using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;
using ::abs;
using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fmod;
using ::fmodf;
using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;
using ::log;
using ::logf;
using ::log10;
using ::log10f;
using ::modf;
using ::modff;
using ::pow;
using ::powf;
using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;
using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;
using ::tanh;
using ::tanhf;
using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;
using ::copysign;
using ::copysignf;
using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;
} // namespace std
// NB: Order matters for this macro; it is relied upon in
// promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
_(uint8_t, Byte) /* 0 */ \
_(int8_t, Char) /* 1 */ \
_(int16_t, Short) /* 2 */ \
_(int, Int) /* 3 */ \
_(int64_t, Long) /* 4 */ \
_(at::Half, Half) /* 5 */ \
_(float, Float) /* 6 */ \
_(double, Double) /* 7 */ \
_(std::complex<at::Half>, ComplexHalf) /* 8 */ \
_(std::complex<float>, ComplexFloat) /* 9 */ \
_(std::complex<double>, ComplexDouble) /* 10 */ \
_(bool, Bool) /* 11 */ \
_(void, QInt8) /* 12 */ \
_(void, QUInt8) /* 13 */ \
_(void, QInt32) /* 14 */ \
_(at::BFloat16, BFloat16) /* 15 */ \
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int16_t, Short) \
_(int, Int) \
_(int64_t, Long) \
_(at::Half, Half) \
_(float, Float) \
_(double, Double) \
_(std::complex<at::Half>, ComplexHalf) \
_(std::complex<float>, ComplexFloat) \
_(std::complex<double>, ComplexDouble) \
_(bool, Bool) \
_(at::BFloat16, BFloat16)
enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};
template <typename T, int size>
struct Array {
T data[size];
__device__ T operator[](int i) const {
return data[i];
}
__device__ T& operator[](int i) {
return data[i];
}
Array() = default;
Array(const Array&) = default;
Array& operator=(const Array&) = default;
__device__ Array(T x) {
for (int i = 0; i < size; i++) {
data[i] = x;
}
}
};
template <typename T>
struct DivMod {
T div;
T mod;
__device__ DivMod(T _div, T _mod) {
div = _div;
mod = _mod;
}
};
//
struct IntDivider {
IntDivider() = default;
__device__ inline unsigned int div(unsigned int n) const {
unsigned int t = __umulhi(n, m1);
return (t + n) >> shift;
}
__device__ inline unsigned int mod(unsigned int n) const {
return n - div(n) * divisor;
}
__device__ inline DivMod<unsigned int> divmod(unsigned int n) const {
unsigned int q = div(n);
return DivMod<unsigned int>(q, n - q * divisor);
}
unsigned int divisor; // d above.
unsigned int m1; // Magic number: m' above.
unsigned int shift; // Shift amounts.
};
template <int NARGS>
struct TrivialOffsetCalculator {
// The offset for each argument. Wrapper around fixed-size array.
// The offsets are in # of elements, not in bytes.
Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
Array<unsigned int, NARGS> offsets;
#pragma unroll
for (int arg = 0; arg < NARGS; arg++) {
offsets[arg] = linear_idx;
}
return offsets;
}
};
template <int NARGS>
struct OffsetCalculator {
OffsetCalculator() = default;
__device__ __forceinline__ Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
Array<unsigned int, NARGS> offsets;
#pragma unroll
for (int arg = 0; arg < NARGS; ++arg) {
offsets[arg] = 0;
}
}
};
#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__
template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
return __shfl_down_sync(mask, value, delta, width);
}
#if 0
template <typename T>
__device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
return std::complex<T>(
__shfl_down_sync(mask, value.real(), delta, width),
__shfl_down_sync(mask, value.imag(), delta, width));
}
#endif
// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};
C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
// get GCD of num and denom using Euclid's algorithm.
// Can replace this with std::gcd if we ever support c++17.
size_t a = denominator;
size_t b = numerator;
while (b != 0) {
a %= b;
// swap(a,b)
size_t tmp = a;
a = b;
b = tmp;
}
// a is now the GCD
numerator /= a;
denominator /= a;
}
struct ReduceConfig {
//has to match host-side ReduceConfig in the eager code
static constexpr int BLOCK_X = 0;
static constexpr int BLOCK_Y = 1;
static constexpr int CTA = 2;
static constexpr int input_vec_size = 4;
int element_size_bytes;
int num_inputs;
int num_outputs;
int step_input = 1;
int step_output = 1;
int ctas_per_output = 1;
int input_mult[3] = {0, 0, 0};
int output_mult[2] = {0, 0};
int block_width;
int block_height;
int num_threads;
bool vectorize_input = false;
int output_vec_size = 1;
C10_HOST_DEVICE bool should_block_x_reduce() const {
return input_mult[BLOCK_X] != 0;
}
C10_HOST_DEVICE bool should_block_y_reduce() const {
return input_mult[BLOCK_Y] != 0;
}
C10_HOST_DEVICE bool should_global_reduce() const {
return input_mult[CTA] != 0;
}
C10_DEVICE bool should_store(int output_idx) const {
return output_idx < num_outputs &&
(!should_block_x_reduce() || threadIdx.x == 0) &&
(!should_block_y_reduce() || threadIdx.y == 0);
}
C10_DEVICE bool should_reduce_tail() const {
return (!should_block_y_reduce() || threadIdx.y == 0) &&
(!should_global_reduce() || blockIdx.y == 0);
}
C10_HOST_DEVICE int input_idx() const {
int lane = threadIdx.x;
int warp = threadIdx.y;
int cta2 = blockIdx.y;
return (lane * input_mult[BLOCK_X] +
warp * input_mult[BLOCK_Y] +
cta2 * input_mult[CTA]);
}
template <int output_vec_size>
C10_HOST_DEVICE int output_idx() const {
int lane = threadIdx.x;
int warp = threadIdx.y;
int cta1 = blockIdx.x;
return (lane * output_mult[BLOCK_X] +
warp * output_mult[BLOCK_Y] +
cta1 * step_output) * output_vec_size;
}
C10_DEVICE int shared_memory_offset(int offset) const {
return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
}
C10_DEVICE int staging_memory_offset(int cta2) const {
int offset = cta2 + blockIdx.x * gridDim.y;
if (!should_block_x_reduce()) {
offset = threadIdx.x + offset * blockDim.x;
}
return offset;
}
};
//TODO this will need to be different for more generic reduction functions
namespace reducer {
using scalar_t = float;
using arg_t = float;
using out_scalar_t = float;
inline __device__ arg_t combine(arg_t a, arg_t b) { return a * b; }
inline __device__ out_scalar_t project(arg_t arg) {
return (out_scalar_t) arg;
}
inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
return WARP_SHFL_DOWN(arg, offset);
}
inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
return acc;
}
// wrap a normal reduction that ignores the index
inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
return combine(acc, val);
}
}
struct ReduceJitOp {
using scalar_t = float;
using arg_t = float;
using out_scalar_t = float;
using InputCalculator = OffsetCalculator<1>;
using OutputCalculator = OffsetCalculator<2>;
// static constexpr bool can_accumulate_in_output =
// std::is_convertible<arg_t, out_scalar_t>::value
// && std::is_convertible<out_scalar_t, arg_t>::value;
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
arg_t ident;
ReduceConfig config;
InputCalculator input_calc;
OutputCalculator output_calc;
const void* src;
const char* dst[2]; //it accepts at most two destinations
// acc_buf used for accumulation among sub Tensor Iterator when accumulation on
// output is not permissible
void* acc_buf;
// cta_buf used for accumulation between blocks during global reduction
void* cta_buf;
int* semaphores;
int64_t base_idx;
bool accumulate;
bool final_output;
int noutputs;
C10_DEVICE void run() const {
extern __shared__ char shared_memory[];
uint32_t output_idx = config.output_idx<1>();
uint32_t input_idx = config.input_idx();
auto base_offsets1 = output_calc.get(output_idx)[1];
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
if (config.vectorize_input) {
assert(output_vec_size == 1);
// reduce at the header of input_slice where memory is not aligned,
// so that thread_reduce will have an aligned memory to work on.
return {input_vectorized_thread_reduce_impl(data)};
} else {
uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
if (is_contiguous) {
return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
} else if (input_calc.dims == 1) {
return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
} else {
return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
}
}
}
C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
uint32_t end = config.num_inputs;
}
template <int output_vec_size, typename offset_calc_t>
C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
uint32_t idx = config.input_idx();
const uint32_t end = config.num_inputs;
const uint32_t stride = config.step_input;
const int vt0=4;
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
using args_vec_t = Array<arg_t, output_vec_size>;
int dim_x = blockDim.x;
args_vec_t* shared = (args_vec_t*)shared_memory;
if (dim_x > warpSize) {
int address_base = threadIdx.x + threadIdx.y*blockDim.x;
shared[address_base] = value;
for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
__syncthreads();
if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
args_vec_t other = shared[address_base + offset];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[address_base] = value;
}
}
dim_x = warpSize;
}
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
using args_vec_t = Array<arg_t, output_vec_size>;
args_vec_t* shared = (args_vec_t*)shared_memory;
shared[config.shared_memory_offset(0)] = value;
for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
__syncthreads();
if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
args_vec_t other = shared[config.shared_memory_offset(offset)];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[config.shared_memory_offset(0)] = value;
}
}
return value;
}
C10_DEVICE bool mark_block_finished() const {
__shared__ bool is_last_block_done_shared;
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
Array<out_scalar_t*, output_vec_size> out,
Array<arg_t, output_vec_size> value
) const {
Array<arg_t, output_vec_size> ret;
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
ret[i] = reducer::combine(*(out[i]), value[i]);
}
return ret;
}
C10_DEVICE out_scalar_t get_accumulated_output(
out_scalar_t* out, arg_t value
) const {
assert(!final_output);
return (out_scalar_t)value;
}
template <class T>
C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
assert(noutputs == 1);
auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
*res = x;
}
//TODO - multi-output reduction - we won't be able to use thrust::pair
//just explicitly specify typed output reads/writes
//Currently implemented for max of two outputs
// template<class T1, class T2>
// C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
// if (noutputs >= 1) {
// auto res0 = (T1*)((char*)dst[0] + base_offset);
// res0 = x.first;
// }
// if (noutputs >= 2) {
// // base offset is computed assuming element size being sizeof(T1), so we need to make a
// // correction to obtain the correct base offset
// auto res1 = (T2) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
// *res1 = x.second;
// }
// }
template <int output_vec_size>
C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
assert(final_output);
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
set_results(reducer::project(value[i]), base_offset[i]);
}
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size>* acc, char* shared_memory) const {
using arg_vec_t = Array<arg_t, output_vec_size>;
using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
using offset_vec_t = Array<uint32_t, output_vec_size>;
}
};
extern "C"
__launch_bounds__(512, 4)
__global__ void reduction_prod_kernel(ReduceJitOp r){
r.run();
}
nvrtc: error: invalid value for --gpu-architecture (-arch)
Training progress: 0%| | 0/30000 [00:00<?, ?it/s]
(mip-splatting) root@autodl-container-2a5049addd-1b9dbf2b:~/mip-splatting#
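The key line is the last one of the dump: nvrtc: error: invalid value for --gpu-architecture (-arch). It is raised while PyTorch JIT-compiles the prod reduction used by get_opacity_with_3D_filter, and it typically means the CUDA toolkit bundled with the installed PyTorch wheel does not recognise the compute capability of the GPU in this container. A minimal diagnostic sketch in plain PyTorch (not part of mip-splatting; the tensor shape below is only illustrative, taken from the point count in the log):

import torch

# Show which CUDA toolkit this PyTorch build was compiled against
# and which architecture the visible GPU reports.
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)
major, minor = torch.cuda.get_device_capability(0)
print(f"GPU compute capability: sm_{major}{minor}")

# Reproduce the failing call in isolation: on builds that JIT-compile
# reductions through NVRTC, this .prod(dim=1) should hit the same
# reduction_prod_kernel compilation path as gaussian_model.py line 132.
scales_square = torch.rand(206613, 3, device="cuda").square()
det1 = scales_square.prod(dim=1)
print(det1.shape)

If the reported sm_XY is newer than what the build's CUDA toolkit supports (a common case is a recent GPU paired with an older cu11x wheel), installing a PyTorch build whose CUDA version covers that architecture usually resolves the error.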