#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include #include #include #include namespace at::native { template struct sum_functor { void operator()(TensorIterator& iter) { #ifdef USE_ROCM // Half and BFloat16 can be packed in groups of up to 8 elements and // can use *_DWORDX4 instructions to achieve that. const bool is_16_bits = ( (std::is_same::value) || (std::is_same::value) ); if (is_16_bits) { gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a + b; })); return; } #endif gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a + b; })); } }; // jiterated specialization for `complex` constexpr char sum_name[] = "sum"; template <> struct sum_functor> { // jiterator reduction fails on windows // Ref: https://github.com/pytorch/pytorch/issues/77305 #if AT_USE_JITERATOR() && !defined(_MSC_VER) void operator()(TensorIterator& iter) { using scalar_t = c10::complex; std::string func = jiterator_stringify( arg_t combine(arg_t a, arg_t b) { return a + b; } ); jitted_gpu_reduce_kernel( iter, func, 0.); } #else void operator()(TensorIterator& iter) { using scalar_t = c10::complex; using acc_t = at::opmath_type; gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a + b; }), acc_t{0.}); } #endif }; template struct nansum_functor { void operator()(TensorIterator& iter) { gpu_reduce_kernel( iter, NanSumOps{}); } }; constexpr char nansum_name[] = "nansum"; template struct nansum_functor_complex { #if AT_USE_JITERATOR() void operator()(TensorIterator& iter) { std::string func = jiterator_stringify( arg_t combine(arg_t a, scalar_t b) { return a + (std::isnan(b) ? arg_t{0.} : arg_t{b}); } ); jitted_gpu_reduce_kernel( iter, func, 0.); } #else void operator()(TensorIterator& iter) { using acc_t = at::opmath_type; gpu_reduce_kernel( iter, NanSumOps{}); } #endif }; constexpr char prod_name[] = "prod"; template struct prod_functor { // jiterator reduction fails on windows // Ref: https://github.com/pytorch/pytorch/issues/77305 #if AT_USE_JITERATOR() && !defined(_MSC_VER) void operator()(TensorIterator& iter) { std::string func = jiterator_stringify( arg_t combine(arg_t a, arg_t b) { return a * b; } ); jitted_gpu_reduce_kernel( iter, func, 1.); } #else void operator()(TensorIterator& iter) { gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; }), 1.); } #endif }; // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] template <> struct prod_functor { void operator()(TensorIterator& iter) { gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(bool a, bool b) -> bool { return a && b; }), 1); } }; // jiterated specialization for `complex` template <> struct prod_functor> { // jiterator reduction fails on windows // Ref: https://github.com/pytorch/pytorch/issues/77305 #if AT_USE_JITERATOR() && !defined(_MSC_VER) void operator()(TensorIterator& iter) { using scalar_t = c10::complex; std::string func = jiterator_stringify(arg_t combine(arg_t a, arg_t b) { return a * b; }); jitted_gpu_reduce_kernel(iter, func, 1.); } #else void operator()(TensorIterator& iter) { using scalar_t = c10::complex; using acc_t = at::opmath_type; gpu_reduce_kernel( iter, func_wrapper( [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; }), acc_t{1.}); } #endif }; // The function `reduce_dispatch` below dispatches to the kernel based // on the type of `iter`. It takes care of the common logic // for handling Half-Precision floating types. // Otherwise the functor `op` is called to dispatch to the kernel // of relevant type. // // Note: Functor `op` should take care of all the types to be supported // except for `at::Half` and `at::BFloat16`. template < template < typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t> typename OpFunctor, typename GeneralDispatcher> static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { if (iter.dtype() == kHalf) { return OpFunctor{}(iter); } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return OpFunctor{}(iter); } else if (iter.dtype() == kBFloat16) { return OpFunctor{}(iter); } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return OpFunctor{}(iter); } op(iter); } static void sum_kernel_cuda(TensorIterator& iter){ auto general_dispatcher = [](TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( kBool, kComplexHalf, iter.dtype(), "sum_cuda", [&]() { sum_functor{}(iter); }); }; reduce_dispatch(iter, general_dispatcher); } static void nansum_kernel_cuda(TensorIterator& iter) { auto general_dispatcher = [](TensorIterator& iter) { auto dtype = iter.dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "nansum_cuda", [&]() { nansum_functor_complex{}(iter); }); } else { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_cuda", [&]() { nansum_functor{}(iter); }); } }; reduce_dispatch(iter, general_dispatcher); } static void prod_kernel_cuda(TensorIterator& iter) { auto general_dispatcher = [](TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kComplexHalf, kBool, iter.dtype(), "prod_cuda", [&]() { prod_functor{}(iter); }); }; reduce_dispatch(iter, general_dispatcher); } REGISTER_DISPATCH(sum_stub, &sum_kernel_cuda) REGISTER_DISPATCH(nansum_stub, &nansum_kernel_cuda) REGISTER_DISPATCH(prod_stub, &prod_kernel_cuda) } // namespace at::native