#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include #include #include #include namespace at::native { template void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { // reducing unrolling factor to 2 for welford kernel // This is necessary to lower register usage that leads to register spills. using accscalar_t = at::acc_type; using ops_t = WelfordOps>; ops_t ops(static_cast(correction), take_sqrt); gpu_reduce_kernel(iter, ops, typename ops_t::acc_t{}); } static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool take_sqrt) { const auto input_dtype = iter.input_dtype(); if (input_dtype == kHalf && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel std_var_kernel_impl(iter, correction, take_sqrt); } else if (input_dtype == kBFloat16 && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel std_var_kernel_impl(iter, correction, take_sqrt); } else { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "std_cuda", [&]() { std_var_kernel_impl(iter, correction, take_sqrt); }); } } template void mean_kernel_impl(TensorIterator& iter) { // returns acc_t for all non-complex dtypes and returns T for c10::complex using factor_t = typename c10::scalar_value_type::type; factor_t factor = static_cast(iter.num_output_elements()) / iter.numel(); gpu_reduce_kernel(iter, MeanOps {factor}); } static void mean_kernel_cuda(TensorIterator& iter) { if (iter.dtype() == kHalf) { mean_kernel_impl(iter); } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel mean_kernel_impl(iter); } else if(iter.dtype() == kBFloat16) { mean_kernel_impl(iter); } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel mean_kernel_impl(iter); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "mean_cuda", [&]() { mean_kernel_impl(iter); }); } } REGISTER_DISPATCH(std_var_stub, &std_var_kernel_cuda) REGISTER_DISPATCH(mean_stub, &mean_kernel_cuda) } // namespace at::native