#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include #include #include #include namespace at::native { // This reduction accumulates results as the type `acc_t`. By default, when // `scalar_t` is complex, `acc_t` is the downgraded real number type. // Otherwise, `acc_t` and `scalar_t` are the same type. template ::type, typename out_t=typename scalar_value_type::type> void norm_kernel_cuda_impl(TensorIterator& iter, double p) { if (p == static_cast(0)) { gpu_reduce_kernel(iter, NormZeroOps(), 0); } else if (p == static_cast(1)) { gpu_reduce_kernel(iter, NormOneOps(), 0); } else if (p == static_cast(2)) { gpu_reduce_kernel(iter, NormTwoOps(), 0); } else if (p == static_cast(INFINITY)) { gpu_reduce_kernel(iter, AbsMaxOps(), 0); } else if (p == static_cast(-INFINITY)) { gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::infinity()); } else { gpu_reduce_kernel(iter, NormOps{acc_t(p)}, 0); } } void norm_launch_kernel(TensorIterator& iter, double ord) { if (iter.dtype(0) == kHalf) { return norm_kernel_cuda_impl(iter, ord); } else if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, ord); } else if(iter.dtype(0) == kBFloat16) { return norm_kernel_cuda_impl(iter, ord); } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, ord); } AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_cuda", [&] { norm_kernel_cuda_impl(iter, ord); }); } } // namespace at::native