#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include #include #include #include #include #include #include #include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. namespace at::native { constexpr char mul_name[] = "mul_kernel"; void mul_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (common_dtype == kComplexHalf) { using scalar_t = c10::complex; #if AT_USE_JITERATOR() static const auto mul_string = jiterator_stringify( template T mul_kernel(T a, T b) { return a * b; }); opmath_jitted_gpu_kernel_with_scalars( iter, mul_string); #else using opmath_t = at::opmath_type; opmath_symmetric_gpu_kernel_with_scalars( iter, binary_internal::MulFunctor()); #endif } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { using opmath_t = at::opmath_type; opmath_symmetric_gpu_kernel_with_scalars( iter, binary_internal::MulFunctor()); }); } } REGISTER_DISPATCH(mul_stub, &mul_kernel_cuda) } // namespace at::native