#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #ifndef AT_PER_OPERATOR_HEADERS #include #else #include #endif #include #include #include #include #include #include #include namespace at::native { void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_indices, Tensor &sorted_indices, int64_t num_indices) { sorted_indices.copy_(linearIndex); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::ThrustAllocator allocator; auto policy = thrust::cuda::par(allocator).on(stream); using device_ptr = thrust::device_ptr; // Fill sortedOrigIndices with sequential indices const auto count_iter = thrust::counting_iterator(0); auto orig_data = device_ptr(orig_indices.mutable_data_ptr()); thrust::copy(policy, count_iter, count_iter + num_indices, orig_data); // Sort the inputs into sorted with the corresponding indices; we // don't need a stable or multidimensional sort, so just use Thrust // directly // Sort; a stable sort is not required // NB - not passing comparator causes thrust to use radix sort, and it hurts perf A LOT, at least for medium (few K) sized indices auto sorted_data = device_ptr(sorted_indices.mutable_data_ptr()); thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp()); } #if !CUB_SUPPORTS_SCAN_BY_KEY() template void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::ThrustAllocator allocator; auto policy = thrust::cuda::par(allocator).on(stream); auto num_indices = count.numel(); // Compute an increasing sequence per unique item in sortedIndices: // sorted: 2 5 5 5 7 7 8 9 9 // count: 1 1 2 3 1 2 1 1 2 auto sorted_data = thrust::device_ptr(sorted_indices.const_data_ptr()); auto count_data = thrust::device_ptr(count.mutable_data_ptr()); thrust::inclusive_scan_by_key( policy, sorted_data, sorted_data + num_indices, thrust::make_constant_iterator(1), count_data ); // Take the maximum of each count per unique key in reverse: // sorted: 2 5 5 5 7 7 8 9 9 // count: 1 3 3 3 2 2 1 2 2 thrust::inclusive_scan_by_key( policy, thrust::make_reverse_iterator(sorted_data + num_indices), thrust::make_reverse_iterator(sorted_data), thrust::make_reverse_iterator(count_data + num_indices), thrust::make_reverse_iterator(count_data + num_indices), thrust::equal_to(), thrust::maximum() ); } template void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); template void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); #endif template int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) { auto stream = at::cuda::getCurrentCUDAStream(); at::cuda::ThrustAllocator allocator; auto policy = thrust::cuda::par(allocator).on(stream); const ptrdiff_t numel = sorted_indices.numel(); auto sorted_indices_dev = thrust::device_ptr(sorted_indices.const_data_ptr()); auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto dummy_dev = thrust::device_ptr(dummy.mutable_data_ptr()); auto ends = thrust::unique_by_key_copy( policy, sorted_indices_dev, sorted_indices_dev + numel, thrust::make_counting_iterator(0), dummy_dev, thrust::device_ptr(segment_offsets.mutable_data_ptr())); return thrust::get<0>(ends) - dummy_dev; } template int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); template int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); } // namespace at::native