// Copyright 2020 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #include #include #include "xnnpack/unpool.h" void xnn_x32_unpool_ukernel__sse2( size_t kernel_elements, size_t channels, uint32_t fill, const uint32_t* input, const uint32_t* index, uint32_t** output) { // Pre-initialize outputs with constant. const __m128i vfill = _mm_set1_epi32((int) fill); uint32_t** os = output; do { uint32_t* o = *os++; size_t c = channels; for (; c >= 4; c -= 4) { _mm_storeu_si128((__m128i*) o, vfill); o += 4; } if (c != 0) { if (c & 2) { _mm_storel_epi64((__m128i*) o, vfill); o += 2; } if (c & 1) { *((int*) o) = _mm_cvtsi128_si32(vfill); } } } while (--kernel_elements != 0); // Copy indexed elements to output. size_t offset = 0; do { const uint32_t i = *index++; *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++; offset += sizeof(uint32_t); } while (--channels != 0); }