// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
$assert OUT_PTRS in ["SWITCH", "MOV"]
$TILE_SIZE = int(256/SIZE)
$NUM_ITERS = TILE_SIZE.bit_length() - 1

$XINT = {8: "uint8_t", 16: "uint16_t"}[SIZE]
$_MM256_UNPACKLO_PX = {8: "_mm256_unpacklo_epi8", 16: "_mm256_unpacklo_epi16"}[SIZE]
$_MM256_UNPACKHI_PX = {8: "_mm256_unpackhi_epi8", 16: "_mm256_unpackhi_epi16"}[SIZE]

#include <immintrin.h>

#include <assert.h>

#include "xnnpack/common.h"
#include "xnnpack/intrinsics-polyfill.h"
#include "xnnpack/math.h"
#include "xnnpack/transpose.h"
#include "xnnpack/unaligned.h"

void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_reuse_${OUT_PTRS.lower()}_avx2(
    const uint${SIZE}_t* input,
    uint${SIZE}_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(block_width == 1 || output_stride >= block_height * sizeof(${XINT}));
  assert(block_height == 1 || input_stride >= block_width * sizeof(${XINT}));

  $if SIZE == 64:
    static const int64_t mask_table[7] = {-1, -1, -1, -1, 0, 0, 0};
  $else:
    static const int32_t mask_table[15] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};

  const size_t tile_height = ${TILE_SIZE};
  const size_t tile_width = ${TILE_SIZE};
  const size_t tile_hbytes = tile_height * sizeof(${XINT});
  const size_t tile_wbytes = tile_width * sizeof(${XINT});
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  $if OUT_PTRS == "SWITCH":
    ${XINT}* o = (${XINT}*) output;
  $else:
    ${XINT}* o = (${XINT}*) ((uintptr_t) output - tile_hbytes);
  $if OUT_PTRS == "MOV":
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(${XINT}) - tile_hbytes;
  $else:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(${XINT});

  const ${XINT}* i0 = (const ${XINT}*) input;
  const size_t minus_output_stride = -output_stride;

  do {
    const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
    $if OUT_PTRS == "MOV":
      const size_t oN_stride = rem * output_stride;
      const size_t oN_offset = oN_stride + tile_hbytes;
    $else:
      const size_t oN_stride = rem * output_stride;

    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7 ^ (rem>>${NUM_ITERS-3})]));

    size_t bh = block_height;
    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
      $for N in range(TILE_SIZE):
        const __m256i v${NUM_ITERS}_${N} = _mm256_maskload_epi32((const int*) i0, vmask);
        i0 = (${XINT}*) ((uintptr_t) i0 + input_stride);

      $for M in range(0, NUM_ITERS-1):
        $for N in range(2):
          $for O in range(TILE_SIZE>>2):
            const __m256i v${NUM_ITERS-M-1}_${N*(TILE_SIZE>>1)+O*2} = ${_MM256_UNPACKLO_PX}(v${NUM_ITERS-M}_${O+N*(TILE_SIZE>>1)}, v${NUM_ITERS-M}_${(O+N*(TILE_SIZE>>1))+int(TILE_SIZE/4)});
            const __m256i v${NUM_ITERS-M-1}_${N*(TILE_SIZE>>1)+O*2+1} = ${_MM256_UNPACKHI_PX}(v${NUM_ITERS-M}_${O+N*(TILE_SIZE>>1)}, v${NUM_ITERS-M}_${(O+N*(TILE_SIZE>>1))+int(TILE_SIZE/4)});

      $if OUT_PTRS == "MOV":
        $for N in range(0,(TILE_SIZE>>1)):
          const __m256i v0_${(N)} = _mm256_insertf128_si256(v1_${N}, _mm256_castsi256_si128(v1_${N+(TILE_SIZE>>1)}), 1);
          const __m256i v0_${(N)+(TILE_SIZE>>1)} = _mm256_permute2f128_si256(v1_${N}, v1_${N+(TILE_SIZE>>1)}, 0x31);

      $if OUT_PTRS == "SWITCH":
        ${XINT}* oN = (${XINT}*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          default:
            XNN_UNREACHABLE;
          $for N in reversed(range(TILE_SIZE>>1, TILE_SIZE)):
            case ${N}: {
              const __m256i v0_${N} = _mm256_permute2f128_si256(v1_${N-(TILE_SIZE>>1)}, v1_${N}, 0x31);
              _mm256_storeu_si256((__m256i*) oN, v0_${N});
              oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
            }
            XNN_FALLTHROUGH
          $for N in reversed(range(2, TILE_SIZE>>1)):
            case ${N}: {
              const __m256i v0_${(N)} = _mm256_insertf128_si256(v1_${N}, _mm256_castsi256_si128(v1_${N+(TILE_SIZE>>1)}), 1);
              _mm256_storeu_si256((__m256i*) oN, v0_${N});
              oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
            }
            XNN_FALLTHROUGH
          case 1: {
            const __m256i v0_1 = _mm256_insertf128_si256(v1_1, _mm256_castsi256_si128(v1_${(TILE_SIZE>>1)+1}), 1);
            _mm256_storeu_si256((__m256i*) oN, v0_1);
          }
          XNN_FALLTHROUGH
          case 0: {
            const __m256i v0_0 = _mm256_insertf128_si256(v1_0, _mm256_castsi256_si128(v1_${(TILE_SIZE>>1)}), 1);
            _mm256_storeu_si256((__m256i*) o, v0_0);
            o = (${XINT}*) ((uintptr_t) o + tile_hbytes);
          }
        }
      $elif OUT_PTRS == "MOV":
        o = (${XINT}*) ((uintptr_t) o + oN_offset);
        _mm256_storeu_si256((__m256i*) o, v0_${TILE_SIZE-1});
        ${XINT} *oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
        $for N in reversed(range(2, TILE_SIZE, 2)):
          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
            o = oN;
          }
          _mm256_storeu_si256((__m256i*) o, v0_${N});
          oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
            o = oN;
          }
          _mm256_storeu_si256((__m256i*) o, v0_${N-1});
          oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm256_storeu_si256((__m256i*) o, v0_0);
    }
    $if OUT_PTRS == "MOV":
      o = (${XINT}*) ((uintptr_t) o + tile_hbytes);
    if (bh != 0) {
      const __m256i v${NUM_ITERS}_0 = _mm256_maskload_epi32((const int*) i0, vmask);
      $for N in range(1, TILE_SIZE - 1, 2):
        const ${XINT} *i${N} = (const ${XINT}*) ((uintptr_t) i${N-1} + input_stride);
        if XNN_UNPREDICTABLE(bh < ${N+1}) {
          i${N} = i${N-1};
        }
        const __m256i v${NUM_ITERS}_${N} = _mm256_maskload_epi32((const int*) i${N}, vmask);
        const ${XINT} *i${N+1} = (const ${XINT}*) ((uintptr_t) i${N} + input_stride);
        if XNN_UNPREDICTABLE(bh <= ${N+1}) {
          i${N+1} = i${N};
        }
        const __m256i v${NUM_ITERS}_${N+1} = _mm256_maskload_epi32((const int*) i${N+1}, vmask);
      const __m256i v${NUM_ITERS}_${TILE_SIZE-1} = _mm256_undefined_si256();

      $for M in range(0, NUM_ITERS-1):
        $for N in range(2):
          $for O in range(TILE_SIZE>>2):
            const __m256i v${NUM_ITERS-M-1}_${N*(TILE_SIZE>>1)+O*2} = ${_MM256_UNPACKLO_PX}(v${NUM_ITERS-M}_${O+N*(TILE_SIZE>>1)}, v${NUM_ITERS-M}_${(O+N*(TILE_SIZE>>1))+int(TILE_SIZE/4)});
            const __m256i v${NUM_ITERS-M-1}_${N*(TILE_SIZE>>1)+O*2+1} = ${_MM256_UNPACKHI_PX}(v${NUM_ITERS-M}_${O+N*(TILE_SIZE>>1)}, v${NUM_ITERS-M}_${(O+N*(TILE_SIZE>>1))+int(TILE_SIZE/4)});

      $for N in range(0,TILE_SIZE>>1):
        __m128i v0_${N}_lo = _mm256_castsi256_si128(v1_${N});
      $for N in range(TILE_SIZE>>1, TILE_SIZE):
        __m128i v0_${N}_lo = _mm256_extractf128_si256(v1_${N-(TILE_SIZE>>1)}, 0x1);

      if (bh & ${TILE_SIZE>>1}) {
        $if OUT_PTRS == "SWITCH":
          ${XINT}* oN = (${XINT}*) ((uintptr_t) o + oN_stride);
          switch (rem) {
            $for N in reversed(range(TILE_SIZE>>1, TILE_SIZE)):
              case ${N}:
                _mm_storeu_si128((__m128i*) oN, v0_${N}_lo);
                 v0_${N}_lo = _mm256_extractf128_si256(v1_${N}, 0x1);
                oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
                XNN_FALLTHROUGH
            $for N in reversed(range(2, TILE_SIZE>>1)):
              case ${N}:
                _mm_storeu_si128((__m128i*) oN, v0_${N}_lo);
                 v0_${N}_lo = _mm256_castsi256_si128(v1_${N+(TILE_SIZE>>1)});
                oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
                XNN_FALLTHROUGH
            case 1:
              _mm_storeu_si128((__m128i*) oN, v0_1_lo);
              v0_1_lo = _mm256_castsi256_si128(v1_${1+(TILE_SIZE>>1)});
              XNN_FALLTHROUGH
            case 0:
              _mm_storeu_si128((__m128i*) o, v0_0_lo);
              v0_0_lo = _mm256_castsi256_si128(v1_${TILE_SIZE>>1});
              break;
            default:
              XNN_UNREACHABLE;
          }
          o += ${TILE_SIZE>>1};
        $elif OUT_PTRS == "MOV":
          o = (${XINT}*) ((uintptr_t) o + oN_stride);
          _mm_storeu_si128((__m128i*) o, v0_${TILE_SIZE-1}_lo);
          v0_${TILE_SIZE-1}_lo = _mm256_extractf128_si256(v1_${TILE_SIZE-1}, 0x1);
          ${XINT} *oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          $for N in reversed(range(TILE_SIZE>>1, TILE_SIZE, 2)):
            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
              o = oN;
            }
            _mm_storeu_si128((__m128i*) o, v0_${N}_lo);
            v0_${N}_lo = _mm256_extractf128_si256(v1_${N}, 0x1);
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
            $if N == TILE_SIZE>>1:
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                o = oN;
              }
              _mm_storeu_si128((__m128i*) o, v0_${N-1}_lo);
              v0_${N-1}_lo = _mm256_castsi256_si128(v1_${N-1+(TILE_SIZE>>1)});
              oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
            $else:
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                o = oN;
              }
              _mm_storeu_si128((__m128i*) o, v0_${N-1}_lo);
              v0_${N-1}_lo = _mm256_extractf128_si256(v1_${N-1}, 0x1);
              oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          $for N in reversed(range(2, TILE_SIZE>>1, 2)):
            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
              o = oN;
            }
            _mm_storeu_si128((__m128i*) o, v0_${N}_lo);
            v0_${N}_lo = _mm256_castsi256_si128(v1_${N+(TILE_SIZE>>1)});
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
              o = oN;
            }
            _mm_storeu_si128((__m128i*) o, v0_${N-1}_lo);
            v0_${N-1}_lo = _mm256_castsi256_si128(v1_${N-1+(TILE_SIZE>>1)});
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width > 1) {
            o = oN;
          }
          _mm_storeu_si128((__m128i*) o, v0_0_lo);
          v0_0_lo = _mm256_castsi256_si128(v1_${TILE_SIZE>>1});
          o += ${TILE_SIZE>>1};
      }

      if (bh & ${TILE_SIZE>>2}) {
        $if OUT_PTRS == "SWITCH":
          ${XINT}* oN = (${XINT}*) ((uintptr_t) o + oN_stride);
          switch (rem) {
            $for N in reversed(range(2, TILE_SIZE)):
              case ${N}:
                _mm_storel_epi64((__m128i*) oN, v0_${N}_lo);
                v0_${N}_lo = _mm_unpackhi_epi64(v0_${N}_lo, v0_${N}_lo);
                oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
                XNN_FALLTHROUGH
            case 1:
              _mm_storel_epi64((__m128i*) oN, v0_1_lo);
              v0_1_lo = _mm_unpackhi_epi64(v0_1_lo, v0_1_lo);
              XNN_FALLTHROUGH
            case 0:
              _mm_storel_epi64((__m128i*) o, v0_0_lo);
              v0_0_lo = _mm_unpackhi_epi64(v0_0_lo, v0_0_lo);
              break;
            default:
              XNN_UNREACHABLE;
          }
          o += ${TILE_SIZE>>2};
        $elif OUT_PTRS == "MOV":
          o = (${XINT}*) ((uintptr_t) o + oN_stride);
          _mm_storel_epi64((__m128i*) o, v0_${TILE_SIZE-1}_lo);
          v0_${TILE_SIZE-1}_lo = _mm_unpackhi_epi64(v0_${TILE_SIZE-1}_lo, v0_${TILE_SIZE-1}_lo);
          ${XINT} *oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          $for N in reversed(range(2, TILE_SIZE, 2)):
            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
              o = oN;
            }
            _mm_storel_epi64((__m128i*) o, v0_${N}_lo);
            v0_${N}_lo = _mm_unpackhi_epi64(v0_${N}_lo, v0_${N}_lo);
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
              o = oN;
            }
            _mm_storel_epi64((__m128i*) o, v0_${N-1}_lo);
            v0_${N-1}_lo = _mm_unpackhi_epi64(v0_${N-1}_lo, v0_${N-1}_lo);
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width > 1) {
            o = oN;
          }
          _mm_storel_epi64((__m128i*) o, v0_0_lo);
          v0_0_lo = _mm_unpackhi_epi64(v0_0_lo, v0_0_lo);
          o += ${TILE_SIZE>>2};
      }
      if (bh & ${TILE_SIZE>>3}) {
        $if OUT_PTRS == "SWITCH":
          ${XINT}* oN = (${XINT}*) ((uintptr_t) o + oN_stride);
          switch (rem) {
            $for N in reversed(range(2, TILE_SIZE)):
              case ${N}:
                _mm_storeu_si32(oN, v0_${N}_lo);
                $if NUM_ITERS>3:
                  v0_${N}_lo = _mm_srli_epi64(v0_${N}_lo, 32);
                oN = (${XINT}*) ((uintptr_t) oN + minus_output_stride);
                XNN_FALLTHROUGH
            case 1:
              _mm_storeu_si32(oN, v0_1_lo);
              $if NUM_ITERS>3:
                v0_1_lo = _mm_srli_epi64(v0_1_lo, 32);
              XNN_FALLTHROUGH
            case 0:
              _mm_storeu_si32(o, v0_0_lo);
              $if NUM_ITERS>3:
                v0_0_lo = _mm_srli_epi64(v0_0_lo, 32);
              break;
            default:
              XNN_UNREACHABLE;
          }
          o += ${TILE_SIZE>>3};
        $elif OUT_PTRS == "MOV":
          o = (${XINT}*) ((uintptr_t) o + oN_stride);
          _mm_storeu_si32(o, v0_${TILE_SIZE-1}_lo);
          $if NUM_ITERS>3:
            v0_${TILE_SIZE-1}_lo = _mm_srli_epi64(v0_${TILE_SIZE-1}_lo, 32);
          ${XINT}* oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          $for N in reversed(range(2, TILE_SIZE, 2)):
            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
              o = oN;
            }
            _mm_storeu_si32(o, v0_${N}_lo);
            $if NUM_ITERS>3:
              v0_${N}_lo = _mm_srli_epi64(v0_${N}_lo, 32);
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
              o = oN;
            }
            _mm_storeu_si32(o, v0_${N-1}_lo);
            $if NUM_ITERS>3:
              v0_${N-1}_lo = _mm_srli_epi64(v0_${N-1}_lo, 32);
            oN = (${XINT}*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width > 1) {
            o = oN;
          }
          _mm_storeu_si32(o, v0_0_lo);
          $if NUM_ITERS>3:
            v0_0_lo = _mm_srli_epi64(v0_0_lo, 32);
          o += ${TILE_SIZE>>3};
      }
      $if NUM_ITERS>3:
        if (bh & ${TILE_SIZE>>4}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_${N}_lo));
                   $if NUM_ITERS > 4:
                     v0_${N}_lo = _mm_srli_epi32(v0_${N}_lo, 16);
                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  XNN_FALLTHROUGH
              case 1:
                 unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_1_lo));
                 $if NUM_ITERS > 4:
                   v0_1_lo = _mm_srli_epi32(v0_1_lo, 16);
                 XNN_FALLTHROUGH
              case 0:
                 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0_lo));
                 $if NUM_ITERS > 4:
                   v0_0_lo = _mm_srli_epi32(v0_0_lo, 16);
                break;
              default:
                XNN_UNREACHABLE;
            }
            $if NUM_ITERS>4:
              o += ${TILE_SIZE>>4};
          $elif OUT_PTRS == "MOV":
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1}_lo));
            $if NUM_ITERS > 4:
              v0_${TILE_SIZE-1}_lo = _mm_srli_epi32(v0_${TILE_SIZE-1}_lo, 16);
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                o = oN;
              }
              unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N}_lo));
              $if NUM_ITERS > 4:
                v0_${N}_lo = _mm_srli_epi32(v0_${N}_lo, 16);
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                o = oN;
              }
              unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N-1}_lo));
              $if NUM_ITERS > 4:
                v0_${N-1}_lo = _mm_srli_epi32(v0_${N-1}_lo, 16);
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              o = oN;
            }
            unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0_lo));
            $if NUM_ITERS > 4:
              v0_0_lo = _mm_srli_epi32(v0_0_lo, 16);
            $if NUM_ITERS > 4:
              o += ${TILE_SIZE>>4};
        }
      $if SIZE == 8:
        if (bh & 1) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  *oN = (uint8_t) _mm_cvtsi128_si32(v0_${N}_lo);
                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  XNN_FALLTHROUGH
              case 1:
                  *oN = (uint8_t) _mm_cvtsi128_si32(v0_1_lo);
                  XNN_FALLTHROUGH
              case 0:
                *o = (uint8_t) _mm_cvtsi128_si32(v0_0_lo);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS == "MOV":
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            *o = (uint8_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1}_lo);
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                o = oN;
              }
              *o = (uint8_t) _mm_cvtsi128_si32(v0_${N}_lo);
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                o = oN;
              }
              *o = (uint8_t) _mm_cvtsi128_si32(v0_${N-1}_lo);
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              o = oN;
            }
            *o = (uint8_t) _mm_cvtsi128_si32(v0_0_lo);
        }
    }

    i0 = (const ${XINT}*) ((uintptr_t) i0 + input_reset);
    o = (${XINT}*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
