/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <array>
#include <memory>

#include "common/bfloat16.hpp"
#include "common/bit_cast.hpp"
#include "common/dnnl_thread.hpp"

#include "cpu/platform.hpp"

#if DNNL_X64
#include "cpu/x64/cpu_isa_traits.hpp"
#include "cpu/x64/jit_avx512_core_bf16cvt.hpp"
#include "cpu/x64/jit_uni_convert_xf16.hpp"
#endif

namespace dnnl {
namespace impl {

// Attempts to convert the single float at `inp` into a bfloat16 stored at
// `out` by running a JIT-generated kernel.
//
// Returns true when the kernel was executed. Returns false, without writing
// to `out`, when the build is not x64 or when the CPU reports neither
// avx512_core nor avx2_vnni_2; the caller must then convert by other means.
bool try_cvt_float_to_bfloat16(bfloat16_t *out, const float *inp) {
#if DNNL_X64
    const bool has_jit_support
            = cpu::x64::mayiuse(cpu::x64::cpu_isa_t::avx512_core)
            || cpu::x64::mayiuse(cpu::x64::avx2_vnni_2);
    if (!has_jit_support) return false;

    // Function-local static: the kernel object is constructed once, on the
    // first call that reaches this point, and reused afterwards.
    // NOTE(review): the second constructor argument (1) presumably fixes the
    // element count to one, which would explain why `nelems` is not filled
    // in below -- confirm against jit_cvt_ps_to_xf16_t.
    static const cpu::x64::jit_cvt_ps_to_xf16_t single_elem_kernel(
            data_type::bf16, 1);

    cpu::x64::cvt_xf16_support::jit_call_t call_args;
    // The argument structure carries untyped pointers, so the const
    // qualifier on the input has to be dropped explicitly.
    call_args.inp = const_cast<void *>(static_cast<const void *>(inp));
    call_args.out = static_cast<void *>(out);

    single_elem_kernel(&call_args);
    return true;
#else
    return false;
#endif
}

// Converts `nelems` floats read from `inp` into bfloat16 values written to
// `out`.
//
// On x64 builds, when the CPU reports avx512_core or avx2_vnni_2, the whole
// range is handed to a JIT-generated kernel. Otherwise the elements are
// converted one by one through the float -> bfloat16_t assignment.
void cvt_float_to_bfloat16(bfloat16_t *out, const float *inp, size_t nelems) {
#if DNNL_X64
    const bool has_jit_support
            = cpu::x64::mayiuse(cpu::x64::cpu_isa_t::avx512_core)
            || cpu::x64::mayiuse(cpu::x64::avx2_vnni_2);
    if (has_jit_support) {
        // Function-local static: constructed once on first use, then reused.
        static const cpu::x64::jit_cvt_ps_to_xf16_t bulk_kernel(
                data_type::bf16);

        cpu::x64::cvt_xf16_support::jit_call_t call_args;
        // The argument structure carries untyped pointers, so the const
        // qualifier on the input has to be dropped explicitly.
        call_args.inp = const_cast<void *>(static_cast<const void *>(inp));
        call_args.out = static_cast<void *>(out);
        call_args.nelems = nelems;

        bulk_kernel(&call_args);
        return;
    }
#endif

    // Scalar fallback; the loop is annotated with the project's SIMD pragma
    // macro.
    PRAGMA_OMP_SIMD()
    for (size_t idx = 0; idx < nelems; ++idx) {
        out[idx] = inp[idx];
    }
}

void cvt_bfloat16_to_float(float *out, const bfloat16_t *inp, size_t nelems) {
#if DNNL_X64
    using namespace cpu::x64;
    if (mayiuse(cpu_isa_t::avx512_core) || mayiuse(avx2_vnni_2)) {
        static const cpu::x64::jit_cvt_xf16_to_ps_t kernel(
                data_type::bf16, false);
        return kernel(out, inp, nelems);
    }
#endif

    PRAGMA_OMP_SIMD()
    for (size_t i = 0; i < nelems; ++i)
        out[i] = inp[i];
}

// For each of the `nelems` positions, adds the floats from `inp0` and `inp1`
// and stores the sum, converted to bfloat16, in `out`.
//
// On x64 builds the work goes to a JIT-generated kernel, but only when the
// CPU reports avx512_core (unlike the plain conversions in this file, there
// is no avx2_vnni_2 path here). Otherwise each sum is computed in float and
// assigned to the bfloat16_t output element.
void add_floats_and_cvt_to_bfloat16(
        bfloat16_t *out, const float *inp0, const float *inp1, size_t nelems) {
#if DNNL_X64
    const bool has_avx512_core
            = cpu::x64::mayiuse(cpu::x64::cpu_isa_t::avx512_core);
    if (has_avx512_core) {
        // Function-local static: constructed once on first use, then reused.
        static const cpu::x64::jit_avx512_core_add_cvt_ps_to_bf16_t
                add_and_convert_kernel;

        cpu::x64::bf16_support::jit_call_t call_args;
        // The argument structure carries untyped pointers, so the const
        // qualifier on both inputs has to be dropped explicitly.
        call_args.inp = const_cast<void *>(static_cast<const void *>(inp0));
        call_args.add = const_cast<void *>(static_cast<const void *>(inp1));
        call_args.out = static_cast<void *>(out);
        call_args.nelems = nelems;

        add_and_convert_kernel(&call_args);
        return;
    }
#endif

    // Scalar fallback; the loop is annotated with the project's SIMD pragma
    // macro.
    PRAGMA_OMP_SIMD()
    for (size_t idx = 0; idx < nelems; ++idx) {
        out[idx] = inp0[idx] + inp1[idx];
    }
}

} // namespace impl
} // namespace dnnl
