/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
#endif
#if !defined(__DEVICE_FUNCTIONS_H__)
#define __DEVICE_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__cplusplus) && defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__ __cudart_builtin__
#else
#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __cudart_builtin__
#endif /* __CUDACC_RTC__ */
#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern "C"
{
/**
* \ingroup CUDA_MATH_INTRINSIC_INT
* \brief Calculate the most significant 32 bits of the product of the two 32-bit integers.
*
* Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
* are 32-bit integers.
*
* \return Returns the most significant 32 bits of the product \p x * \p y.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int __mulhi(int x, int y);
/**
* \ingroup CUDA_MATH_INTRINSIC_INT
* \brief Calculate the most significant 32 bits of the product of the two 32-bit unsigned integers.
*
* Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
* are 32-bit unsigned integers.
*
* \return Returns the most significant 32 bits of the product \p x * \p y.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __umulhi(unsigned int x, unsigned int y);
/**
* \ingroup CUDA_MATH_INTRINSIC_INT
* \brief Calculate the most significant 64 bits of the product of the two 64-bit integers.
*
* Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
* are 64-bit integers.
*
* \return Returns the most significant 64 bits of the product \p x * \p y.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int __mul64hi(long long int x, long long int y);
/**
* \ingroup CUDA_MATH_INTRINSIC_INT
* \brief Calculate the most significant 64 bits of the product of the two 64 unsigned bit integers.
*
* Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
* are 64-bit unsigned integers.
*
* \return Returns the most significant 64 bits of the product \p x * \p y.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
/**
* \ingroup CUDA_MATH_INTRINSIC_CAST
* \brief Reinterpret bits in an integer as a float.
*
* Reinterpret the bits in the signed integer value \p x as a single-precision
* floating-point value.
* \return Returns reinterpreted value.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __int_as_float(int x);
/**
* \ingroup CUDA_MATH_INTRINSIC_CAST
* \brief Reinterpret bits in a float as a signed integer.
*
* Reinterpret the bits in the single-precision floating-point value \p x
* as a signed integer.
* \return Returns reinterpreted value.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int __float_as_int(float x);
/**
* \ingroup CUDA_MATH_INTRINSIC_CAST
* \brief Reinterpret bits in an unsigned integer as a float.
*
* Reinterpret the bits in the unsigned integer value \p x as a single-precision
* floating-point value.
* \return Returns reinterpreted value.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __uint_as_float(unsigned int x);
/**
* \ingroup CUDA_MATH_INTRINSIC_CAST
* \brief Reinterpret bits in a float as a unsigned integer.
*
* Reinterpret the bits in the single-precision floating-point value \p x
* as a unsigned integer.
* \return Returns reinterpreted value.
*/
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __float_as_uint(float x);
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __syncthreads(void);
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __prof_trigger(int);
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __threadfence(void);
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __threadfence_block(void);
__DEVICE_FUNCTIONS_DECL__
#if defined(__GNUC__) || defined(__CUDACC_RTC__)
__attribute__((__noreturn__))
#elif defined(_MSC_VER)
__declspec(noreturn)
#endif /* defined(__GNUC__) || defined(__CUDACC_RTC__) */
__device_builtin__ void __trap(void);
__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __brkpt();
/**
* \ingroup CUDA_MATH_INTRINSIC_SINGLE
* \brief Clamp the input argument to [+0.0, 1.0].
*
* Clamp the input argument \p x to be within the interval [+0.0, 1.0].
* \return
* - __saturatef(\p x) returns 0 if \p x < 0.
* - __saturatef(\p x) returns 1 if \p x > 1.
* - __saturatef(\p x) returns \p x if
* \latexonly $0 \le x \le 1$ \endlatexonly
* \xmlonly
*
* input[0] = x<7:0> input[1] = x<15:8> * input[2] = x<23:16> input[3] = x<31:24> * input[4] = y<7:0> input[5] = y<15:8> * input[6] = y<23:16> input[7] = y<31:24> ** The selector indices are as follows (the upper 16-bits of the selector are not used): *
* selector[0] = s<2:0> selector[1] = s<6:4> * selector[2] = s<10:8> selector[3] = s<14:12> ** \return The returned value r is computed to be: * result[n] := input[selector[n]] * where result[n] is the nth byte of r. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s); /** * \ingroup CUDA_MATH_INTRINSIC_INT * \brief Compute average of signed input arguments, avoiding overflow * in the intermediate sum. * * Compute average of signed input arguments \p x and \p y * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum. * * \return Returns a signed integer value representing the signed * average value of the two inputs. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int __hadd(int x, int y); /** * \ingroup CUDA_MATH_INTRINSIC_INT * \brief Compute rounded average of signed input arguments, avoiding * overflow in the intermediate sum. * * Compute average of signed input arguments \p x and \p y * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate * sum. * * \return Returns a signed integer value representing the signed * rounded average value of the two inputs. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int __rhadd(int x, int y); /** * \ingroup CUDA_MATH_INTRINSIC_INT * \brief Compute average of unsigned input arguments, avoiding overflow * in the intermediate sum. * * Compute average of unsigned input arguments \p x and \p y * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum. * * \return Returns an unsigned integer value representing the unsigned * average value of the two inputs. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __uhadd(unsigned int x, unsigned int y); /** * \ingroup CUDA_MATH_INTRINSIC_INT * \brief Compute rounded average of unsigned input arguments, avoiding * overflow in the intermediate sum. * * Compute average of unsigned input arguments \p x and \p y * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate * sum. * * \return Returns an unsigned integer value representing the unsigned * rounded average value of the two inputs. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __urhadd(unsigned int x, unsigned int y); /** * \ingroup CUDA_MATH_INTRINSIC_CAST * \brief Convert a double to a signed int in round-towards-zero mode. * * Convert the double-precision floating-point value \p x to a * signed integer value in round-towards-zero mode. * \return Returns converted value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int __double2int_rz(double x); /** * \ingroup CUDA_MATH_INTRINSIC_CAST * \brief Convert a double to an unsigned int in round-towards-zero mode. * * Convert the double-precision floating-point value \p x to an * unsigned integer value in round-towards-zero mode. * \return Returns converted value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __double2uint_rz(double x); /** * \ingroup CUDA_MATH_INTRINSIC_CAST * \brief Convert a double to a signed 64-bit int in round-towards-zero mode. * * Convert the double-precision floating-point value \p x to a * signed 64-bit integer value in round-towards-zero mode. * \return Returns converted value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int __double2ll_rz(double x); /** * \ingroup CUDA_MATH_INTRINSIC_CAST * \brief Convert a double to an unsigned 64-bit int in round-towards-zero mode. * * Convert the double-precision floating-point value \p x to an * unsigned 64-bit integer value in round-towards-zero mode. * \return Returns converted value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __double2ull_rz(double x); __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __pm0(void); __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __pm1(void); __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __pm2(void); __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __pm3(void); /******************************************************************************* * * * FP16 SIMD functions * * * *******************************************************************************/ // #include "fp16.h" /******************************************************************************* * * * SIMD functions * * * *******************************************************************************/ /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword absolute value. * * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes, * then computes absolute value for each of parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs2(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword absolute value with signed saturation. * * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes, * then computes absolute value with signed saturation for each of parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss2(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed addition, with wrap-around: a + b * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then performs unsigned addition on corresponding parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword addition with signed saturation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then performs addition with signed saturation on corresponding parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss2 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword addition with unsigned saturation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then performs addition with unsigned saturation on corresponding parts. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus2 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed rounded average computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then computes signed rounded average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned rounded average computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then computes unsigned rounded average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned average computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, * then computes unsigned average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if they are equal, and 0000 otherwise. * For example __vcmpeq2(0x1234aba5, 0x1234aba6) returns 0xffff0000. * \return Returns 0xffff computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison: a >= b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise. * For example __vcmpges2(0x1234aba5, 0x1234aba6) returns 0xffff0000. * \return Returns 0xffff if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison: a >= b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise. * For example __vcmpgeu2(0x1234aba5, 0x1234aba6) returns 0xffff0000. * \return Returns 0xffff if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison: a > b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise. * For example __vcmpgts2(0x1234aba5, 0x1234aba6) returns 0x00000000. * \return Returns 0xffff if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison: a > b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise. * For example __vcmpgtu2(0x1234aba5, 0x1234aba6) returns 0x00000000. * \return Returns 0xffff if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison: a <= b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise. * For example __vcmples2(0x1234aba5, 0x1234aba6) returns 0xffffffff. * \return Returns 0xffff if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison: a <= b ? 0xffff : 0. * * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise. * For example __vcmpleu2(0x1234aba5, 0x1234aba6) returns 0xffffffff. * \return Returns 0xffff if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison: a < b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise. * For example __vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff. * \return Returns 0xffff if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison: a < b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise. * For example __vcmpltu2(0x1234aba5, 0x1234aba6) returns 0x0000ffff. * \return Returns 0xffff if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed comparison: a != b ? 0xffff : 0. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts result is ffff if 'a' part != 'b' part, and 0000 otherwise. * For example __vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff. * \return Returns 0xffff if a != b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword absolute difference of unsigned integer computation: |a - b| * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes absolute difference. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed maximum computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes signed maximum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned maximum computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes unsigned maximum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed minimum computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes signed minimum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned minimum computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes unsigned minimum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part == 'b' part. * If both equalities are satisfied, function returns 1. * \return Returns 1 if a = b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part >= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned minimum unsigned comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part >= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part > 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part > 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned minimum computation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword unsigned comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed comparison. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs comparison 'a' part != 'b' part. * If both conditions are satisfied, function returns 1. * \return Returns 1 if a != b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword sum of abs diff of unsigned. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes absolute differences and returns * sum of those differences. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed subtraction, with wrap-around. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs subtraction. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword (un)signed subtraction, with signed saturation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs subtraction with signed saturation. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss2 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword subtraction with unsigned saturation. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function performs subtraction with unsigned saturation. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus2 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword negation. * * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes. * For each part function computes negation. Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg2(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword negation with signed saturation. * * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes. * For each part function computes negation. Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss2(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-halfword sum of absolute difference of signed integer. * * Splits 4 bytes of each into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes absolute difference. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-halfword sum of absolute difference of signed. * * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. * For corresponding parts function computes absolute difference and sum it up. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads2(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte absolute value. * * Splits argument by bytes. Computes absolute value of each byte. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs4(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte absolute value with signed saturation. * * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte, * then computes absolute value with signed saturation for each of parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss4(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte (un)signed addition. * * Splits 'a' into 4 bytes, then performs unsigned addition on each of these * bytes with the corresponding byte from 'b', ignoring overflow. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte addition with signed saturation. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, * then performs addition with signed saturation on corresponding parts. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss4 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte addition with unsigned saturation. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, * then performs addition with unsigned saturation on corresponding parts. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus4 (unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte signed rounded average. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * then computes signed rounded average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned rounded average. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * then computes unsigned rounded average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte unsigned average. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * then computes unsigned average of corresponding parts. Partial results are * recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte (un)signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if they are equal, and 00 otherwise. * For example __vcmpeq4(0x1234aba5, 0x1234aba6) returns 0xffffff00. * \return Returns 0xff if a = b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise. * For example __vcmpges4(0x1234aba5, 0x1234aba6) returns 0xffffff00. * \return Returns 0xff if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise. * For example __vcmpgeu4(0x1234aba5, 0x1234aba6) returns 0xffffff00. * \return Returns 0xff if a = b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise. * For example __vcmpgts4(0x1234aba5, 0x1234aba6) returns 0x00000000. * \return Returns 0xff if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise. * For example __vcmpgtu4(0x1234aba5, 0x1234aba6) returns 0x00000000. * \return Returns 0xff if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise. * For example __vcmples4(0x1234aba5, 0x1234aba6) returns 0xffffffff. * \return Returns 0xff if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise. * For example __vcmpleu4(0x1234aba5, 0x1234aba6) returns 0xffffffff. * \return Returns 0xff if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise. * For example __vcmplts4(0x1234aba5, 0x1234aba6) returns 0x000000ff. * \return Returns 0xff if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise. * For example __vcmpltu4(0x1234aba5, 0x1234aba6) returns 0x000000ff. * \return Returns 0xff if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte (un)signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts result is ff if 'a' part != 'b' part, and 00 otherwise. * For example __vcmplts4(0x1234aba5, 0x1234aba6) returns 0x000000ff. * \return Returns 0xff if a != b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte absolute difference of unsigned integer. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes absolute difference. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte signed maximum. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes signed maximum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte unsigned maximum. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes unsigned maximum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte signed minimum. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes signed minimum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte unsigned minimum. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes unsigned minimum. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte (un)signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part == 'b' part. * If both equalities are satisfied, function returns 1. * \return Returns 1 if a = b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 part, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a <= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part <= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a < b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part >= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part >= 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a >= b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part > 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte unsigned comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part > 'b' part. * If both inequalities are satisfied, function returns 1. * \return Returns 1 if a > b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte (un)signed comparison. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs comparison 'a' part != 'b' part. * If both conditions are satisfied, function returns 1. * \return Returns 1 if a != b, else returns 0. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte sum of abs difference of unsigned. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes absolute differences and returns * sum of those differences. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte subtraction. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs subtraction. Partial results * are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte subtraction with signed saturation. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs subtraction with signed saturation. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte subtraction with unsigned saturation. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function performs subtraction with unsigned saturation. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte negation. * * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte. * For each part function computes negation. Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg4(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Performs per-byte negation with signed saturation. * * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte. * For each part function computes negation. Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss4(unsigned int a); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte absolute difference of signed integer. * * Splits 4 bytes of each into 4 parts, each consisting of 1 byte. * For corresponding parts function computes absolute difference. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs4(unsigned int a, unsigned int b); /** * \ingroup CUDA_MATH_INTRINSIC_SIMD * \brief Computes per-byte sum of abs difference of signed. * * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. * For corresponding parts function computes absolute difference and sum it up. * Partial results are recombined and returned as unsigned int. * \return Returns computed value. */ __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads4(unsigned int a, unsigned int b); /******************************************************************************* * * * END SIMD functions * * * *******************************************************************************/ } /******************************************************************************* * * * * * * *******************************************************************************/ #if defined(_WIN32) # define __DEVICE_FUNCTIONS_DEPRECATED__(msg) __declspec(deprecated(msg)) #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) # define __DEVICE_FUNCTIONS_DEPRECATED__(msg) __attribute__((deprecated)) #else # define __DEVICE_FUNCTIONS_DEPRECATED__(msg) __attribute__((deprecated(msg))) #endif #define ___DEVICE_FUNCTIONS_STRINGIFY_INNERMOST(x) #x #define __DEVICE_FUNCTIONS_STRINGIFY(x) ___DEVICE_FUNCTIONS_STRINGIFY_INNERMOST(x) #define __DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(x) __DEVICE_FUNCTIONS_STRINGIFY(x) "() is deprecated in favor of __" __DEVICE_FUNCTIONS_STRINGIFY(x) "() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." #define __DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(x,y) __DEVICE_FUNCTIONS_STRINGIFY(x) "() is deprecated in favor of __" __DEVICE_FUNCTIONS_STRINGIFY(x) __DEVICE_FUNCTIONS_STRINGIFY(y) "() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mulhi)) int mulhi(const int a, const int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mulhi)) unsigned int mulhi(const unsigned int a, const unsigned int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mulhi)) unsigned int mulhi(const int a, const unsigned int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mulhi)) unsigned int mulhi(const unsigned int a, const int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mul64hi)) long long int mul64hi(const long long int a, const long long int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mul64hi)) unsigned long long int mul64hi(const unsigned long long int a, const unsigned long long int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mul64hi)) unsigned long long int mul64hi(const long long int a, const unsigned long long int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mul64hi)) unsigned long long int mul64hi(const unsigned long long int a, const long long int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(float_as_int)) int float_as_int(const float a); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(int_as_float)) float int_as_float(const int a); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(float_as_uint)) unsigned int float_as_uint(const float a); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(uint_as_float)) float uint_as_float(const unsigned int a); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(saturate,f)) float saturate(const float a); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(mul24)) int mul24(const int a, const int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1(umul24)) unsigned int umul24(const unsigned int a, const unsigned int b); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(float2int,_ru|_rd|_rn|_rz)) int float2int(const float a, const enum cudaRoundMode mode = cudaRoundZero); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(float2uint,_ru|_rd|_rn|_rz)) unsigned int float2uint(const float a, const enum cudaRoundMode mode = cudaRoundZero); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(int2float,_ru|_rd|_rn|_rz)) float int2float(const int a, const enum cudaRoundMode mode = cudaRoundNearest); __DEVICE_FUNCTIONS_STATIC_DECL__ __DEVICE_FUNCTIONS_DEPRECATED__(__DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2(uint2float,_ru|_rd|_rn|_rz)) float uint2float(const unsigned int a, const enum cudaRoundMode mode = cudaRoundNearest); #undef __DEVICE_FUNCTIONS_DEPRECATED__ #undef ___DEVICE_FUNCTIONS_STRINGIFY_INNERMOST #undef __DEVICE_FUNCTIONS_STRINGIFY #undef __DEVICE_FUNCTIONS_DEPRECATION_MESSAGE1 #undef __DEVICE_FUNCTIONS_DEPRECATION_MESSAGE2 #undef __DEVICE_FUNCTIONS_DECL__ #undef __DEVICE_FUNCTIONS_STATIC_DECL__ #endif /* __cplusplus && __CUDACC__ */ /******************************************************************************* * * * * * * *******************************************************************************/ #if !defined(__CUDACC_RTC__) #include "device_functions.hpp" #endif /* !defined(__CUDACC_RTC__) */ #include "device_atomic_functions.h" #include "device_double_functions.h" #include "sm_20_atomic_functions.h" #include "sm_32_atomic_functions.h" #include "sm_35_atomic_functions.h" #include "sm_60_atomic_functions.h" #include "sm_20_intrinsics.h" #include "sm_30_intrinsics.h" #include "sm_32_intrinsics.h" #include "sm_35_intrinsics.h" #include "sm_61_intrinsics.h" #include "sm_70_rt.h" #include "sm_80_rt.h" #include "sm_90_rt.h" #include "surface_functions.h" #include "texture_fetch_functions.h" #include "texture_indirect_functions.h" #include "surface_indirect_functions.h" #include "cudacc_ext.h" #ifdef __CUDACC__ extern "C" __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, struct CUstream_st *stream = 0); #endif /* __CUDACC__ */ #endif /* !__DEVICE_FUNCTIONS_H__ */ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__) #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__ #endif