// cpp/en/intrin__cpp_8hpp_source.html

/*M///////////////////////////////////////////////////////////////////////////////////////

//

//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//

//  By downloading, copying, installing or using the software you agree to this license.

//  If you do not agree to this license, do not download, install,

//  copy or use the software.

//

//

//                          License Agreement

//                For Open Source Computer Vision Library

//

// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.

// Copyright (C) 2009, Willow Garage Inc., all rights reserved.

// Copyright (C) 2013, OpenCV Foundation, all rights reserved.

// Copyright (C) 2015, Itseez Inc., all rights reserved.

// Third party copyrights are property of their respective owners.

//

// Redistribution and use in source and binary forms, with or without modification,

// are permitted provided that the following conditions are met:

//

//   * Redistribution's of source code must retain the above copyright notice,

//     this list of conditions and the following disclaimer.

//

//   * Redistribution's in binary form must reproduce the above copyright notice,

//     this list of conditions and the following disclaimer in the documentation

//     and/or other materials provided with the distribution.

//

//   * The name of the copyright holders may not be used to endorse or promote products

//     derived from this software without specific prior written permission.

//

// This software is provided by the copyright holders and contributors "as is" and

// any express or implied warranties, including, but not limited to, the implied

// warranties of merchantability and fitness for a particular purpose are disclaimed.

// In no event shall the Intel Corporation or contributors be liable for any direct,

// indirect, incidental, special, exemplary, or consequential damages

// (including, but not limited to, procurement of substitute goods or services;

// loss of use, data, or profits; or business interruption) however caused

// and on any theory of liability, whether in contract, strict liability,

// or tort (including negligence or otherwise) arising in any way out of

// the use of this software, even if advised of the possibility of such damage.

//

//M*/


#ifndef OPENCV_HAL_INTRIN_CPP_HPP

#define OPENCV_HAL_INTRIN_CPP_HPP


#include <limits>

#include <cstring>

#include <algorithm>

#include "opencv2/core/saturate.hpp"


// Feature switches for this header.
// CV_SIMD128_CPP is always set to 1 here.
#define CV_SIMD128_CPP 1

// When CV_FORCE_SIMD128_CPP is defined, the 128-bit switches are set to 1 as well.
#if defined(CV_FORCE_SIMD128_CPP)

#define CV_SIMD128 1

#define CV_SIMD128_64F 1

#endif

// Under CV_DOXYGEN every register width is reported as available;
// otherwise the 256- and 512-bit switches are explicitly set to 0.
#if defined(CV_DOXYGEN)

#define CV_SIMD128 1

#define CV_SIMD128_64F 1

#define CV_SIMD256 1

#define CV_SIMD256_64F 1

#define CV_SIMD512 1

#define CV_SIMD512_64F 1

#else

#define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation

#define CV_SIMD512 0 // to avoid warnings during compilation

#endif


namespace cv

{


#ifndef CV_DOXYGEN

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#endif


/** @brief Fixed-size pack of @c n lanes of type @c _Tp stored in a plain array.

The lanes live in the public member array @c s; lane 0 is @c s[0].
The default constructor leaves the lanes unset.
*/
template<typename _Tp, int n> struct v_reg
{
    typedef _Tp lane_type;
    enum { nlanes = n };

    //! Fills all @c n lanes from the consecutive elements starting at @c ptr.
    explicit v_reg(const _Tp* ptr)
    {
        for( int lane = 0; lane < n; ++lane )
            s[lane] = ptr[lane];
    }

    //! Sets lanes 0 and 1.
    v_reg(_Tp s0, _Tp s1)
    {
        s[0] = s0;
        s[1] = s1;
    }

    //! Sets lanes 0..3.
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3)
    {
        s[0] = s0;
        s[1] = s1;
        s[2] = s2;
        s[3] = s3;
    }

    //! Sets lanes 0..7.
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
    {
        s[0] = s0;
        s[1] = s1;
        s[2] = s2;
        s[3] = s3;
        s[4] = s4;
        s[5] = s5;
        s[6] = s6;
        s[7] = s7;
    }

    //! Sets lanes 0..15.
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
    {
        s[0] = s0;
        s[1] = s1;
        s[2] = s2;
        s[3] = s3;
        s[4] = s4;
        s[5] = s5;
        s[6] = s6;
        s[7] = s7;
        s[8] = s8;
        s[9] = s9;
        s[10] = s10;
        s[11] = s11;
        s[12] = s12;
        s[13] = s13;
        s[14] = s14;
        s[15] = s15;
    }

    //! Default constructor: lanes are not initialized.
    v_reg() {}

    //! Copies every lane of @c r.
    v_reg(const v_reg<_Tp, n> & r)
    {
        std::copy(r.s, r.s + n, s);
    }

    //! Returns lane 0.
    _Tp get0() const { return s[0]; }

    //! Returns lane @c i (no range check).
    _Tp get(const int i) const { return s[i]; }

    //! Returns a register whose lower n/2 lanes hold this register's upper n/2 lanes
    //! and whose following n/2 lanes are zero.
    v_reg<_Tp, n> high() const
    {
        const int half = n/2;
        v_reg<_Tp, n> res;
        for( int lane = 0; lane < half; ++lane )
            res.s[lane] = s[half + lane];
        for( int lane = half; lane < 2*half; ++lane )
            res.s[lane] = 0;
        return res;
    }

    //! Returns a register with every lane equal to (_Tp)0.
    static v_reg<_Tp, n> zero()
    {
        v_reg<_Tp, n> res;
        std::fill(res.s, res.s + n, (_Tp)0);
        return res;
    }

    //! Returns a register with every lane equal to @c s.
    static v_reg<_Tp, n> all(_Tp s)
    {
        v_reg<_Tp, n> res;
        for( int lane = 0; lane < n; ++lane )
            res.s[lane] = s;
        return res;
    }

    //! Byte-wise copy into a register of another lane type/count.
    //! Only min(sizeof source, sizeof destination) bytes are copied; any remaining
    //! destination bytes are left as the default constructor produced them.
    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
    {
        const size_t dstBytes = sizeof(_Tp2)*n2;
        const size_t srcBytes = sizeof(_Tp)*n;
        const size_t bytes = srcBytes < dstBytes ? srcBytes : dstBytes;
        v_reg<_Tp2, n2> res;
        std::memcpy(&res.s[0], &s[0], bytes);
        return res;
    }

    //! Lane-by-lane assignment.
    v_reg& operator=(const v_reg<_Tp, n> & r)
    {
        int lane = 0;
        while( lane < n )
        {
            s[lane] = r.s[lane];
            ++lane;
        }
        return *this;
    }

    //! Lane storage.
    _Tp s[n];
};


// Register aliases whose lane size times lane count is 16 bytes
// (the name suffix is <lane bits>x<lane count>).
typedef v_reg<uchar, 16> v_uint8x16;

typedef v_reg<schar, 16> v_int8x16;

typedef v_reg<ushort, 8> v_uint16x8;

typedef v_reg<short, 8> v_int16x8;

typedef v_reg<unsigned, 4> v_uint32x4;

typedef v_reg<int, 4> v_int32x4;

typedef v_reg<float, 4> v_float32x4;

typedef v_reg<double, 2> v_float64x2;

typedef v_reg<uint64, 2> v_uint64x2;

typedef v_reg<int64, 2> v_int64x2;


// Register aliases with twice the lane count of the 128-bit set;
// only declared when CV_SIMD256 is non-zero.
#if CV_SIMD256

typedef v_reg<uchar, 32> v_uint8x32;

typedef v_reg<schar, 32> v_int8x32;

typedef v_reg<ushort, 16> v_uint16x16;

typedef v_reg<short, 16> v_int16x16;

typedef v_reg<unsigned, 8> v_uint32x8;

typedef v_reg<int, 8> v_int32x8;

typedef v_reg<float, 8> v_float32x8;

typedef v_reg<double, 4> v_float64x4;

typedef v_reg<uint64, 4> v_uint64x4;

typedef v_reg<int64, 4> v_int64x4;

#endif


// Register aliases with four times the lane count of the 128-bit set;
// only declared when CV_SIMD512 is non-zero.
#if CV_SIMD512

typedef v_reg<uchar, 64> v_uint8x64;

typedef v_reg<schar, 64> v_int8x64;

typedef v_reg<ushort, 32> v_uint16x32;

typedef v_reg<short, 32> v_int16x32;

typedef v_reg<unsigned, 16> v_uint32x16;

typedef v_reg<int, 16> v_int32x16;

typedef v_reg<float, 16> v_float32x16;

typedef v_reg<double, 8> v_float64x8;

typedef v_reg<uint64, 8> v_uint64x8;

typedef v_reg<int64, 8> v_int64x8;

#endif


// Register widths in bytes. simdmax_width names the widest register enabled
// through CV_SIMD256 / CV_SIMD512.
enum {
    simd128_width = 16,
#if CV_SIMD256
    simd256_width = 32,
#endif
#if CV_SIMD512
    simd512_width = 64,
#endif
    simdmax_width =
#if CV_SIMD512
        simd512_width
#elif CV_SIMD256
        simd256_width
#else
        simd128_width
#endif
};


// Forward declarations of the per-lane operators on v_reg.
// Each binary operator is declared together with its compound-assignment form.

// Addition.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Subtraction.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Multiplication.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Division.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Bitwise AND.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Bitwise OR.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Bitwise XOR.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


// Bitwise NOT.
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);


#ifndef CV_DOXYGEN


// Invokes macro_name(<type>, <extra args>) once for every integer lane type:
// uchar, schar, ushort, short, unsigned, int, uint64, int64.
// The continuation lines must be contiguous: an empty line after a trailing
// backslash ends the macro definition.
#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int64, __VA_ARGS__))


// Invokes macro_name(<type>, <extra args>) for float and double.
// Continuation lines are kept contiguous so the whole list belongs to the macro.
#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
__CV_EXPAND(macro_name(double, __VA_ARGS__))


// Invokes macro_name for every integer lane type and then for float and double.
// Continuation lines are kept contiguous so both expansions belong to the macro.
#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__)


// Defines per-lane `a bin_op b` and `a bin_op= b` for v_reg<_Tp, n>.
// Each lane result is passed through saturate_cast<_Tp>.
// Continuation lines are contiguous: a blank line after a backslash would end
// the macro after its first line.
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return a; \
}

// Instantiates the operator pair for every lane type.
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)

CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
// Division is generated for float and double only.
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)


// Defines per-lane bitwise `a bit_op b` and `a bit_op= b` for v_reg<_Tp, n>.
// Lanes are converted with V_TypeTraits<_Tp>::reinterpret_int, combined, and
// converted back with reinterpret_from_int.
// Continuation lines are contiguous so the whole body belongs to the macro.
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return a; \
}

// Instantiates the bitwise operator pair for the integer and floating-point lane types.
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */

CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)


// Defines per-lane bitwise NOT for v_reg<_Tp, n>; the second macro argument is unused.
// Continuation lines are contiguous so the whole body belongs to the macro.
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
    return c; \
}

// Generated for the integer lane types only.
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)


#endif  // !CV_DOXYGEN


// Defines `func(a)`, which applies `cfunc` to every lane of `a` and returns
// the results as v_reg<_Tp2, n>.
// Continuation lines are contiguous so the function body belongs to the macro.
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp2, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i]); \
    return c; \
}

// Per-lane square root.
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)

// Per-lane sine, cosine, exponent and natural logarithm.
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)

// Per-lane absolute value; the result lanes have type V_TypeTraits<_Tp>::abs_type.
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
                          typename V_TypeTraits<_Tp>::abs_type)


// Defines `func(a, b)`, which applies the two-argument `cfunc` lane by lane.
// Continuation lines are contiguous so the function body belongs to the macro.
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i], b.s[i]); \
    return c; \
}

// Defines `func(a)`, which folds all lanes of `a` with `cfunc`, starting from lane 0.
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp c = a.s[0]; \
    for( int i = 1; i < n; i++ ) \
        c = cfunc(c, a.s[i]); \
    return c; \
}

// Per-lane minimum / maximum of two registers.
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)

OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)

// Minimum / maximum over the lanes of one register.
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)

OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)


// Lookup table indexed by a byte value: entry i is the number of set bits in i
// (256 entries, 16 per row).
static const unsigned char popCountTable[] =

{

    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,

    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,

};

/** @brief Counts the set bits of every lane.

The register is read byte by byte through v_reinterpret_as_u8; the popCountTable
entry of each byte is added to the lane that byte belongs to.
*/
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
{
    typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> count_reg;
    const int totalBytes = n*(int)sizeof(_Tp);
    count_reg counts = count_reg::zero();
    for (int byteIdx = 0; byteIdx < totalBytes; ++byteIdx)
    {
        counts.s[byteIdx/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[byteIdx]];
    }
    return counts;
}


/** @brief Writes the per-lane minimum of @c a and @c b to @c minval and the
per-lane maximum to @c maxval. */
template<typename _Tp, int n>
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
{
    int lane = 0;
    while( lane < n )
    {
        // The minimum is stored before the maximum is computed, lane by lane.
        minval.s[lane] = std::min(a.s[lane], b.s[lane]);
        maxval.s[lane] = std::max(a.s[lane], b.s[lane]);
        ++lane;
    }
}


// Defines per-lane comparison `a cmp_op b`. A lane whose comparison holds gets
// the integer value -1 (converted through V_TypeTraits<_Tp>::reinterpret_from_int);
// the other lanes get 0.
// Continuation lines are contiguous so the function body belongs to the macro.
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
    return c; \
}

OPENCV_HAL_IMPL_CMP_OP(<)

OPENCV_HAL_IMPL_CMP_OP(>)

OPENCV_HAL_IMPL_CMP_OP(<=)

OPENCV_HAL_IMPL_CMP_OP(>=)

OPENCV_HAL_IMPL_CMP_OP(==)

OPENCV_HAL_IMPL_CMP_OP(!=)


/** @brief Per-lane self-equality mask for float lanes.

A lane that compares equal to itself gets the integer value -1 (converted through
reinterpret_from_int); a lane that does not (a NaN) gets 0.
*/
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
{
    typedef V_TypeTraits<float> Traits;
    typedef typename Traits::int_type itype;
    v_reg<float, n> res;
    for (int lane = 0; lane < n; ++lane)
    {
        const bool equalsItself = (a.s[lane] == a.s[lane]);
        res.s[lane] = Traits::reinterpret_from_int((itype)-(int)equalsItself);
    }
    return res;
}

/** @brief Per-lane self-equality mask for double lanes.

A lane that compares equal to itself gets the integer value -1 (converted through
reinterpret_from_int); a lane that does not (a NaN) gets 0.
*/
template<int n>
inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
{
    typedef V_TypeTraits<double> Traits;
    typedef typename Traits::int_type itype;
    v_reg<double, n> res;
    for (int lane = 0; lane < n; ++lane)
    {
        const bool equalsItself = (a.s[lane] == a.s[lane]);
        res.s[lane] = Traits::reinterpret_from_int((itype)-(int)equalsItself);
    }
    return res;
}


// Defines `func(a, b)`, which computes `cast_op(a[i] bin_op b[i])` for every lane
// and returns the results as v_reg<_Tp2, n>.
// Continuation lines are contiguous so the function body belongs to the macro.
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef _Tp2 rtype; \
    v_reg<rtype, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
    return c; \
}

// Add / subtract / multiply with a plain (_Tp) cast of the result instead of saturation.
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)

OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)

OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)


//! Returns the larger argument minus the smaller one, converted to T.
template<typename T> inline T _absdiff(T a, T b)
{
    if (a > b)
        return a - b;
    return b - a;
}


/** @brief Per-lane absolute difference, returned in V_TypeTraits<_Tp>::abs_type lanes.

For signed lane types both operands are XOR-ed with the top bit of the result type
before the larger-minus-smaller subtraction is done in that type; for unsigned lane
types the mask is 0.
*/
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
{
    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
    v_reg<rtype, n> c;
    // Build the top-bit constant from an rtype one: shifting a plain int `1`
    // into (or past) its sign bit is not well defined for 32-bit and wider lanes.
    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? ((rtype)1 << (sizeof(rtype)*8 - 1)) : 0);
    for( int i = 0; i < n; i++ )
    {
        rtype ua = a.s[i] ^ mask;
        rtype ub = b.s[i] ^ mask;
        c.s[i] = _absdiff(ua, ub);
    }
    return c;
}


//! Per-lane absolute difference for float lanes (larger minus smaller).
template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
{
    v_reg<float, n> diff;
    for( int lane = 0; lane < n; ++lane )
    {
        diff.s[lane] = _absdiff(a.s[lane], b.s[lane]);
    }
    return diff;
}


//! Per-lane absolute difference for double lanes (larger minus smaller).
template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    v_reg<double, n> diff;
    for( int lane = 0; lane < n; ++lane )
    {
        diff.s[lane] = _absdiff(a.s[lane], b.s[lane]);
    }
    return diff;
}


/** @brief Per-lane |a - b| passed through saturate_cast<_Tp>, so the result keeps
the input lane type. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> res;
    int lane = 0;
    while( lane < n )
    {
        res.s[lane] = saturate_cast<_Tp>(std::abs(a.s[lane] - b.s[lane]));
        ++lane;
    }
    return res;
}


//! Per-lane reciprocal square root: 1 / sqrt(a[i]).
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
{
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = 1.f/std::sqrt(a.s[lane]);
    }
    return res;
}


//! Per-lane magnitude: sqrt(a[i]*a[i] + b[i]*b[i]).
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = std::sqrt(a.s[lane]*a.s[lane] + b.s[lane]*b.s[lane]);
    }
    return res;
}


//! Per-lane squared magnitude: a[i]*a[i] + b[i]*b[i].
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = a.s[lane]*a.s[lane] + b.s[lane]*b.s[lane];
    }
    return res;
}


//! Per-lane multiply-add: a[i]*b[i] + c[i].
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                           const v_reg<_Tp, n>& c)
{
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = a.s[lane]*b.s[lane] + c.s[lane];
    }
    return res;
}


//! Same operation as v_fma: forwards all three arguments to it.
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
{
    return v_fma(a, b, c);
}


/** @brief Pairwise dot product.

Output lane k is a[2k]*b[2k] + a[2k+1]*b[2k+1], computed in V_TypeTraits<_Tp>::w_type;
the result has n/2 lanes.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> res;
    for( int k = 0; k < (n/2); ++k )
    {
        const int even = k*2;
        const int odd = k*2+1;
        res.s[k] = (w_type)a.s[even]*b.s[even] + (w_type)a.s[odd]*b.s[odd];
    }
    return res;
}


/** @brief Pairwise dot product with accumulation.

Output lane k is a[2k]*b[2k] + a[2k+1]*b[2k+1] + c[k], computed in
V_TypeTraits<_Tp>::w_type; the result has n/2 lanes.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> res;
    for( int k = 0; k < (n/2); ++k )
    {
        const int even = k*2;
        const int odd = k*2+1;
        res.s[k] = (w_type)a.s[even]*b.s[even] + (w_type)a.s[odd]*b.s[odd] + c.s[k];
    }
    return res;
}


//! In this implementation simply forwards to v_dotprod(a, b).
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    return v_dotprod(a, b);
}


//! In this implementation simply forwards to v_dotprod(a, b, c).
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
    return v_dotprod(a, b, c);
}


/** @brief Dot product over groups of four lanes.

Output lane k is the sum of a[4k+j]*b[4k+j] for j = 0..3, computed in
V_TypeTraits<_Tp>::q_type; the result has n/4 lanes.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> res;
    for( int k = 0; k < (n/4); ++k )
    {
        const int j = k*4;
        res.s[k] = (q_type)a.s[j    ]*b.s[j    ] + (q_type)a.s[j + 1]*b.s[j + 1] +
                   (q_type)a.s[j + 2]*b.s[j + 2] + (q_type)a.s[j + 3]*b.s[j + 3];
    }
    return res;
}


/** @brief Dot product over groups of four lanes, with accumulation.

Output lane k is the sum of a[4k+j]*b[4k+j] for j = 0..3 plus c[k], computed in
V_TypeTraits<_Tp>::q_type; the result has n/4 lanes.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> res;
    for( int k = 0; k < (n/4); ++k )
    {
        const int j = k*4;
        res.s[k] = (q_type)a.s[j    ]*b.s[j    ] + (q_type)a.s[j + 1]*b.s[j + 1] +
                   (q_type)a.s[j + 2]*b.s[j + 2] + (q_type)a.s[j + 3]*b.s[j + 3] + c.s[k];
    }
    return res;
}


//! In this implementation simply forwards to v_dotprod_expand(a, b).
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    return v_dotprod_expand(a, b);
}


//! In this implementation simply forwards to v_dotprod_expand(a, b, c).
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{
    return v_dotprod_expand(a, b, c);
}


/** @brief Widening multiply.

Products of the lower n/2 lane pairs go to @c c and products of the upper n/2 lane
pairs go to @c d, each computed in V_TypeTraits<_Tp>::w_type.
*/
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    for( int lo = 0; lo < (n/2); ++lo )
    {
        const int hi = lo+(n/2);
        c.s[lo] = (w_type)a.s[lo]*b.s[lo];
        d.s[lo] = (w_type)a.s[hi]*b.s[hi];
    }
}


/** @brief Per-lane product computed in V_TypeTraits<_Tp>::w_type, shifted right by
the bit width of @c _Tp and cast back to @c _Tp. */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<_Tp, n> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane] = (_Tp)(((w_type)a.s[lane] * b.s[lane]) >> sizeof(_Tp)*8);
    }
    return res;
}


/** @brief Pairwise widening sum: c[k] = a[2k] + a[2k+1], computed in
V_TypeTraits<_Tp>::w_type. */
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    int k = 0;
    while( k < (n/2) )
    {
        c.s[k] = (w_type)a.s[k*2] + a.s[k*2+1];
        ++k;
    }
}


// Defines `a shift_op imm`: every lane is shifted by the run-time amount `imm`
// and the result is cast back to _Tp.
// Continuation lines are contiguous so the function body belongs to the macro.
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
    return c; \
}

OPENCV_HAL_IMPL_SHIFT_OP(<< )

OPENCV_HAL_IMPL_SHIFT_OP(>> )


// Defines v_rotate_<suffix><imm>(a) and v_rotate_<suffix><imm>(a, b).
// One-register form: output lane i takes a[i opA imm]; lanes whose source index
// falls outside [0, n) become 0.
// Two-register form: output lane i takes b[i opA imm opB n] when that index is in
// range, otherwise a[i opA imm] when that is in range, otherwise 0.
// Continuation lines are contiguous so both function bodies belong to the macro.
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> b; \
    for (int i = 0; i < n; i++) \
    { \
        int sIndex = i opA imm; \
        if (0 <= sIndex && sIndex < n) \
        { \
            b.s[i] = a.s[sIndex]; \
        } \
        else \
        { \
            b.s[i] = 0; \
        } \
    } \
    return b; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for (int i = 0; i < n; i++) \
    { \
        int aIndex = i opA imm; \
        int bIndex = i opA imm opB n; \
        if (0 <= bIndex && bIndex < n) \
        { \
            c.s[i] = b.s[bIndex]; \
        } \
        else if (0 <= aIndex && aIndex < n) \
        { \
            c.s[i] = a.s[aIndex]; \
        } \
        else \
        { \
            c.s[i] = 0; \
        } \
    } \
    return c; \
}

OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)

OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)


//! Sums all lanes in order, starting from lane 0, in V_TypeTraits<_Tp>::sum_type.
template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
{
    typedef typename V_TypeTraits<_Tp>::sum_type sum_type;
    sum_type total = a.s[0];
    int lane = 1;
    while( lane < n )
    {
        total += a.s[lane];
        ++lane;
    }
    return total;
}


/** @brief Sums each group of four lanes of four registers.

Within every group of four lanes, the output lanes hold, in order, the group sums
of @c a, @c b, @c c and @c d.
*/
template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
    const v_reg<float, n>& c, const v_reg<float, n>& d)
{
    v_reg<float, n> res;
    const int groupedLanes = (n/4)*4;
    for(int j = 0; j < groupedLanes; j += 4)
    {
        res.s[j + 0] = a.s[j + 0] + a.s[j + 1] + a.s[j + 2] + a.s[j + 3];
        res.s[j + 1] = b.s[j + 0] + b.s[j + 1] + b.s[j + 2] + b.s[j + 3];
        res.s[j + 2] = c.s[j + 0] + c.s[j + 1] + c.s[j + 2] + c.s[j + 3];
        res.s[j + 3] = d.s[j + 0] + d.s[j + 1] + d.s[j + 2] + d.s[j + 3];
    }
    return res;
}


/** @brief Sum of absolute differences over all lanes.

Each per-lane difference is formed in V_TypeTraits<_Tp>::abs_type before it is added
to the sum. Subtracting in the signed lane type and converting afterwards would turn
a difference above the signed maximum (e.g. 127 - (-128) for 8-bit lanes) into a
negative value that is then sign-extended into the accumulator.
*/
template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::abs_type abs_type;
    typedef typename V_TypeTraits<abs_type>::sum_type sum_type;
    sum_type c = 0;
    for (int i = 0; i < n; i++)
    {
        // The ordering is decided on the original lane values; the subtraction
        // itself is done on the abs_type conversions.
        const abs_type x = (abs_type)a.s[i];
        const abs_type y = (abs_type)b.s[i];
        c += (abs_type)(a.s[i] > b.s[i] ? x - y : y - x);
    }
    return c;
}


/** @brief Packs per-lane sign information into an int: bit i is set when the
integer reinterpretation of lane i is negative. */
template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
{
    int bits = 0;
    for( int lane = 0; lane < n; ++lane )
    {
        const bool negative = V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0;
        bits |= negative << lane;
    }
    return bits;
}


/** @brief Returns the index of the first lane whose integer reinterpretation is
negative; returns 0 when no lane qualifies. */
template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
{
    int found = 0;
    for (int lane = 0; lane < n; ++lane)
    {
        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0)
        {
            found = lane;
            break;
        }
    }
    return found;
}


//! Returns true when the integer reinterpretation of every lane is negative.
template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
{
    int lane = 0;
    while( lane < n )
    {
        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) >= 0 )
        {
            return false;
        }
        ++lane;
    }
    return true;
}


//! Returns true when the integer reinterpretation of at least one lane is negative.
template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
{
    int lane = 0;
    while( lane < n )
    {
        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0 )
        {
            return true;
        }
        ++lane;
    }
    return false;
}


/** @brief Per-lane select: takes the lane from @c a where the integer reinterpretation
of the mask lane is non-zero and from @c b otherwise.

A debug assertion requires every mask lane to be either all zeros or all ones.
*/
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef V_TypeTraits<_Tp> Traits;
    typedef typename Traits::int_type int_type;
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        int_type m = Traits::reinterpret_int(mask.s[lane]);
        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
        if( m )
            res.s[lane] = a.s[lane];
        else
            res.s[lane] = b.s[lane];
    }
    return res;
}


/** @brief Widens a register into two: the lower n/2 lanes of @c a go to @c b0 and
the upper n/2 lanes go to @c b1, converted to V_TypeTraits<_Tp>::w_type. */
template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
{
    int lo = 0;
    while( lo < (n/2) )
    {
        b0.s[lo] = a.s[lo];
        b1.s[lo] = a.s[lo+(n/2)];
        ++lo;
    }
}


//! Returns the lower n/2 lanes of @c a converted to V_TypeTraits<_Tp>::w_type.
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp, n>& a)
{
    typedef v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> wide_reg;
    wide_reg res;
    for( int lane = 0; lane < (n/2); ++lane )
    {
        res.s[lane] = a.s[lane];
    }
    return res;
}


//! Returns the upper n/2 lanes of @c a converted to V_TypeTraits<_Tp>::w_type.
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp, n>& a)
{
    typedef v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> wide_reg;
    wide_reg res;
    for( int lane = 0; lane < (n/2); ++lane )
    {
        res.s[lane] = a.s[lane+(n/2)];
    }
    return res;
}


//! Converts every lane with V_TypeTraits<_Tp>::reinterpret_int.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<typename Traits::int_type, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = Traits::reinterpret_int(a.s[lane]);
    }
    return res;
}


//! Converts every lane with V_TypeTraits<_Tp>::reinterpret_uint.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<typename Traits::uint_type, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = Traits::reinterpret_uint(a.s[lane]);
    }
    return res;
}


/** @brief Interleaves two registers.

@c b0 receives a0[0], a1[0], a0[1], a1[1], ... built from the lower halves of the
inputs; @c b1 receives the same pattern built from the upper halves.
*/
template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
{
    const int half = n/2;
    // Lower halves -> b0.
    for( int src = 0; src < half; ++src )
    {
        b0.s[src*2] = a0.s[src];
        b0.s[src*2+1] = a1.s[src];
    }
    // Upper halves -> b1; (src*2 - n) restarts the destination index at 0.
    for( int src = half; src < n; ++src )
    {
        b1.s[src*2-n] = a0.s[src];
        b1.s[src*2-n+1] = a1.s[src];
    }
}


/** @brief Loads a 128-bit register (simd128_width / sizeof(_Tp) lanes) from @c ptr.

With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return reg_type(ptr);
}


#if CV_SIMD256
/** @brief Loads a 256-bit register (simd256_width / sizeof(_Tp) lanes) from @c ptr.

With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return reg_type(ptr);
}
#endif


#if CV_SIMD512
/** @brief Loads a 512-bit register (simd512_width / sizeof(_Tp) lanes) from @c ptr.

With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return reg_type(ptr);
}
#endif


/** @brief Loads a 128-bit register from @c ptr, which is asserted to be aligned to
the size of the whole register. */
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_type;
    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
    return reg_type(ptr);
}


#if CV_SIMD256
/** @brief Loads a 256-bit register from @c ptr, which is asserted to be aligned to
the size of the whole register. */
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_type;
    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
    return reg_type(ptr);
}
#endif


#if CV_SIMD512
/** @brief Loads a 512-bit register from @c ptr, which is asserted to be aligned to
the size of the whole register. */
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_type;
    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
    return reg_type(ptr);
}
#endif


/** @brief Loads the lower half of a 128-bit register from @c ptr.

Only the first nlanes/2 lanes are written; the remaining lanes are not assigned.
With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    reg_type res;
    const int half = res.nlanes/2;
    for( int lane = 0; lane < half; ++lane )
        res.s[lane] = ptr[lane];
    return res;
}


#if CV_SIMD256
/** @brief Loads the lower half of a 256-bit register from @c ptr.

Only the first nlanes/2 lanes are written; the remaining lanes are not assigned.
With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    reg_type res;
    const int half = res.nlanes / 2;
    for (int lane = 0; lane < half; ++lane)
        res.s[lane] = ptr[lane];
    return res;
}
#endif


#if CV_SIMD512
/** @brief Loads the lower half of a 512-bit register from @c ptr.

Only the first nlanes/2 lanes are written; the remaining lanes are not assigned.
With CV_STRONG_ALIGNMENT the pointer is asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
{
    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    reg_type res;
    const int half = res.nlanes / 2;
    for (int lane = 0; lane < half; ++lane)
        res.s[lane] = ptr[lane];
    return res;
}
#endif


/** @brief Builds a 128-bit register from two memory locations: the lower nlanes/2
lanes come from @c loptr and the upper nlanes/2 lanes from @c hiptr.

With CV_STRONG_ALIGNMENT both pointers are asserted to be aligned to sizeof(_Tp).
*/
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_type;
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
    reg_type res;
    const int half = res.nlanes/2;
    for( int lane = 0; lane < half; ++lane )
    {
        res.s[lane] = loptr[lane];
        res.s[lane+half] = hiptr[lane];
    }
    return res;
}


#if CV_SIMD256

/** @brief Assembles a simd256 register from two separate half-sized memory blocks.

With CV_STRONG_ALIGNMENT enabled, both pointers are asserted to be aligned to sizeof(_Tp).

@param loptr source of lanes [0, nlanes/2).
@param hiptr source of lanes [nlanes/2, 2*(nlanes/2)).
@return register combining both halves. */
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> vec_type;
    vec_type dst;
    const int half = dst.nlanes / 2;
    for (int lane = 0; lane < half; ++lane)
    {
        dst.s[lane] = loptr[lane];
        dst.s[lane + half] = hiptr[lane];
    }
    return dst;
}

#endif


#if CV_SIMD512

/** @brief Assembles a simd512 register from two separate half-sized memory blocks.

With CV_STRONG_ALIGNMENT enabled, both pointers are asserted to be aligned to sizeof(_Tp).

@param loptr source of lanes [0, nlanes/2).
@param hiptr source of lanes [nlanes/2, 2*(nlanes/2)).
@return register combining both halves. */
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> vec_type;
    vec_type dst;
    const int half = dst.nlanes / 2;
    for (int lane = 0; lane < half; ++lane)
    {
        dst.s[lane] = loptr[lane];
        dst.s[lane + half] = hiptr[lane];
    }
    return dst;
}

#endif


/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::w_type (simd128 result).

Reads as many source elements as the widened register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of widened values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
    typedef v_reg<wide_t, simd128_width / sizeof(wide_t)> wide_vec;
    wide_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}


#if CV_SIMD256

/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::w_type (simd256 result).

Reads as many source elements as the widened register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of widened values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v256_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
    typedef v_reg<wide_t, simd256_width / sizeof(wide_t)> wide_vec;
    wide_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}

#endif


#if CV_SIMD512

/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::w_type (simd512 result).

Reads as many source elements as the widened register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of widened values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v512_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
    typedef v_reg<wide_t, simd512_width / sizeof(wide_t)> wide_vec;
    wide_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}

#endif


/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::q_type (simd128 result).

Reads as many source elements as the resulting register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of q_type values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
    typedef v_reg<quad_t, simd128_width / sizeof(quad_t)> quad_vec;
    quad_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}


#if CV_SIMD256

/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::q_type (simd256 result).

Reads as many source elements as the resulting register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of q_type values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v256_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
    typedef v_reg<quad_t, simd256_width / sizeof(quad_t)> quad_vec;
    quad_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}

#endif


#if CV_SIMD512

/** @brief Loads elements and widens each one to V_TypeTraits<_Tp>::q_type (simd512 result).

Reads as many source elements as the resulting register has lanes; each element is converted
by plain assignment. With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr source array of narrow elements.
@return register of q_type values. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v512_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
    typedef v_reg<quad_t, simd512_width / sizeof(quad_t)> quad_vec;
    quad_vec dst;
    for (int lane = 0; lane < dst.nlanes; ++lane)
        dst.s[lane] = ptr[lane];
    return dst;
}

#endif


/** @brief Splits 2-channel interleaved memory into two registers.

Element 2*k goes to lane k of @p a and element 2*k+1 to lane k of @p b; 2*n elements are read.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr interleaved source data.
@param a receives the first channel.
@param b receives the second channel. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                            v_reg<_Tp, n>& b)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int src = 2 * lane;
        a.s[lane] = ptr[src];
        b.s[lane] = ptr[src + 1];
    }
}


/** @brief Splits 3-channel interleaved memory into three registers.

Elements 3*k, 3*k+1 and 3*k+2 go to lane k of @p a, @p b and @p c respectively; 3*n elements are read.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr interleaved source data.
@param a receives the first channel.
@param b receives the second channel.
@param c receives the third channel. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int src = 3 * lane;
        a.s[lane] = ptr[src];
        b.s[lane] = ptr[src + 1];
        c.s[lane] = ptr[src + 2];
    }
}


/** @brief Splits 4-channel interleaved memory into four registers.

Elements 4*k .. 4*k+3 go to lane k of @p a, @p b, @p c and @p d respectively; 4*n elements are read.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr interleaved source data.
@param a receives the first channel.
@param b receives the second channel.
@param c receives the third channel.
@param d receives the fourth channel. */
template<typename _Tp, int n>
inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
                                v_reg<_Tp, n>& d)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int src = 4 * lane;
        a.s[lane] = ptr[src];
        b.s[lane] = ptr[src + 1];
        c.s[lane] = ptr[src + 2];
        d.s[lane] = ptr[src + 3];
    }
}


/** @brief Writes two registers to memory as 2-channel interleaved data.

Lane k of @p a is stored at element 2*k and lane k of @p b at element 2*k+1; 2*n elements are written.
The store-mode argument is accepted but not used by this implementation.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer.
@param a first channel.
@param b second channel. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                               const v_reg<_Tp, n>& b,
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int dst = 2 * lane;
        ptr[dst] = a.s[lane];
        ptr[dst + 1] = b.s[lane];
    }
}


/** @brief Writes three registers to memory as 3-channel interleaved data.

Lane k of @p a, @p b and @p c is stored at elements 3*k, 3*k+1 and 3*k+2; 3*n elements are written.
The store-mode argument is accepted but not used by this implementation.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer.
@param a first channel.
@param b second channel.
@param c third channel. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int dst = 3 * lane;
        ptr[dst] = a.s[lane];
        ptr[dst + 1] = b.s[lane];
        ptr[dst + 2] = c.s[lane];
    }
}


/** @brief Writes four registers to memory as 4-channel interleaved data.

Lane k of @p a, @p b, @p c and @p d is stored at elements 4*k .. 4*k+3; 4*n elements are written.
The store-mode argument is accepted but not used by this implementation.
With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer.
@param a first channel.
@param b second channel.
@param c third channel.
@param d fourth channel. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                                            const v_reg<_Tp, n>& d,
                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    for (int lane = 0; lane < n; ++lane)
    {
        const int dst = 4 * lane;
        ptr[dst] = a.s[lane];
        ptr[dst + 1] = b.s[lane];
        ptr[dst + 2] = c.s[lane];
        ptr[dst + 3] = d.s[lane];
    }
}


/** @brief Copies all n lanes of a register to memory.

With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer; n elements are written.
@param a register to store. */
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    int lane = 0;
    while (lane < n)
    {
        ptr[lane] = a.s[lane];
        ++lane;
    }
}


/** @brief Store overload taking a store-mode hint, which this implementation ignores.

After the optional CV_STRONG_ALIGNMENT element-alignment assertion, the call is
forwarded to the two-argument v_store.

@param ptr destination buffer.
@param a register to store. */
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    // The mode is irrelevant here; delegate to the plain store.
    v_store(ptr, a);
}


/** @brief Copies the lower half (lanes [0, n/2)) of a register to memory.

With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer; n/2 elements are written.
@param a source register. */
template<typename _Tp, int n>
inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    const int half = n / 2;
    for (int lane = 0; lane < half; ++lane)
    {
        ptr[lane] = a.s[lane];
    }
}


/** @brief Copies the upper half (lanes [n/2, 2*(n/2))) of a register to memory.

With CV_STRONG_ALIGNMENT enabled, @p ptr is asserted to be aligned to sizeof(_Tp).

@param ptr destination buffer; n/2 elements are written starting at ptr[0].
@param a source register. */
template<typename _Tp, int n>
inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    const int half = n / 2;
    for (int lane = 0; lane < half; ++lane)
    {
        ptr[lane] = a.s[lane + half];
    }
}


/** @brief Stores a register to memory that must be aligned to the register size.

The alignment is always asserted (independent of CV_STRONG_ALIGNMENT); the copy itself
is done by v_store.

@param ptr destination buffer, aligned to sizeof(v_reg<_Tp, n>).
@param a register to store. */
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
{
    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
    // Alignment verified; reuse the generic element-wise store.
    v_store(ptr, a);
}


/** @brief Aligned store variant; in this implementation it does the same work as v_store_aligned.

The alignment to the register size is asserted and the data is copied by v_store.

@param ptr destination buffer, aligned to sizeof(v_reg<_Tp, n>).
@param a register to store. */
template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
    // Alignment verified; reuse the generic element-wise store.
    v_store(ptr, a);
}


/** @brief Aligned store overload taking a store-mode hint, which this implementation ignores.

The alignment to the register size is asserted and the data is copied by v_store.

@param ptr destination buffer, aligned to sizeof(v_reg<_Tp, n>).
@param a register to store. */
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
    // Alignment verified; reuse the generic element-wise store.
    v_store(ptr, a);
}


/** @brief Builds a register from the lower halves of two registers.

@param a supplies result lanes [0, n/2) from its lower half.
@param b supplies result lanes [n/2, 2*(n/2)) from its lower half.
@return combined register. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    const int half = n / 2;
    v_reg<_Tp, n> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = a.s[lane];
        out.s[lane + half] = b.s[lane];
    }
    return out;
}


/** @brief Builds a register from the upper halves of two registers.

@param a supplies result lanes [0, n/2) from its upper half.
@param b supplies result lanes [n/2, 2*(n/2)) from its upper half.
@return combined register. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    const int half = n / 2;
    v_reg<_Tp, n> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = a.s[lane + half];
        out.s[lane + half] = b.s[lane + half];
    }
    return out;
}


/** @brief Produces both half-combinations of two registers in one call.

@p low receives the lower halves of @p a and @p b; @p high receives their upper halves.
Outputs are written while the inputs are still being read, in the order shown below.

@param a first input.
@param b second input.
@param low output built from the lower halves.
@param high output built from the upper halves. */
template<typename _Tp, int n>
inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                        v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
{
    const int half = n / 2;
    for (int lane = 0; lane < half; ++lane)
    {
        const int upper = lane + half;
        low.s[lane] = a.s[lane];
        low.s[upper] = b.s[lane];
        high.s[lane] = a.s[upper];
        high.s[upper] = b.s[upper];
    }
}


/** @brief Returns a register with the lane order reversed.

@param a input register.
@return register whose lane k equals lane n-1-k of @p a. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
{
    v_reg<_Tp, n> out;
    for (int dst = 0, src = n - 1; dst < n; ++dst, --src)
    {
        out.s[dst] = a.s[src];
    }
    return out;
}


/** @brief Extracts n consecutive lanes from the concatenation of two registers.

The result starts at lane @p s of @p a: the first n-s lanes come from a.s[s..n),
the remaining lanes come from the beginning of @p b.

@param a first (lower) register.
@param b second (upper) register.
@return the selected window of n lanes. */
template<int s, typename _Tp, int n>
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> out;
    const int from_a = n - s; // number of lanes taken from the tail of a
    int lane = 0;
    while (lane < from_a)
    {
        out.s[lane] = a.s[lane + s];
        ++lane;
    }
    while (lane < n)
    {
        out.s[lane] = b.s[lane - from_a];
        ++lane;
    }
    return out;
}


/** @brief Returns the value of lane @p s of a register.

The index range 0 <= s < n is checked with CV_DbgAssert.

@param v input register.
@return copy of v.s[s]. */
template<int s, typename _Tp, int n>
inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
{
    CV_DbgAssert(s >= 0 && s < n);
    const _Tp& picked = v.s[s];
    return picked;
}


/** @brief Returns a register with every lane set to lane @p i of the input.

The index range 0 <= i < n is checked with CV_DbgAssert; the result is built with v_reg::all.

@param a input register.
@return register filled with a.s[i]. */
template<int i, typename _Tp, int n>
inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
{
    CV_DbgAssert(i >= 0 && i < n);
    typedef v_reg<_Tp, n> vec_type;
    return vec_type::all(a.s[i]);
}


/** @brief Converts each float lane to int using cvRound.

@param a input register.
@return register of per-lane cvRound results. */
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
    v_reg<int, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvRound(a.s[lane]);
    }
    return out;
}


/** @brief Converts two double registers to one int register of twice the lane count using cvRound.

@param a fills result lanes [0, n).
@param b fills result lanes [n, 2n).
@return register of per-lane cvRound results. */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    v_reg<int, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvRound(a.s[lane]);
        out.s[n + lane] = cvRound(b.s[lane]);
    }
    return out;
}


/** @brief Converts each float lane to int using cvFloor.

@param a input register.
@return register of per-lane cvFloor results. */
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
    v_reg<int, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvFloor(a.s[lane]);
    }
    return out;
}


/** @brief Converts each float lane to int using cvCeil.

@param a input register.
@return register of per-lane cvCeil results. */
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
    v_reg<int, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvCeil(a.s[lane]);
    }
    return out;
}


/** @brief Converts each float lane to int with a plain float-to-int cast.

@param a input register.
@return register of per-lane cast results. */
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
    v_reg<int, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<int>(a.s[lane]);
    }
    return out;
}


/** @brief Converts a double register to an int register of twice the lane count using cvRound.

@param a input register; its results occupy lanes [0, n).
@return register whose lanes [n, 2n) are set to 0. */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
    v_reg<int, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvRound(a.s[lane]);
        out.s[n + lane] = 0; // upper half is zero-filled
    }
    return out;
}


/** @brief Converts a double register to an int register of twice the lane count using cvFloor.

@param a input register; its results occupy lanes [0, n).
@return register whose lanes [n, 2n) are set to 0. */
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
    v_reg<int, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvFloor(a.s[lane]);
        out.s[n + lane] = 0; // upper half is zero-filled
    }
    return out;
}


/** @brief Converts a double register to an int register of twice the lane count using cvCeil.

@param a input register; its results occupy lanes [0, n).
@return register whose lanes [n, 2n) are set to 0. */
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
    v_reg<int, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = cvCeil(a.s[lane]);
        out.s[n + lane] = 0; // upper half is zero-filled
    }
    return out;
}


/** @brief Converts a double register to an int register of twice the lane count with a plain cast.

@param a input register; its results occupy lanes [0, n).
@return register whose lanes [n, 2n) are set to 0. */
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
    v_reg<int, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<int>(a.s[lane]);
        out.s[n + lane] = 0; // upper half is zero-filled
    }
    return out;
}


/** @brief Converts each int lane to float.

@param a input register.
@return register of per-lane converted values. */
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
    v_reg<float, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<float>(a.s[lane]);
    }
    return out;
}


/** @brief Converts a double register to a float register of twice the lane count.

@param a input register; its converted values occupy lanes [0, n).
@return register whose lanes [n, 2n) are set to 0. */
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
{
    v_reg<float, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<float>(a.s[lane]);
        out.s[n + lane] = 0; // upper half is zero-filled
    }
    return out;
}


/** @brief Converts two double registers to one float register of twice the lane count.

@param a fills result lanes [0, n).
@param b fills result lanes [n, 2n).
@return register of converted values. */
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    v_reg<float, n*2> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<float>(a.s[lane]);
        out.s[n + lane] = static_cast<float>(b.s[lane]);
    }
    return out;
}


/** @brief Converts the lower n/2 int lanes to double.

@param a input register; only lanes [0, n/2) are read.
@return register of n/2 doubles. */
template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
{
    const int half = n / 2;
    v_reg<double, (n/2)> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = static_cast<double>(a.s[lane]);
    }
    return out;
}


/** @brief Converts the upper n/2 int lanes to double.

@param a input register; only lanes [n/2, 2*(n/2)) are read.
@return register of n/2 doubles. */
template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
{
    const int half = n / 2;
    v_reg<double, (n/2)> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = static_cast<double>(a.s[lane + half]);
    }
    return out;
}


/** @brief Converts the lower n/2 float lanes to double.

@param a input register; only lanes [0, n/2) are read.
@return register of n/2 doubles. */
template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
{
    const int half = n / 2;
    v_reg<double, (n/2)> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = static_cast<double>(a.s[lane]);
    }
    return out;
}


/** @brief Converts the upper n/2 float lanes to double.

@param a input register; only lanes [n/2, 2*(n/2)) are read.
@return register of n/2 doubles. */
template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
{
    const int half = n / 2;
    v_reg<double, (n/2)> out;
    for (int lane = 0; lane < half; ++lane)
    {
        out.s[lane] = static_cast<double>(a.s[lane + half]);
    }
    return out;
}


/** @brief Converts each int64 lane to double.

@param a input register.
@return register of per-lane converted values. */
template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
{
    v_reg<double, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = static_cast<double>(a.s[lane]);
    }
    return out;
}


/** @brief Gathers table entries into a simd128 register: lane k = tab[idx[k]].

@param tab lookup table.
@param idx array of indices; one index is read per lane.
@return register of gathered values. */
template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
    {
        out.s[lane] = tab[idx[lane]];
    }
    return out;
}

/** @brief Gathers pairs of consecutive table entries into a simd128 register.

Lanes 2k and 2k+1 receive tab[idx[k]] and tab[idx[k] + 1].

@param tab lookup table.
@param idx array of indices; one index is read per pair of lanes.
@return register of gathered values. */
template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
    {
        out.s[lane] = tab[idx[lane / 2] + lane % 2];
    }
    return out;
}

/** @brief Gathers groups of four consecutive table entries into a simd128 register.

Lanes 4k .. 4k+3 receive tab[idx[k]] .. tab[idx[k] + 3].

@param tab lookup table.
@param idx array of indices; one index is read per group of four lanes.
@return register of gathered values. */
template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
{
    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
    {
        out.s[lane] = tab[idx[lane / 4] + lane % 4];
    }
    return out;
}


/** @brief Gathers int table entries using a register of indices: lane k = tab[idx.s[k]].

@param tab lookup table.
@param idx register of indices.
@return register of gathered values. */
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
    v_reg<int, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = tab[idx.s[lane]];
    }
    return out;
}


/** @brief Gathers unsigned table entries using a register of indices: lane k = tab[idx.s[k]].

@param tab lookup table.
@param idx register of indices.
@return register of gathered values. */
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
    // The accumulator must have the declared return type: the previous v_reg<int, n>
    // local did not match v_reg<unsigned, n> and stored the values through int.
    v_reg<unsigned, n> c;
    for (int i = 0; i < n; i++)
        c.s[i] = tab[idx.s[i]];
    return c;
}


/** @brief Gathers float table entries using a register of indices: lane k = tab[idx.s[k]].

@param tab lookup table.
@param idx register of indices.
@return register of gathered values. */
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
    v_reg<float, n> out;
    for (int lane = 0; lane < n; ++lane)
    {
        out.s[lane] = tab[idx.s[lane]];
    }
    return out;
}


/** @brief Gathers double table entries using the lower n/2 lanes of an index register.

@param tab lookup table.
@param idx register of indices; only lanes [0, n/2) are read.
@return register of n/2 gathered values. */
template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
{
    const int count = n / 2;
    v_reg<double, n/2> out;
    for (int lane = 0; lane < count; ++lane)
    {
        out.s[lane] = tab[idx.s[lane]];
    }
    return out;
}


/** @brief Gathers pairs of adjacent float table entries into two registers.

For each lane k: x gets tab[idx.s[k]] and y gets tab[idx.s[k] + 1].

@param tab lookup table.
@param idx register of indices.
@param x receives the entries at the given indices.
@param y receives the entries immediately following them. */
template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
                                               v_reg<float, n>& x, v_reg<float, n>& y)
{
    for (int lane = 0; lane < n; ++lane)
    {
        const int pos = idx.s[lane];
        x.s[lane] = tab[pos];
        y.s[lane] = tab[pos + 1];
    }
}


/** @brief Gathers pairs of adjacent double table entries into two registers.

For each lane k in [0, n): x gets tab[idx.s[k]] and y gets tab[idx.s[k] + 1].
Only the first n lanes of the 2n-lane index register are read.

@param tab lookup table.
@param idx register of indices.
@param x receives the entries at the given indices.
@param y receives the entries immediately following them. */
template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
                                               v_reg<double, n>& x, v_reg<double, n>& y)
{
    for (int lane = 0; lane < n; ++lane)
    {
        const int pos = idx.s[lane];
        x.s[lane] = tab[pos];
        y.s[lane] = tab[pos + 1];
    }
}


/** @brief Swaps the two middle lanes inside every group of four lanes.

Each group (v0, v1, v2, v3) becomes (v0, v2, v1, v3). Only n/4 complete groups are
processed; lanes outside those groups are not written by this function.

@param vec input register.
@return permuted register. */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
    v_reg<_Tp, n> out;
    const int groups = n / 4;
    for (int g = 0; g < groups; ++g)
    {
        const int base = 4 * g;
        out.s[base    ] = vec.s[base    ];
        out.s[base + 1] = vec.s[base + 2];
        out.s[base + 2] = vec.s[base + 1];
        out.s[base + 3] = vec.s[base + 3];
    }
    return out;
}


/** @brief Interleaves the two halves of every group of eight lanes.

Each group (v0..v7) becomes (v0, v4, v1, v5, v2, v6, v3, v7). Only n/8 complete groups
are processed; lanes outside those groups are not written by this function.

@param vec input register.
@return permuted register. */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
    v_reg<_Tp, n> out;
    const int groups = n / 8;
    for (int g = 0; g < groups; ++g)
    {
        const int base = 8 * g;
        out.s[base    ] = vec.s[base    ];
        out.s[base + 1] = vec.s[base + 4];
        out.s[base + 2] = vec.s[base + 1];
        out.s[base + 3] = vec.s[base + 5];
        out.s[base + 4] = vec.s[base + 2];
        out.s[base + 5] = vec.s[base + 6];
        out.s[base + 6] = vec.s[base + 3];
        out.s[base + 7] = vec.s[base + 7];
    }
    return out;
}


/** @brief Packs the first three lanes of every group of four contiguously.

Group g of the input (lanes 4g .. 4g+2) is written to output lanes 3g .. 3g+2; the fourth
lane of each group is dropped. Output lanes from 3*(n/4) upwards are not written by this function.

@param vec input register.
@return packed register. */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
    v_reg<_Tp, n> out;
    const int groups = n / 4;
    for (int g = 0; g < groups; ++g)
    {
        const int dst = 3 * g;
        const int src = 4 * g;
        out.s[dst    ] = vec.s[src    ];
        out.s[dst + 1] = vec.s[src + 1];
        out.s[dst + 2] = vec.s[src + 2];
    }
    return out;
}


template<typename _Tp, int n>

inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,

                            const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,

                            v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,

                            v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )

{

    for (int i = 0; i < n / 4; i++)

    {

        b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];

        b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];

        b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];

        b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];

        b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];

        b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];

        b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];

        b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];

    }

}


#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \

inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }


OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)


#if CV_SIMD256

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)

#endif


#if CV_SIMD512

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)

OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)

#endif


#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \

inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }


OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)


#if CV_SIMD256

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)

#endif


#if CV_SIMD512

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)

OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)

OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)

#endif


// Helper macro: defines v_reinterpret_as_<suffix>(a), which forwards to
// a.reinterpret_as<_Tp, N>() with N = n0*sizeof(_Tp0)/sizeof(_Tp), i.e. the lane count
// that keeps the total register size unchanged.
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }

OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)


// Helper macro: defines v_shl<shift>(a) for element type _Tp as `a << shift`,
// using the v_reg shift operator with a compile-time shift amount.
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return a << shift; }

OPENCV_HAL_IMPL_C_SHIFTL(ushort)
OPENCV_HAL_IMPL_C_SHIFTL(short)
OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
OPENCV_HAL_IMPL_C_SHIFTL(int)
OPENCV_HAL_IMPL_C_SHIFTL(uint64)
OPENCV_HAL_IMPL_C_SHIFTL(int64)


// Helper macro: defines v_shr<shift>(a) for element type _Tp as `a >> shift`,
// using the v_reg shift operator with a compile-time shift amount.
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return a >> shift; }

OPENCV_HAL_IMPL_C_SHIFTR(ushort)
OPENCV_HAL_IMPL_C_SHIFTR(short)
OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
OPENCV_HAL_IMPL_C_SHIFTR(int)
OPENCV_HAL_IMPL_C_SHIFTR(uint64)
OPENCV_HAL_IMPL_C_SHIFTR(int64)


// Helper macro: defines v_rshr<shift>(a) for element type _Tp, a rounding right shift.
// Each lane is computed as (a + (1 << (shift - 1))) >> shift and cast back to _Tp,
// i.e. half of the divisor is added before shifting.
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    return c; \
}

OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
OPENCV_HAL_IMPL_C_RSHIFTR(short)
OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
OPENCV_HAL_IMPL_C_RSHIFTR(int)
OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
OPENCV_HAL_IMPL_C_RSHIFTR(int64)


// Helper macro: defines v_<pack_suffix>(a, b), which narrows two registers of _Tp into
// one register of _Tpn with twice the lane count. Lanes of a fill the lower half, lanes
// of b the upper half; each value is converted with cast<_Tpn> (saturate_cast or
// static_cast, as chosen per instantiation below).
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>(a.s[i]); \
        c.s[i+n] = cast<_Tpn>(b.s[i]); \
    } \
    return c; \
}

OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)


// Helper macro: defines v_rshr_<pack_suffix><shift>(a, b), which applies a rounding right
// shift ((x + (1 << (shift - 1))) >> shift) to every lane of a and b and then narrows the
// results into one register of _Tpn with twice the lane count (a -> lower half,
// b -> upper half), converting with cast<_Tpn>.
// Every continued line must directly follow its backslash; a blank line in between
// would end the #define early.
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
    return c; \
}

OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)


#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \

template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \

{ \

    for( int i = 0; i < n; i++ ) \

        ptr[i] = cast<_Tpn>(a.s[i]); \

}


OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)

OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)


#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \

template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \

{ \

    for( int i = 0; i < n; i++ ) \

        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \

}


OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)


/** @brief Internal helper: narrows two registers into a contiguous buffer of _Tpm values.

Lane k of @p a is cast to _Tpm and written to mptr[k]; lane k of @p b goes to mptr[k + n].

@param mptr destination buffer; 2*n elements are written.
@param a source of the first n values.
@param b source of the following n values. */
template<typename _Tpm, typename _Tp, int n>
inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    _Tpm* const upper = mptr + n;
    for (int lane = 0; lane < n; ++lane)
    {
        mptr[lane] = static_cast<_Tpm>(a.s[lane]);
        upper[lane] = static_cast<_Tpm>(b.s[lane]);
    }
}


/** @brief Narrows two ushort registers into one uchar register via _pack_b.

@param a fills result lanes [0, n).
@param b fills result lanes [n, 2n).
@return packed uchar register. */
template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
{
    v_reg<uchar, 2*n> packed;
    uchar* const out = packed.s;
    _pack_b(out, a, b);
    return packed;
}


/** @brief Narrows four unsigned registers into one uchar register via _pack_b.

The inputs occupy consecutive n-lane quarters of the result in the order a, b, c, d.

@param a first quarter.
@param b second quarter.
@param c third quarter.
@param d fourth quarter.
@return packed uchar register. */
template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
{
    v_reg<uchar, 4*n> packed;
    uchar* const out = packed.s;
    _pack_b(out, a, b);
    _pack_b(out + 2*n, c, d);
    return packed;
}


/** @brief Narrows eight uint64 registers into one uchar register via _pack_b.

The inputs occupy consecutive n-lane eighths of the result in the order a, b, ..., h.

@param a first eighth.
@param b second eighth.
@param c third eighth.
@param d fourth eighth.
@param e fifth eighth.
@param f sixth eighth.
@param g seventh eighth.
@param h last eighth.
@return packed uchar register. */
template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
{
    v_reg<uchar, 8*n> packed;
    uchar* const out = packed.s;
    _pack_b(out, a, b);
    _pack_b(out + 2*n, c, d);
    _pack_b(out + 4*n, e, f);
    _pack_b(out + 6*n, g, h);
    return packed;
}


/** @brief Per 4-lane group: linear combination of four registers weighted by the lanes of v.

For every group of four lanes and every column j in it:
res[j] = v[0]*a[j] + v[1]*b[j] + v[2]*c[j] + v[3]*d[j], where v[0..3] are the lanes of
@p v belonging to the same group. Only n/4 complete groups are processed.

@param v weights.
@param a first term.
@param b second term.
@param c third term.
@param d fourth term.
@return register of combined values. */
template<int n>
inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
                                const v_reg<float, n>& a, const v_reg<float, n>& b,
                                const v_reg<float, n>& c, const v_reg<float, n>& d)
{
    v_reg<float, n> res;
    for (int blk = 0; blk < n / 4; ++blk)
    {
        const int base = blk * 4;
        for (int col = 0; col < 4; ++col)
        {
            const int k = base + col;
            res.s[k] = v.s[base] * a.s[k] + v.s[base + 1] * b.s[k] + v.s[base + 2] * c.s[k] + v.s[base + 3] * d.s[k];
        }
    }
    return res;
}


/** @brief Per 4-lane group: linear combination of three registers plus an additive term.

For every group of four lanes and every column j in it:
res[j] = v[0]*a[j] + v[1]*b[j] + v[2]*c[j] + d[j], where v[0..2] are the first three lanes
of @p v in the same group (the fourth lane of v is not used). Only n/4 complete groups
are processed.

@param v weights.
@param a first term.
@param b second term.
@param c third term.
@param d additive term.
@return register of combined values. */
template<int n>
inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
                                   const v_reg<float, n>& a, const v_reg<float, n>& b,
                                   const v_reg<float, n>& c, const v_reg<float, n>& d)
{
    v_reg<float, n> res;
    for (int blk = 0; blk < n / 4; ++blk)
    {
        const int base = blk * 4;
        for (int col = 0; col < 4; ++col)
        {
            const int k = base + col;
            res.s[k] = v.s[base] * a.s[k] + v.s[base + 1] * b.s[k] + v.s[base + 2] * c.s[k] + d.s[k];
        }
    }
    return res;
}


/** @brief Dot product of int registers expanded to double.

Computes v_fma(lo(a), lo(b), hi(a) * hi(b)), where lo/hi are the lower/upper halves of
the inputs converted to double by v_cvt_f64 / v_cvt_f64_high.

@param a first input.
@param b second input.
@return register of n/2 doubles. */
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{
    return v_fma(v_cvt_f64(a),
                 v_cvt_f64(b),
                 v_cvt_f64_high(a) * v_cvt_f64_high(b));
}

/** @brief Dot product of int registers expanded to double, with an accumulator.

Computes v_fma(lo(a), lo(b), v_fma(hi(a), hi(b), c)), where lo/hi are the lower/upper
halves of the inputs converted to double by v_cvt_f64 / v_cvt_f64_high.

@param a first input.
@param b second input.
@param c value passed as the addend of the inner v_fma.
@return register of n/2 doubles. */
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                           const v_reg<double, n/2>& c)
{
    return v_fma(v_cvt_f64(a),
                 v_cvt_f64(b),
                 v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c));
}


/** @brief "Fast" variant of v_dotprod_expand; in this implementation it simply forwards to it.

@param a first input.
@param b second input.
@return result of v_dotprod_expand(a, b). */
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
{
    return v_dotprod_expand(a, b);
}

/** @brief "Fast" accumulating variant of v_dotprod_expand; in this implementation it simply forwards to it.

@param a first input.
@param b second input.
@param c accumulator.
@return result of v_dotprod_expand(a, b, c). */
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                                const v_reg<double, n/2>& c)
{
    return v_dotprod_expand(a, b, c);
}


/** @brief Loads float16_t values into a simd128 float register.

One source element is read per lane and converted to float by assignment.

@param ptr source array of float16_t values.
@return register of floats. */
inline v_reg<float, simd128_width / sizeof(float)>
v_load_expand(const float16_t* ptr)
{
    typedef v_reg<float, simd128_width / sizeof(float)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
        out.s[lane] = ptr[lane];
    return out;
}

#if CV_SIMD256

/** @brief Loads float16_t values into a simd256 float register.

One source element is read per lane and converted to float by assignment.

@param ptr source array of float16_t values.
@return register of floats. */
inline v_reg<float, simd256_width / sizeof(float)>
v256_load_expand(const float16_t* ptr)
{
    typedef v_reg<float, simd256_width / sizeof(float)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
        out.s[lane] = ptr[lane];
    return out;
}

#endif

#if CV_SIMD512

/** @brief Loads float16_t values into a simd512 float register.

One source element is read per lane and converted to float by assignment.

@param ptr source array of float16_t values.
@return register of floats. */
inline v_reg<float, simd512_width / sizeof(float)>
v512_load_expand(const float16_t* ptr)
{
    typedef v_reg<float, simd512_width / sizeof(float)> vec_type;
    vec_type out;
    for (int lane = 0; lane < out.nlanes; ++lane)
        out.s[lane] = ptr[lane];
    return out;
}

#endif


/** @brief Narrow float lanes to half precision and store them to memory.

Every lane of @p v is converted with an explicit float16_t construction and
written to the matching index of @p ptr.

@param ptr destination array; must have room for one float16_t per lane of @p v
@param v   source register of float values
*/
template<int n> inline void
v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
{
    const int lane_count = v.nlanes;
    for (int lane = 0; lane < lane_count; ++lane)
    {
        const float16_t narrowed = float16_t(v.s[lane]);
        ptr[lane] = narrowed;
    }
}


/** @brief Cleanup hook for the 128-bit intrinsics; intentionally a no-op here.

The body is empty in this implementation.
NOTE(review): presumably exists so callers can invoke it uniformly across backends - confirm.
*/
inline void v_cleanup()
{
}

#if CV_SIMD256

/** @brief Cleanup hook for the 256-bit intrinsics; intentionally a no-op here.

The body is empty in this implementation.
NOTE(review): presumably exists so callers can invoke it uniformly across backends - confirm.
*/
inline void v256_cleanup()
{
}

#endif

#if CV_SIMD512

/** @brief Cleanup hook for the 512-bit intrinsics; intentionally a no-op here.

The body is empty in this implementation.
NOTE(review): presumably exists so callers can invoke it uniformly across backends - confirm.
*/
inline void v512_cleanup()
{
}

#endif


#ifndef CV_DOXYGEN

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif

}


#if !defined(CV_DOXYGEN)

#undef CV_SIMD256

#undef CV_SIMD512

#endif


#endif

cv::max
CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element maximum of two arrays or an array and a scalar.

cv::sqrt
CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst)
Calculates a square root of array elements.

cv::exp
CV_EXPORTS_W void exp(InputArray src, OutputArray dst)
Calculates the exponent of every array element.

cv::min
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element minimum of two arrays or an array and a scalar.

cv::log
CV_EXPORTS_W void log(InputArray src, OutputArray dst)
Calculates the natural logarithm of every array element.

OPENCV_HAL_IMPL_C_INIT_VAL
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix)
Helper macro
Definition: intrin_cpp.hpp:2830

OPENCV_HAL_IMPL_C_RSHIFTR
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2932

OPENCV_HAL_IMPL_C_SHIFTR
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2915

OPENCV_HAL_IMPL_C_RSHR_PACK
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:2989

OPENCV_HAL_IMPL_ROTATE_SHIFT_OP
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix, opA, opB)
Bitwise shift left
Definition: intrin_cpp.hpp:1280

OPENCV_HAL_IMPL_CMP_OP
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op)
Helper macro
Definition: intrin_cpp.hpp:851

cv::OPENCV_HAL_IMPL_MATH_FUNC
OPENCV_HAL_IMPL_MATH_FUNC(v_abs,(typename V_TypeTraits< _Tp >::abs_type) std::abs, typename V_TypeTraits< _Tp >::abs_type) static const unsigned char popCountTable[]
Square root of elements

OPENCV_HAL_IMPL_C_SHIFTL
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2898

OPENCV_HAL_IMPL_C_INIT_ZERO
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix)
Helper macro
Definition: intrin_cpp.hpp:2784

OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:3054

OPENCV_HAL_IMPL_C_REINTERPRET
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix)
Helper macro
Definition: intrin_cpp.hpp:2876

OPENCV_HAL_IMPL_C_PACK
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:2954

OPENCV_HAL_IMPL_ARITHM_OP
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2)
Helper macro
Definition: intrin_cpp.hpp:913

OPENCV_HAL_IMPL_C_PACK_STORE
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:3024

OPENCV_HAL_IMPL_SHIFT_OP
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op)
Helper macro
Definition: intrin_cpp.hpp:1259

cv::v_check_any
bool v_check_any(const v_reg< _Tp, n > &a)
Check if any of packed values is less than zero
Definition: intrin_cpp.hpp:1436

cv::v_combine_high
v_reg< _Tp, n > v_combine_high(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Combine vector from last elements of two vectors
Definition: intrin_cpp.hpp:2307

cv::v_matmul
v_reg< float, n > v_matmul(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication
Definition: intrin_cpp.hpp:3196

cv::v_round
v_reg< int, n > v_round(const v_reg< float, n > &a)
Round elements
Definition: intrin_cpp.hpp:2427

cv::operator|
CV_INLINE v_reg< _Tp, n > operator|(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise OR

cv::v_int8x16
v_reg< schar, 16 > v_int8x16
Sixteen 8-bit signed integer values
Definition: intrin_cpp.hpp:490

cv::v_uint8x16
v_reg< uchar, 16 > v_uint8x16
Sixteen 8-bit unsigned integer values
Definition: intrin_cpp.hpp:488

cv::v_store_high
void v_store_high(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (higher half)
Definition: intrin_cpp.hpp:2236

cv::v_signmask
int v_signmask(const v_reg< _Tp, n > &a)
Get negative values mask
Definition: intrin_cpp.hpp:1395

cv::v_zip
void v_zip(const v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1)
Interleave two vectors
Definition: intrin_cpp.hpp:1557

cv::v_int64x2
v_reg< int64, 2 > v_int64x2
Two 64-bit signed integer values
Definition: intrin_cpp.hpp:506

cv::v_store
void v_store(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory
Definition: intrin_cpp.hpp:2193

cv::v_dotprod_expand
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements and expand
Definition: intrin_cpp.hpp:1145

cv::v_reduce_sad
V_TypeTraits< typename V_TypeTraits< _Tp >::abs_type >::sum_type v_reduce_sad(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Sum absolute differences of values
Definition: intrin_cpp.hpp:1377

cv::v_ceil
v_reg< int, n > v_ceil(const v_reg< float, n > &a)
Ceil elements
Definition: intrin_cpp.hpp:2465

cv::v_uint16x8
v_reg< ushort, 8 > v_uint16x8
Eight 16-bit unsigned integer values
Definition: intrin_cpp.hpp:492

cv::operator&
CV_INLINE v_reg< _Tp, n > operator&(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise AND

cv::v_store_low
void v_store_low(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (lower half)
Definition: intrin_cpp.hpp:2219

cv::v_floor
v_reg< int, n > v_floor(const v_reg< float, n > &a)
Floor elements
Definition: intrin_cpp.hpp:2452

cv::v_dotprod
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements
Definition: intrin_cpp.hpp:1080

cv::v_scan_forward
int v_scan_forward(const v_reg< _Tp, n > &a)
Get first negative lane index
Definition: intrin_cpp.hpp:1412

cv::v_reverse
v_reg< _Tp, n > v_reverse(const v_reg< _Tp, n > &a)
Vector reverse order
Definition: intrin_cpp.hpp:2346

cv::v_load_expand
v_reg< typename V_TypeTraits< _Tp >::w_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::w_type)> v_load_expand(const _Tp *ptr)
Load register contents from memory with double expand
Definition: intrin_cpp.hpp:1875

cv::v_int32x4
v_reg< int, 4 > v_int32x4
Four 32-bit signed integer values
Definition: intrin_cpp.hpp:498

cv::v_absdiff
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_absdiff(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Absolute difference
Definition: intrin_cpp.hpp:956

cv::v_reduce_sum
V_TypeTraits< _Tp >::sum_type v_reduce_sum(const v_reg< _Tp, n > &a)
Sum packed values
Definition: intrin_cpp.hpp:1338

cv::v_muladd
v_reg< _Tp, n > v_muladd(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
A synonym for v_fma
Definition: intrin_cpp.hpp:1060

cv::v_sqr_magnitude
v_reg< _Tp, n > v_sqr_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Square of the magnitude
Definition: intrin_cpp.hpp:1036

cv::v_trunc
v_reg< int, n > v_trunc(const v_reg< float, n > &a)
Truncate elements
Definition: intrin_cpp.hpp:2478

cv::v_uint32x4
v_reg< unsigned, 4 > v_uint32x4
Four 32-bit unsigned integer values
Definition: intrin_cpp.hpp:496

cv::operator/
CV_INLINE v_reg< _Tp, n > operator/(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Divide values

cv::v_invsqrt
v_reg< _Tp, n > v_invsqrt(const v_reg< _Tp, n > &a)
Inversed square root
Definition: intrin_cpp.hpp:1010

cv::v_magnitude
v_reg< _Tp, n > v_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Magnitude
Definition: intrin_cpp.hpp:1023

cv::v_dotprod_expand_fast
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements and expand
Definition: intrin_cpp.hpp:1188

cv::v_cvt_f64_high
CV_INLINE v_reg< double,(n/2)> v_cvt_f64_high(const v_reg< int, n > &a)
Convert to double high part of vector
Definition: intrin_cpp.hpp:2587

cv::v_load_low
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_low(const _Tp *ptr)
Load 64-bits of data to lower part (high part is undefined).
Definition: intrin_cpp.hpp:1702

cv::v_recombine
void v_recombine(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< _Tp, n > &low, v_reg< _Tp, n > &high)
Combine two vectors from lower and higher parts of two other vectors
Definition: intrin_cpp.hpp:2325

cv::v_reduce_sum4
v_reg< float, n > v_reduce_sum4(const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Sums all elements of each input vector, returns the vector of sums
Definition: intrin_cpp.hpp:1356

cv::v_mul_expand
void v_mul_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &c, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &d)
Multiply and expand
Definition: intrin_cpp.hpp:1219

cv::v_load_aligned
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_aligned(const _Tp *ptr)
Load register contents from memory (aligned)
Definition: intrin_cpp.hpp:1652

cv::v_broadcast_element
v_reg< _Tp, n > v_broadcast_element(const v_reg< _Tp, n > &a)
Broadcast i-th element of vector
Definition: intrin_cpp.hpp:2416

cv::v_select
v_reg< _Tp, n > v_select(const v_reg< _Tp, n > &mask, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Per-element select (blend operation)
Definition: intrin_cpp.hpp:1454

cv::v_load
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load(const _Tp *ptr)
Load register contents from memory
Definition: intrin_cpp.hpp:1587

cv::v_expand_low
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_low(const v_reg< _Tp, n > &a)
Expand lower values to the wider pack type
Definition: intrin_cpp.hpp:1499

cv::operator~
CV_INLINE v_reg< _Tp, n > operator~(const v_reg< _Tp, n > &a)
Bitwise NOT

cv::v_cvt_f64
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double
Definition: intrin_cpp.hpp:2576

cv::v_load_expand_q
v_reg< typename V_TypeTraits< _Tp >::q_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::q_type)> v_load_expand_q(const _Tp *ptr)
Load register contents from memory with quad expand
Definition: intrin_cpp.hpp:1964

cv::v_expand
void v_expand(const v_reg< _Tp, n > &a, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b0, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b1)
Expand values to the wider pack type
Definition: intrin_cpp.hpp:1477

cv::v_pack_b
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
For 16-bit boolean values
Definition: intrin_cpp.hpp:3114

cv::v_fma
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add
Definition: intrin_cpp.hpp:1049

cv::operator^
CV_INLINE v_reg< _Tp, n > operator^(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise XOR

cv::v_store_interleave
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition: intrin_cpp.hpp:2118

cv::v_transpose4x4
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix
Definition: intrin_cpp.hpp:2764

cv::v_absdiffs
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference
Definition: intrin_cpp.hpp:997

cv::v_uint64x2
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values
Definition: intrin_cpp.hpp:504

cv::v_expand_high
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_high(const v_reg< _Tp, n > &a)
Expand higher values to the wider pack type
Definition: intrin_cpp.hpp:1518

cv::v_dotprod_fast
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements
Definition: intrin_cpp.hpp:1119

cv::v_load_halves
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_halves(const _Tp *loptr, const _Tp *hiptr)
Load register contents from two memory blocks
Definition: intrin_cpp.hpp:1784

cv::v_mul_hi
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part
Definition: intrin_cpp.hpp:1236

cv::v_combine_low
v_reg< _Tp, n > v_combine_low(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Combine vector from first elements of two vectors
Definition: intrin_cpp.hpp:2285

cv::v_float32x4
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition: intrin_cpp.hpp:500

cv::v_cvt_f32
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float
Definition: intrin_cpp.hpp:2537

cv::v_check_all
bool v_check_all(const v_reg< _Tp, n > &a)
Check if all packed values are less than zero
Definition: intrin_cpp.hpp:1424

cv::v_matmuladd
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add
Definition: intrin_cpp.hpp:3226

cv::v_extract_n
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract
Definition: intrin_cpp.hpp:2400

cv::v_not_nan
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Per-element not-NaN check (returns a mask of lanes that are not NaN)
Definition: intrin_cpp.hpp:893

cv::v_popcount
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type
Definition: intrin_cpp.hpp:827

cv::v_store_aligned
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition: intrin_cpp.hpp:2254

cv::v_int16x8
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values
Definition: intrin_cpp.hpp:494

cv::v_float64x2
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition: intrin_cpp.hpp:502

cv::v_extract
v_reg< _Tp, n > v_extract(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Vector extract
Definition: intrin_cpp.hpp:2374

cv::v_load_deinterleave
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition: intrin_cpp.hpp:2046

cv::abs
softfloat abs(softfloat a)
Absolute value
Definition: softfloat.hpp:444

cvRound
CV_INLINE int cvRound(double value)
Rounds floating-point number to the nearest integer
Definition: fast_math.hpp:200

cvCeil
CV_INLINE int cvCeil(double value)
Rounds floating-point number to the nearest integer not smaller than the original.
Definition: fast_math.hpp:254

cv::saturate_cast
static _Tp saturate_cast(uchar v)
Template function for accurate conversion from one primitive type to another.
Definition: saturate.hpp:80

cvFloor
CV_INLINE int cvFloor(double value)
Rounds floating-point number to the nearest integer not larger than the original.
Definition: fast_math.hpp:234

cv::isAligned
static bool isAligned(const T &data)
Alignment check of passed values
Definition: utility.hpp:517

CV_Assert
#define CV_Assert(expr)
Checks a condition at runtime and throws exception if it fails
Definition: base.hpp:342

CV_DbgAssert
#define CV_DbgAssert(expr)
Definition: base.hpp:375

cv::cos
Quat< T > cos(const Quat< T > &q)

cv::sin
Quat< T > sin(const Quat< T > &q)

cv
"black box" representation of the file storage associated with a file on disk.
Definition: aruco.hpp:75

cv::V_TypeTraits
Definition: intrin.hpp:104

cv::v_reg
Definition: intrin_cpp.hpp:369

cv::v_reg::get0
_Tp get0() const
Access first value
Definition: intrin_cpp.hpp:436

cv::v_reg::v_reg
v_reg(const v_reg< _Tp, n > &r)
Copy constructor
Definition: intrin_cpp.hpp:421

cv::v_reg::v_reg
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7, _Tp s8, _Tp s9, _Tp s10, _Tp s11, _Tp s12, _Tp s13, _Tp s14, _Tp s15)
Constructor
Definition: intrin_cpp.hpp:404

cv::v_reg::v_reg
v_reg(const _Tp *ptr)
Constructor
Definition: intrin_cpp.hpp:379

cv::v_reg::v_reg
v_reg()
Default constructor
Definition: intrin_cpp.hpp:418

cv::v_reg::v_reg
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7)
Constructor
Definition: intrin_cpp.hpp:394

cv::v_reg::v_reg
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3)
Constructor
Definition: intrin_cpp.hpp:389

cv::v_reg::v_reg
v_reg(_Tp s0, _Tp s1)
Constructor
Definition: intrin_cpp.hpp:384