OpenCV 4.5.3(日本語機械翻訳)
intrin_cpp.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46 #define OPENCV_HAL_INTRIN_CPP_HPP
47
48 #include <limits>
49 #include <cstring>
50 #include <algorithm>
51 #include "opencv2/core/saturate.hpp"
52
54 #define CV_SIMD128_CPP 1
55 #if defined(CV_FORCE_SIMD128_CPP)
56 #define CV_SIMD128 1
57 #define CV_SIMD128_64F 1
58 #endif
59 #if defined(CV_DOXYGEN)
60 #define CV_SIMD128 1
61 #define CV_SIMD128_64F 1
62 #define CV_SIMD256 1
63 #define CV_SIMD256_64F 1
64 #define CV_SIMD512 1
65 #define CV_SIMD512_64F 1
66 #else
67 #define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
68 #define CV_SIMD512 0 // to avoid warnings during compilation
69 #endif
71
72 namespace cv
73{
74
75 #ifndef CV_DOXYGEN
76CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
77 #endif
78
368 template<typename _Tp, int n> struct v_reg
369{
371 typedef _Tp lane_type;
372 enum { nlanes = n };
373 // !@endcond
374
379 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
380
384 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
385
389 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
390
394 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
395 _Tp s4, _Tp s5, _Tp s6, _Tp s7)
396 {
397 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
398 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
399 }
400
404 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
405 _Tp s4, _Tp s5, _Tp s6, _Tp s7,
406 _Tp s8, _Tp s9, _Tp s10, _Tp s11,
407 _Tp s12, _Tp s13, _Tp s14, _Tp s15)
408 {
409 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
410 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
411 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
412 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
413 }
414
418 v_reg() {}
419
422 {
423 for( int i = 0; i < n; i++ )
424 s[i] = r.s[i];
425 }
436 _Tp get0() const { return s[0]; }
437
439 _Tp get(const int i) const { return s[i]; }
440 v_reg<_Tp, n> high() const
441 {
442 v_reg<_Tp, n> c;
443 int i;
444 for( i = 0; i < n/2; i++ )
445 {
446 c.s[i] = s[i+(n/2)];
447 c.s[i+(n/2)] = 0;
448 }
449 return c;
450 }
451
452 static v_reg<_Tp, n> zero()
453 {
454 v_reg<_Tp, n> c;
455 for( int i = 0; i < n; i++ )
456 c.s[i] = (_Tp)0;
457 return c;
458 }
459
460 static v_reg<_Tp, n> all(_Tp s)
461 {
462 v_reg<_Tp, n> c;
463 for( int i = 0; i < n; i++ )
464 c.s[i] = s;
465 return c;
466 }
467
468 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
469 {
470 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
471 v_reg<_Tp2, n2> c;
472 std::memcpy(&c.s[0], &s[0], bytes);
473 return c;
474 }
475
476 v_reg& operator=(const v_reg<_Tp, n> & r)
477 {
478 for( int i = 0; i < n; i++ )
479 s[i] = r.s[i];
480 return *this;
481 }
482
483 _Tp s[n];
485};
486
507
// Named aliases for the v_reg instantiations used as 256-bit registers
// (element type x lane count). Only compiled when CV_SIMD256 is non-zero.
508 #if CV_SIMD256
510 typedef v_reg<uchar, 32> v_uint8x32;
512 typedef v_reg<schar, 32> v_int8x32;
514 typedef v_reg<ushort, 16> v_uint16x16;
516 typedef v_reg<short, 16> v_int16x16;
518 typedef v_reg<unsigned, 8> v_uint32x8;
520 typedef v_reg<int, 8> v_int32x8;
522 typedef v_reg<float, 8> v_float32x8;
524 typedef v_reg<double, 4> v_float64x4;
526 typedef v_reg<uint64, 4> v_uint64x4;
528 typedef v_reg<int64, 4> v_int64x4;
529 #endif
530
// Named aliases for the v_reg instantiations used as 512-bit registers
// (element type x lane count). Only compiled when CV_SIMD512 is non-zero.
531 #if CV_SIMD512
533 typedef v_reg<uchar, 64> v_uint8x64;
535 typedef v_reg<schar, 64> v_int8x64;
537 typedef v_reg<ushort, 32> v_uint16x32;
539 typedef v_reg<short, 32> v_int16x32;
541 typedef v_reg<unsigned, 16> v_uint32x16;
543 typedef v_reg<int, 16> v_int32x16;
545 typedef v_reg<float, 16> v_float32x16;
547 typedef v_reg<double, 8> v_float64x8;
549 typedef v_reg<uint64, 8> v_uint64x8;
551 typedef v_reg<int64, 8> v_int64x8;
552 #endif
553
// Register widths in bytes (the load functions divide them by sizeof(element) to
// get the lane count). simdmax_width equals the widest width that is enabled.
554 enum {
555 simd128_width = 16,
556 #if CV_SIMD256
557 simd256_width = 32,
558 #endif
559 #if CV_SIMD512
560 simd512_width = 64,
561 simdmax_width = simd512_width
562 #elif CV_SIMD256
563 simdmax_width = simd256_width
564 #else
565 simdmax_width = simd128_width
566 #endif
567};
568
// Generic declarations of the element-wise v_reg operators. The definitions are
// generated per element type by the CV__HAL_INTRIN_IMPL_* macros in this file.

// Addition
572 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
573 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
574
// Subtraction
578 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
579 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
580
// Multiplication
584 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
585 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
586
// Division (definitions are only generated for float and double below)
590 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
591 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
592
593
// Bitwise AND
597 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
598 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
599
// Bitwise OR
603 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
604 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
605
// Bitwise XOR
609 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
610 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
611
// Bitwise NOT (definitions are only generated for integer types below)
615 template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
616
617
618 #ifndef CV_DOXYGEN
619
// Invokes macro_name(<type>, args...) once for each of the eight integer lane
// types: uchar, schar, ushort, short, unsigned, int, uint64, int64.
620 #define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
621 __CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
622 __CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
623 __CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
624 __CV_EXPAND(macro_name(short, __VA_ARGS__)) \
625 __CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
626 __CV_EXPAND(macro_name(int, __VA_ARGS__)) \
627 __CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
628 __CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
629
// Invokes macro_name(<type>, args...) once for float and once for double.
630 #define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
631 __CV_EXPAND(macro_name(float, __VA_ARGS__)) \
632 __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
633
// Invokes macro_name for every integer lane type and then for float and double.
634 #define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
635 CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
636 CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
637
/* Defines, for lane type _Tp, the element-wise operators `bin_op` and `bin_op=`
   on v_reg<_Tp, n>. Each lane result is passed through saturate_cast<_Tp>. */
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    } \
    return res; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        a.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    } \
    return a; \
}
654
// Instantiates the saturating binary operator for every lane type.
655 #define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
656
// +, - and * are generated for all lane types; / only for float and double.
657CV__HAL_INTRIN_IMPL_BIN_OP(+)
658CV__HAL_INTRIN_IMPL_BIN_OP(-)
659CV__HAL_INTRIN_IMPL_BIN_OP(*)
660CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
661
/* Defines, for lane type _Tp, the element-wise bitwise operators `bit_op` and
   `bit_op=` on v_reg<_Tp, n>. Lanes are converted with
   V_TypeTraits<_Tp>::reinterpret_int, combined, and converted back with
   reinterpret_from_int, so the operation acts on the integer representation. */
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = V_TypeTraits<_Tp>::reinterpret_from_int( \
            (itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) bit_op \
                    V_TypeTraits<_Tp>::reinterpret_int(b.s[lane]))); \
    } \
    return res; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        a.s[lane] = V_TypeTraits<_Tp>::reinterpret_from_int( \
            (itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) bit_op \
                    V_TypeTraits<_Tp>::reinterpret_int(b.s[lane]))); \
    } \
    return a; \
}
682
// Instantiates a bitwise operator for the integer lane types and, for now, also
// for float and double (see the TODO on the floating-point line).
683 #define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
684 CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
685 CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
686
687
// Generate operator&, operator| and operator^ (plus their compound forms).
688CV__HAL_INTRIN_IMPL_BIT_OP(&)
689CV__HAL_INTRIN_IMPL_BIT_OP(|)
690CV__HAL_INTRIN_IMPL_BIT_OP(^)
691
692 #define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
693 template<int n> CV_INLINE \
694 v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
695 { \
696 v_reg<_Tp, n> c; \
697 for( int i = 0; i < n; i++ ) \
698 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
699 return c; \
700 } \
701
702CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
703
704 #endif // !CV_DOXYGEN
705
706
709 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
710 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
711 { \
712 v_reg<_Tp2, n> c; \
713 for( int i = 0; i < n; i++ ) \
714 c.s[i] = cfunc(a.s[i]); \
715 return c; \
716 }
717
722
723
729
733 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
734 typename V_TypeTraits<_Tp>::abs_type)
735
/* Defines `func(a, b)` on two v_reg<_Tp, n>: each result lane is
   cfunc(a lane, b lane). */
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = cfunc(a.s[lane], b.s[lane]); \
    } \
    return res; \
}
746
/* Defines `func(a)` that folds all lanes of a v_reg<_Tp, n> into one scalar by
   repeatedly applying cfunc, starting from lane 0. */
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp acc = a.s[0]; \
    for( int lane = 1; lane < n; ++lane ) \
    { \
        acc = cfunc(acc, a.s[lane]); \
    } \
    return acc; \
}
757
// v_min: per-lane minimum of two registers (std::min).
768OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
769
770
// v_max: per-lane maximum of two registers (std::max).
780OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
781
782
// v_reduce_min: smallest lane of a register.
789OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
790
791
// v_reduce_max: largest lane of a register.
798OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
799
// Lookup table with 256 entries: entry i is the number of set bits in the byte
// value i. Indexed by v_popcount with the individual bytes of a register.
800 static const unsigned char popCountTable[] =
801{
802 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
803 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
804 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
805 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
806 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
807 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
808 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
809 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
810 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
811 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
812 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
813 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
814 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
815 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
816 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
817 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
818};
826 template<typename _Tp, int n>
828{
830 for (int i = 0; i < n*(int)sizeof(_Tp); i++)
831 b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
832 return b;
833}
834
835
837 template<typename _Tp, int n>
838 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
839 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
840{
841 for( int i = 0; i < n; i++ )
842 {
843 minval.s[i] = std::min(a.s[i], b.s[i]);
844 maxval.s[i] = std::max(a.s[i], b.s[i]);
845 }
846}
848
/* Defines the element-wise comparison operator `cmp_op` on v_reg<_Tp, n>.
   A lane where the comparison holds becomes reinterpret_from_int(-1) (all bits
   set in the integer representation); otherwise reinterpret_from_int(0). */
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> Traits; \
    typedef typename Traits::int_type itype; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        const int holds = (int)(a.s[lane] cmp_op b.s[lane]); \
        res.s[lane] = Traits::reinterpret_from_int((itype)-holds); \
    } \
    return res; \
}
861
866
867
871
876
881
886
891
892template<int n>
893 inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
894{
895 typedef typename V_TypeTraits<float>::int_type itype;
897 for (int i = 0; i < n; i++)
898 c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
899 return c;
900}
901 template<int n>
902 inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
903{
904 typedef typename V_TypeTraits<double>::int_type itype;
905 v_reg<double, n> c;
906 for (int i = 0; i < n; i++)
907 c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
908 return c;
909}
910
/* Defines `func(a, b)` on two v_reg<_Tp, n>, returning v_reg<_Tp2, n>: each
   result lane is cast_op applied to (a lane bin_op b lane). */
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp2, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = cast_op(a.s[lane] bin_op b.s[lane]); \
    } \
    return res; \
}
923
// v_add_wrap: per-lane a + b, converted back with a plain (_Tp) cast (no saturate_cast).
927 OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
928
929
// v_sub_wrap: per-lane a - b, converted back with a plain (_Tp) cast.
932 OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
933
934
// v_mul_wrap: per-lane a * b, converted back with a plain (_Tp) cast.
937 OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
938
939
/** @brief Scalar absolute difference: always subtracts the smaller value from
the larger one, so the result is non-negative. */
template<typename T> inline T _absdiff(T a, T b)
{
    if (a > b)
        return a - b;
    return b - a;
}
945
955 template<typename _Tp, int n>
957{
958 typedef typename V_TypeTraits<_Tp>::abs_type rtype;
960 const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
961 for( int i = 0; i < n; i++ )
962 {
963 rtype ua = a.s[i] ^ mask;
964 rtype ub = b.s[i] ^ mask;
965 c.s[i] = _absdiff(ua, ub);
966 }
967 return c;
968}
969
973 template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
974{
976 for( int i = 0; i < c.nlanes; i++ )
977 c.s[i] = _absdiff(a.s[i], b.s[i]);
978 return c;
979}
980
984 template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
985{
987 for( int i = 0; i < c.nlanes; i++ )
988 c.s[i] = _absdiff(a.s[i], b.s[i]);
989 return c;
990}
991
996 template<typename _Tp, int n>
998{
1000 for( int i = 0; i < n; i++)
1001 c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
1002 return c;
1003}
1004
1009 template<typename _Tp, int n>
1011{
1012 v_reg<_Tp, n> c;
1013 for( int i = 0; i < n; i++ )
1014 c.s[i] = 1.f/std::sqrt(a.s[i]);
1015 return c;
1016}
1017
1022 template<typename _Tp, int n>
1024{
1025 v_reg<_Tp, n> c;
1026 for( int i = 0; i < n; i++ )
1027 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
1028 return c;
1029}
1030
1035 template<typename _Tp, int n>
1037{
1038 v_reg<_Tp, n> c;
1039 for( int i = 0; i < n; i++ )
1040 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
1041 return c;
1042}
1043
1048 template<typename _Tp, int n>
1050 const v_reg<_Tp, n>& c)
1051{
1052 v_reg<_Tp, n> d;
1053 for( int i = 0; i < n; i++ )
1054 d.s[i] = a.s[i]*b.s[i] + c.s[i];
1055 return d;
1056}
1057
1059 template<typename _Tp, int n>
1061 const v_reg<_Tp, n>& c)
1062{
1063 return v_fma(a, b, c);
1064}
1065
1079 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1081{
1082 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1083 v_reg<w_type, n/2> c;
1084 for( int i = 0; i < (n/2); i++ )
1085 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
1086 return c;
1087}
1088
1100 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1102 const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1103{
1104 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1105 v_reg<w_type, n/2> s;
1106 for( int i = 0; i < (n/2); i++ )
1107 s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
1108 return s;
1109}
1110
1118 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1120{ return v_dotprod(a, b); }
1121
1126 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1128 const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1129{ return v_dotprod(a, b, c); }
1130
1144 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1146{
1147 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1148 v_reg<q_type, n/4> s;
1149 for( int i = 0; i < (n/4); i++ )
1150 s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1151 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
1152 return s;
1153}
1154
1166 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1168 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1169{
1170 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1171 v_reg<q_type, n/4> s;
1172 for( int i = 0; i < (n/4); i++ )
1173 s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1174 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
1175 return s;
1176}
1177
1187 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1189{ return v_dotprod_expand(a, b); }
1190
1195 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1197 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1198{ return v_dotprod_expand(a, b, c); }
1199
1219 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1220 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
1221 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
1222{
1223 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1224 for( int i = 0; i < (n/2); i++ )
1225 {
1226 c.s[i] = (w_type)a.s[i]*b.s[i];
1227 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
1228 }
1229}
1230
1236 template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1237{
1238 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1239 v_reg<_Tp, n> c;
1240 for (int i = 0; i < n; i++)
1241 c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
1242 return c;
1243}
1244
1246 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
1247 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
1248{
1249 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1250 for( int i = 0; i < (n/2); i++ )
1251 {
1252 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
1253 }
1254}
1256
/* Defines the operator `shift_op` taking a v_reg<_Tp, n> and an int amount:
   every lane is shifted by imm and cast back to _Tp. */
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane < n; ++lane ) \
    { \
        res.s[lane] = (_Tp)(a.s[lane] shift_op imm); \
    } \
    return res; \
}
1267
1272
1273
1277
/* Defines two v_rotate_<suffix><imm>() overloads that move whole lanes.
   - One input: result lane i takes a[i opA imm] when that index is inside
     [0, n), otherwise 0.
   - Two inputs: result lane i takes b[i opA imm opB n] when that index is inside
     [0, n); else a[i opA imm] when that one is; else 0. */
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> res; \
    for (int lane = 0; lane < n; ++lane) \
    { \
        const int src = lane opA imm; \
        const bool inside = (0 <= src && src < n); \
        res.s[lane] = inside ? a.s[src] : (_Tp)0; \
    } \
    return res; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for (int lane = 0; lane < n; ++lane) \
    { \
        const int fromA = lane opA imm; \
        const int fromB = lane opA imm opB n; \
        if (0 <= fromB && fromB < n) \
            res.s[lane] = b.s[fromB]; \
        else if (0 <= fromA && fromA < n) \
            res.s[lane] = a.s[fromA]; \
        else \
            res.s[lane] = 0; \
    } \
    return res; \
}
1320
1325
1326
1330
1338 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
1339{
1340 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
1341 for( int i = 1; i < n; i++ )
1342 c += a.s[i];
1343 return c;
1344}
1345
1356 template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
1357 const v_reg<float, n>& c, const v_reg<float, n>& d)
1358{
1360 for(int i = 0; i < (n/4); i++)
1361 {
1362 r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
1363 r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
1364 r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
1365 r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
1366 }
1367 return r;
1368}
1369
1377 template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1378{
1379 typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
1380 for (int i = 1; i < n; i++)
1381 c += _absdiff(a.s[i], b.s[i]);
1382 return c;
1383}
1384
1395 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1396{
1397 int mask = 0;
1398 for( int i = 0; i < n; i++ )
1399 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1400 return mask;
1401}
1402
1412 template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
1413{
1414 for (int i = 0; i < n; i++)
1416 return i;
1417 return 0;
1418}
1419
1424 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1425{
1426 for( int i = 0; i < n; i++ )
1427 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1428 return false;
1429 return true;
1430}
1431
1436 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1437{
1438 for( int i = 0; i < n; i++ )
1439 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1440 return true;
1441 return false;
1442}
1443
1454 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
1455 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1456{
1457 typedef V_TypeTraits<_Tp> Traits;
1458 typedef typename Traits::int_type int_type;
1459 v_reg<_Tp, n> c;
1460 for( int i = 0; i < n; i++ )
1461 {
1462 int_type m = Traits::reinterpret_int(mask.s[i]);
1463 CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
1464 c.s[i] = m ? a.s[i] : b.s[i];
1465 }
1466 return c;
1467}
1468
1477 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1478 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1479 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1480{
1481 for( int i = 0; i < (n/2); i++ )
1482 {
1483 b0.s[i] = a.s[i];
1484 b1.s[i] = a.s[i+(n/2)];
1485 }
1486}
1487
1497 template<typename _Tp, int n>
1498 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1500{
1502 for( int i = 0; i < (n/2); i++ )
1503 b.s[i] = a.s[i];
1504 return b;
1505}
1506
1516 template<typename _Tp, int n>
1517 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1519{
1521 for( int i = 0; i < (n/2); i++ )
1522 b.s[i] = a.s[i+(n/2)];
1523 return b;
1524}
1525
1527 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
1528 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1529{
1530 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1531 for( int i = 0; i < n; i++ )
1532 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
1533 return c;
1534}
1535
1536 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
1537 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1538{
1539 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1540 for( int i = 0; i < n; i++ )
1541 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1542 return c;
1543}
1545
1557 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1558 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
1559{
1560 int i;
1561 for( i = 0; i < n/2; i++ )
1562 {
1563 b0.s[i*2] = a0.s[i];
1564 b0.s[i*2+1] = a1.s[i];
1565 }
1566 for( ; i < n; i++ )
1567 {
1568 b1.s[i*2-n] = a0.s[i];
1569 b1.s[i*2-n+1] = a1.s[i];
1570 }
1571}
1572
1586 template<typename _Tp>
1587 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
1588{
1589 #if CV_STRONG_ALIGNMENT
1590 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1591 #endif
1592 return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
1593}
1594
1595 #if CV_SIMD256
1610 template<typename _Tp>
1611 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
1612{
1613 #if CV_STRONG_ALIGNMENT
1614 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1615 #endif
1616 return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1617}
1618 #endif
1619
1620 #if CV_SIMD512
1635 template<typename _Tp>
1636 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
1637{
1638 #if CV_STRONG_ALIGNMENT
1639 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1640 #endif
1641 return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1642}
1643 #endif
1644
1651 template<typename _Tp>
1652 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
1653{
1654 CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
1655 return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
1656}
1657
1658 #if CV_SIMD256
1666 template<typename _Tp>
1667 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
1668{
1669 CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
1670 return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1671}
1672 #endif
1673
1674 #if CV_SIMD512
1682 template<typename _Tp>
1683 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
1684{
1685 CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
1686 return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1687}
1688 #endif
1689
1701 template<typename _Tp>
1702 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
1703{
1704 #if CV_STRONG_ALIGNMENT
1705 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1706 #endif
1707 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1708 for( int i = 0; i < c.nlanes/2; i++ )
1709 {
1710 c.s[i] = ptr[i];
1711 }
1712 return c;
1713}
1714
1715 #if CV_SIMD256
1728 template<typename _Tp>
1729 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
1730{
1731 #if CV_STRONG_ALIGNMENT
1732 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1733 #endif
1734 v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1735 for (int i = 0; i < c.nlanes / 2; i++)
1736 {
1737 c.s[i] = ptr[i];
1738 }
1739 return c;
1740}
1741 #endif
1742
1743 #if CV_SIMD512
1756 template<typename _Tp>
1757 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
1758{
1759 #if CV_STRONG_ALIGNMENT
1760 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1761 #endif
1762 v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1763 for (int i = 0; i < c.nlanes / 2; i++)
1764 {
1765 c.s[i] = ptr[i];
1766 }
1767 return c;
1768}
1769 #endif
1770
1783 template<typename _Tp>
1784 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1785{
1786 #if CV_STRONG_ALIGNMENT
1787 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1788 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1789 #endif
1790 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1791 for( int i = 0; i < c.nlanes/2; i++ )
1792 {
1793 c.s[i] = loptr[i];
1794 c.s[i+c.nlanes/2] = hiptr[i];
1795 }
1796 return c;
1797}
1798
1799 #if CV_SIMD256
1813 template<typename _Tp>
1814 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
1815{
1816 #if CV_STRONG_ALIGNMENT
1817 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1818 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1819 #endif
1820 v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1821 for (int i = 0; i < c.nlanes / 2; i++)
1822 {
1823 c.s[i] = loptr[i];
1824 c.s[i + c.nlanes / 2] = hiptr[i];
1825 }
1826 return c;
1827}
1828 #endif
1829
1830 #if CV_SIMD512
1844 template<typename _Tp>
1845 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
1846{
1847 #if CV_STRONG_ALIGNMENT
1848 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1849 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1850 #endif
1851 v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1852 for (int i = 0; i < c.nlanes / 2; i++)
1853 {
1854 c.s[i] = loptr[i];
1855 c.s[i + c.nlanes / 2] = hiptr[i];
1856 }
1857 return c;
1858}
1859 #endif
1860
1873 template<typename _Tp>
1874 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
1875 v_load_expand(const _Tp* ptr)
1876{
1877 #if CV_STRONG_ALIGNMENT
1878 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1879 #endif
1880 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1881 v_reg<w_type, simd128_width / sizeof(w_type)> c;
1882 for( int i = 0; i < c.nlanes; i++ )
1883 {
1884 c.s[i] = ptr[i];
1885 }
1886 return c;
1887}
1888
1889 #if CV_SIMD256
1903 template<typename _Tp>
1904 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
1905v256_load_expand(const _Tp* ptr)
1906{
1907 #if CV_STRONG_ALIGNMENT
1908 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1909 #endif
1910 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1911 v_reg<w_type, simd256_width / sizeof(w_type)> c;
1912 for (int i = 0; i < c.nlanes; i++)
1913 {
1914 c.s[i] = ptr[i];
1915 }
1916 return c;
1917}
1918 #endif
1919
1920 #if CV_SIMD512
1934 template<typename _Tp>
1935 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
1936v512_load_expand(const _Tp* ptr)
1937{
1938 #if CV_STRONG_ALIGNMENT
1939 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1940 #endif
1941 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1942 v_reg<w_type, simd512_width / sizeof(w_type)> c;
1943 for (int i = 0; i < c.nlanes; i++)
1944 {
1945 c.s[i] = ptr[i];
1946 }
1947 return c;
1948}
1949 #endif
1950
1962 template<typename _Tp>
1963 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
1964 v_load_expand_q(const _Tp* ptr)
1965{
1966 #if CV_STRONG_ALIGNMENT
1967 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1968 #endif
1969 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1970 v_reg<q_type, simd128_width / sizeof(q_type)> c;
1971 for( int i = 0; i < c.nlanes; i++ )
1972 {
1973 c.s[i] = ptr[i];
1974 }
1975 return c;
1976}
1977
1978 #if CV_SIMD256
1991 template<typename _Tp>
1992 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
1993v256_load_expand_q(const _Tp* ptr)
1994{
1995 #if CV_STRONG_ALIGNMENT
1996 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1997 #endif
1998 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1999 v_reg<q_type, simd256_width / sizeof(q_type)> c;
2000 for (int i = 0; i < c.nlanes; i++)
2001 {
2002 c.s[i] = ptr[i];
2003 }
2004 return c;
2005}
2006 #endif
2007
2008 #if CV_SIMD512
2021 template<typename _Tp>
2022 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
2023v512_load_expand_q(const _Tp* ptr)
2024{
2025 #if CV_STRONG_ALIGNMENT
2026 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2027 #endif
2028 typedef typename V_TypeTraits<_Tp>::q_type q_type;
2029 v_reg<q_type, simd512_width / sizeof(q_type)> c;
2030 for (int i = 0; i < c.nlanes; i++)
2031 {
2032 c.s[i] = ptr[i];
2033 }
2034 return c;
2035}
2036 #endif
2037
2046 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2047 v_reg<_Tp, n>& b)
2048{
2049 #if CV_STRONG_ALIGNMENT
2050 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2051 #endif
2052 int i, i2;
2053 for( i = i2 = 0; i < n; i++, i2 += 2 )
2054 {
2055 a.s[i] = ptr[i2];
2056 b.s[i] = ptr[i2+1];
2057 }
2058}
2059
2068 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2070{
2071 #if CV_STRONG_ALIGNMENT
2072 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2073 #endif
2074 int i, i3;
2075 for( i = i3 = 0; i < n; i++, i3 += 3 )
2076 {
2077 a.s[i] = ptr[i3];
2078 b.s[i] = ptr[i3+1];
2079 c.s[i] = ptr[i3+2];
2080 }
2081}
2082
2091 template<typename _Tp, int n>
2092 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2094 v_reg<_Tp, n>& d)
2095{
2096 #if CV_STRONG_ALIGNMENT
2097 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2098 #endif
2099 int i, i4;
2100 for( i = i4 = 0; i < n; i++, i4 += 4 )
2101 {
2102 a.s[i] = ptr[i4];
2103 b.s[i] = ptr[i4+1];
2104 c.s[i] = ptr[i4+2];
2105 d.s[i] = ptr[i4+3];
2106 }
2107}
2108
2117 template<typename _Tp, int n>
2118 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2119 const v_reg<_Tp, n>& b,
2120 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2121{
2122 #if CV_STRONG_ALIGNMENT
2123 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2124 #endif
2125 int i, i2;
2126 for( i = i2 = 0; i < n; i++, i2 += 2 )
2127 {
2128 ptr[i2] = a.s[i];
2129 ptr[i2+1] = b.s[i];
2130 }
2131}
2132
2141 template<typename _Tp, int n>
2142 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2143 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2144 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2145{
2146 #if CV_STRONG_ALIGNMENT
2147 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2148 #endif
2149 int i, i3;
2150 for( i = i3 = 0; i < n; i++, i3 += 3 )
2151 {
2152 ptr[i3] = a.s[i];
2153 ptr[i3+1] = b.s[i];
2154 ptr[i3+2] = c.s[i];
2155 }
2156}
2157
2166 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2167 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2168 const v_reg<_Tp, n>& d,
2169 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2170{
2171 #if CV_STRONG_ALIGNMENT
2172 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2173 #endif
2174 int i, i4;
2175 for( i = i4 = 0; i < n; i++, i4 += 4 )
2176 {
2177 ptr[i4] = a.s[i];
2178 ptr[i4+1] = b.s[i];
2179 ptr[i4+2] = c.s[i];
2180 ptr[i4+3] = d.s[i];
2181 }
2182}
2183
2192 template<typename _Tp, int n>
2193 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
2194{
2195 #if CV_STRONG_ALIGNMENT
2196 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2197 #endif
2198 for( int i = 0; i < n; i++ )
2199 ptr[i] = a.s[i];
2200}
2201
2202 template<typename _Tp, int n>
2203 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2204{
2205 #if CV_STRONG_ALIGNMENT
2206 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2207 #endif
2208 v_store(ptr, a);
2209}
2210
2218 template<typename _Tp, int n>
2219 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
2220{
2221 #if CV_STRONG_ALIGNMENT
2222 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2223 #endif
2224 for( int i = 0; i < (n/2); i++ )
2225 ptr[i] = a.s[i];
2226}
2227
2235 template<typename _Tp, int n>
2236 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
2237{
2238 #if CV_STRONG_ALIGNMENT
2239 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2240 #endif
2241 for( int i = 0; i < (n/2); i++ )
2242 ptr[i] = a.s[i+(n/2)];
2243}
2244
2253 template<typename _Tp, int n>
2254 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
2255{
2256 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2257 v_store(ptr, a);
2258}
2259
2260 template<typename _Tp, int n>
2261 inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
2262{
2263 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2264 v_store(ptr, a);
2265}
2266
2267 template<typename _Tp, int n>
2268 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2269{
2270 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2271 v_store(ptr, a);
2272}
2273
2284 template<typename _Tp, int n>
2286{
2287 v_reg<_Tp, n> c;
2288 for( int i = 0; i < (n/2); i++ )
2289 {
2290 c.s[i] = a.s[i];
2291 c.s[i+(n/2)] = b.s[i];
2292 }
2293 return c;
2294}
2295
2306 template<typename _Tp, int n>
2308{
2309 v_reg<_Tp, n> c;
2310 for( int i = 0; i < (n/2); i++ )
2311 {
2312 c.s[i] = a.s[i+(n/2)];
2313 c.s[i+(n/2)] = b.s[i+(n/2)];
2314 }
2315 return c;
2316}
2317
2324 template<typename _Tp, int n>
2325 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
2326 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
2327{
2328 for( int i = 0; i < (n/2); i++ )
2329 {
2330 low.s[i] = a.s[i];
2331 low.s[i+(n/2)] = b.s[i];
2332 high.s[i] = a.s[i+(n/2)];
2333 high.s[i+(n/2)] = b.s[i+(n/2)];
2334 }
2335}
2336
2345 template<typename _Tp, int n>
2347{
2348 v_reg<_Tp, n> c;
2349 for( int i = 0; i < n; i++ )
2350 c.s[i] = a.s[n-i-1];
2351 return c;
2352}
2353
2373 template<int s, typename _Tp, int n>
2375{
2376 v_reg<_Tp, n> r;
2377 const int shift = n - s;
2378 int i = 0;
2379 for (; i < shift; ++i)
2380 r.s[i] = a.s[i+s];
2381 for (; i < n; ++i)
2382 r.s[i] = b.s[i-shift];
2383 return r;
2384}
2385
2399 template<int s, typename _Tp, int n>
2400 inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
2401{
2402 CV_DbgAssert(s >= 0 && s < n);
2403 return v.s[s];
2404}
2405
2415 template<int i, typename _Tp, int n>
2417{
2418 CV_DbgAssert(i >= 0 && i < n);
2419 return v_reg<_Tp, n>::all(a.s[i]);
2420}
2421
2427 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
2428{
2429 v_reg<int, n> c;
2430 for( int i = 0; i < n; i++ )
2431 c.s[i] = cvRound(a.s[i]);
2432 return c;
2433}
2434
2436 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
2437{
2439 for( int i = 0; i < n; i++ )
2440 {
2441 c.s[i] = cvRound(a.s[i]);
2442 c.s[i+n] = cvRound(b.s[i]);
2443 }
2444 return c;
2445}
2446
2452 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
2453{
2454 v_reg<int, n> c;
2455 for( int i = 0; i < n; i++ )
2456 c.s[i] = cvFloor(a.s[i]);
2457 return c;
2458}
2459
2465 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
2466{
2467 v_reg<int, n> c;
2468 for( int i = 0; i < n; i++ )
2469 c.s[i] = cvCeil(a.s[i]);
2470 return c;
2471}
2472
2478 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
2479{
2480 v_reg<int, n> c;
2481 for( int i = 0; i < n; i++ )
2482 c.s[i] = (int)(a.s[i]);
2483 return c;
2484}
2485
2487 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
2488{
2490 for( int i = 0; i < n; i++ )
2491 {
2492 c.s[i] = cvRound(a.s[i]);
2493 c.s[i+n] = 0;
2494 }
2495 return c;
2496}
2497
2499 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
2500{
2502 for( int i = 0; i < n; i++ )
2503 {
2504 c.s[i] = cvFloor(a.s[i]);
2505 c.s[i+n] = 0;
2506 }
2507 return c;
2508}
2509
2511 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
2512{
2514 for( int i = 0; i < n; i++ )
2515 {
2516 c.s[i] = cvCeil(a.s[i]);
2517 c.s[i+n] = 0;
2518 }
2519 return c;
2520}
2521
2523 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
2524{
2526 for( int i = 0; i < n; i++ )
2527 {
2528 c.s[i] = (int)(a.s[i]);
2529 c.s[i+n] = 0;
2530 }
2531 return c;
2532}
2533
2537 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
2538{
2540 for( int i = 0; i < n; i++ )
2541 c.s[i] = (float)a.s[i];
2542 return c;
2543}
2544
2548 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
2549{
2551 for( int i = 0; i < n; i++ )
2552 {
2553 c.s[i] = (float)a.s[i];
2554 c.s[i+n] = 0;
2555 }
2556 return c;
2557}
2558
2562 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
2563{
2565 for( int i = 0; i < n; i++ )
2566 {
2567 c.s[i] = (float)a.s[i];
2568 c.s[i+n] = (float)b.s[i];
2569 }
2570 return c;
2571}
2572
2576 template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
2577{
2578 v_reg<double, (n/2)> c;
2579 for( int i = 0; i < (n/2); i++ )
2580 c.s[i] = (double)a.s[i];
2581 return c;
2582}
2583
2587 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
2588{
2589 v_reg<double, (n/2)> c;
2590 for( int i = 0; i < (n/2); i++ )
2591 c.s[i] = (double)a.s[i + (n/2)];
2592 return c;
2593}
2594
2598 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
2599{
2600 v_reg<double, (n/2)> c;
2601 for( int i = 0; i < (n/2); i++ )
2602 c.s[i] = (double)a.s[i];
2603 return c;
2604}
2605
2609 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
2610{
2611 v_reg<double, (n/2)> c;
2612 for( int i = 0; i < (n/2); i++ )
2613 c.s[i] = (double)a.s[i + (n/2)];
2614 return c;
2615}
2616
2620 template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
2621{
2623 for( int i = 0; i < n; i++ )
2624 c.s[i] = (double)a.s[i];
2625 return c;
2626}
2627
2628
2629 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
2630{
2631 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2632 for (int i = 0; i < c.nlanes; i++)
2633 c.s[i] = tab[idx[i]];
2634 return c;
2635}
2636 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
2637{
2638 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2639 for (int i = 0; i < c.nlanes; i++)
2640 c.s[i] = tab[idx[i / 2] + i % 2];
2641 return c;
2642}
2643 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
2644{
2645 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2646 for (int i = 0; i < c.nlanes; i++)
2647 c.s[i] = tab[idx[i / 4] + i % 4];
2648 return c;
2649}
2650
2651 template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
2652{
2653 v_reg<int, n> c;
2654 for( int i = 0; i < n; i++ )
2655 c.s[i] = tab[idx.s[i]];
2656 return c;
2657}
2658
2659 template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
2660{
2661 v_reg<int, n> c;
2662 for (int i = 0; i < n; i++)
2663 c.s[i] = tab[idx.s[i]];
2664 return c;
2665}
2666
2667 template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
2668{
2669 v_reg<float, n> c;
2670 for( int i = 0; i < n; i++ )
2671 c.s[i] = tab[idx.s[i]];
2672 return c;
2673}
2674
2675 template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
2676{
2677 v_reg<double, n/2> c;
2678 for( int i = 0; i < n/2; i++ )
2679 c.s[i] = tab[idx.s[i]];
2680 return c;
2681}
2682
2683
2684 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
2685 v_reg<float, n>& x, v_reg<float, n>& y)
2686{
2687 for( int i = 0; i < n; i++ )
2688 {
2689 int j = idx.s[i];
2690 x.s[i] = tab[j];
2691 y.s[i] = tab[j+1];
2692 }
2693}
2694
2695 template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
2696 v_reg<double, n>& x, v_reg<double, n>& y)
2697{
2698 for( int i = 0; i < n; i++ )
2699 {
2700 int j = idx.s[i];
2701 x.s[i] = tab[j];
2702 y.s[i] = tab[j+1];
2703 }
2704}
2705
2706 template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
2707{
2708 v_reg<_Tp, n> c;
2709 for (int i = 0; i < n/4; i++)
2710 {
2711 c.s[4*i ] = vec.s[4*i ];
2712 c.s[4*i+1] = vec.s[4*i+2];
2713 c.s[4*i+2] = vec.s[4*i+1];
2714 c.s[4*i+3] = vec.s[4*i+3];
2715 }
2716 return c;
2717}
2718
2719 template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
2720{
2721 v_reg<_Tp, n> c;
2722 for (int i = 0; i < n/8; i++)
2723 {
2724 c.s[8*i ] = vec.s[8*i ];
2725 c.s[8*i+1] = vec.s[8*i+4];
2726 c.s[8*i+2] = vec.s[8*i+1];
2727 c.s[8*i+3] = vec.s[8*i+5];
2728 c.s[8*i+4] = vec.s[8*i+2];
2729 c.s[8*i+5] = vec.s[8*i+6];
2730 c.s[8*i+6] = vec.s[8*i+3];
2731 c.s[8*i+7] = vec.s[8*i+7];
2732 }
2733 return c;
2734}
2735
2736 template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
2737{
2738 v_reg<_Tp, n> c;
2739 for (int i = 0; i < n/4; i++)
2740 {
2741 c.s[3*i ] = vec.s[4*i ];
2742 c.s[3*i+1] = vec.s[4*i+1];
2743 c.s[3*i+2] = vec.s[4*i+2];
2744 }
2745 return c;
2746}
2747
2763 template<typename _Tp, int n>
2764 inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
2765 const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
2767 v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
2768{
2769 for (int i = 0; i < n / 4; i++)
2770 {
2771 b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
2772 b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
2773 b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
2774 b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
2775 b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
2776 b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
2777 b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
2778 b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
2779 }
2780}
2781
//! @brief Helper macro: defines prefix##_setzero_##suffix(), which returns _Tpvec::zero().
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
inline _Tpvec prefix##_setzero_##suffix() \
{ \
    return _Tpvec::zero(); \
}
2786
2800
2801 #if CV_SIMD256
2802 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
2803 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
2804 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
2805 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
2806 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
2807 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
2808 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
2809 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
2810 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
2811 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
2812 #endif
2813
2814 #if CV_SIMD512
2815 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
2816 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
2817 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
2818 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
2819 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
2820 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
2821 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
2822 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
2823 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
2824 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
2825 #endif
2827
//! @brief Helper macro: defines prefix##_setall_##suffix(val), which returns _Tpvec::all(val).
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
inline _Tpvec prefix##_setall_##suffix(_Tp val) \
{ \
    return _Tpvec::all(val); \
}
2832
2838 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
2840 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
2844 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
2846
2847 #if CV_SIMD256
2848 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
2849 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
2850 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
2851 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
2852 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
2853 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
2854 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
2855 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
2856 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
2857 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
2858 #endif
2859
2860 #if CV_SIMD512
2861 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
2862 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
2863 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
2864 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
2865 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
2866 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
2867 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
2868 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
2869 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
2870 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
2871 #endif
2873
//! @brief Helper macro: defines v_reinterpret_as_##suffix(), which forwards to
//! v_reg::reinterpret_as with a lane count chosen so the total byte size is unchanged.
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
template<typename _Tp0, int n0> \
inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ \
    return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); \
}
2880
2888 OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
2895
//! @brief Helper macro: defines v_shl<shift>(a) for element type _Tp as `a << shift`.
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> \
inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ \
    return a << shift; \
}
2901
2907 OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
2912
//! @brief Helper macro: defines v_shr<shift>(a) for element type _Tp as `a >> shift`.
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> \
inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ \
    return a >> shift; \
}
2918
2924 OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
2929
//! @brief Helper macro: defines v_rshr<shift>(a), a right shift that first adds
//! 1 << (shift - 1) to every lane and casts the shifted value back to _Tp.
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
template<int shift, int n> \
inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> shifted; \
    for (int lane = 0; lane < n; ++lane) \
    { \
        shifted.s[lane] = (_Tp)((a.s[lane] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
    return shifted; \
}
2940
2951
//! @brief Helper macro: defines v_##pack_suffix(a, b), which converts the lanes of two
//! _Tp vectors to _Tpn with `cast` and returns them as one vector (a first, then b).
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int n> \
inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> packed; \
    for (int lane = 0; lane < n; ++lane) \
    { \
        packed.s[lane] = cast<_Tpn>(a.s[lane]); \
        packed.s[n + lane] = cast<_Tpn>(b.s[lane]); \
    } \
    return packed; \
}
2965
2977 OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
2978 OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
2979 OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
2980 OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
2981 OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
2982 OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
2983 OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
2984 OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
2986
//! @brief Helper macro: defines v_rshr_##pack_suffix<shift>(a, b), which adds
//! 1 << (shift - 1) to every lane, shifts right by `shift`, converts to _Tpn with `cast`
//! and returns both inputs as one vector (a first, then b).
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> \
inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> packed; \
    for (int lane = 0; lane < n; ++lane) \
    { \
        packed.s[lane] = cast<_Tpn>((a.s[lane] + ((_Tp)1 << (shift - 1))) >> shift); \
        packed.s[n + lane] = cast<_Tpn>((b.s[lane] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
    return packed; \
}
3000
3012 OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
3013 OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
3014 OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
3016 OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
3017 OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
3018 OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
3019 OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
3021
//! @brief Helper macro: defines v_##pack_suffix##_store(ptr, a), which converts every lane
//! of `a` to _Tpn with `cast` and writes the n results to ptr[0] .. ptr[n-1].
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int n> \
inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for (int lane = 0; lane < n; ++lane) \
    { \
        ptr[lane] = cast<_Tpn>(a.s[lane]); \
    } \
}
3030
3042 OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
3043 OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
3044 OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
3046 OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
3047 OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
3048 OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
3049 OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
3051
//! @brief Helper macro: defines v_rshr_##pack_suffix##_store<shift>(ptr, a), which adds
//! 1 << (shift - 1) to every lane, shifts right by `shift`, converts to _Tpn with `cast`
//! and writes the n results to ptr[0] .. ptr[n-1].
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> \
inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for (int lane = 0; lane < n; ++lane) \
    { \
        ptr[lane] = cast<_Tpn>((a.s[lane] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
}
3060
3074 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
3076 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
3077 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
3081
3083 template<typename _Tpm, typename _Tp, int n>
3084 inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
3085{
3086 for (int i = 0; i < n; ++i)
3087 {
3088 mptr[i] = (_Tpm)a.s[i];
3089 mptr[i + n] = (_Tpm)b.s[i];
3090 }
3091}
3093
3099
3102
3114 template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
3115{
3116 v_reg<uchar, 2*n> mask;
3117 _pack_b(mask.s, a, b);
3118 return mask;
3119}
3120
3137 template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
3138 const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
3139{
3140 v_reg<uchar, 4*n> mask;
3141 _pack_b(mask.s, a, b);
3142 _pack_b(mask.s + 2*n, c, d);
3143 return mask;
3144}
3145
3166 template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
3167 const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
3168 const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
3169 const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
3170{
3171 v_reg<uchar, 8*n> mask;
3172 _pack_b(mask.s, a, b);
3173 _pack_b(mask.s + 2*n, c, d);
3174 _pack_b(mask.s + 4*n, e, f);
3175 _pack_b(mask.s + 6*n, g, h);
3176 return mask;
3177}
3179
3195 template<int n>
3197 const v_reg<float, n>& a, const v_reg<float, n>& b,
3198 const v_reg<float, n>& c, const v_reg<float, n>& d)
3199{
3200 v_reg<float, n> res;
3201 for (int i = 0; i < n / 4; i++)
3202 {
3203 res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4];
3204 res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4];
3205 res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4];
3206 res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4];
3207 }
3208 return res;
3209}
3210
3225 template<int n>
3227 const v_reg<float, n>& a, const v_reg<float, n>& b,
3228 const v_reg<float, n>& c, const v_reg<float, n>& d)
3229{
3230 v_reg<float, n> res;
3231 for (int i = 0; i < n / 4; i++)
3232 {
3233 res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4];
3234 res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4];
3235 res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4];
3236 res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4];
3237 }
3238 return res;
3239}
3240
3241
3242 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
3243{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
3244 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
3245 const v_reg<double, n/2>& c)
3246{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
3247
3248 template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
3249{ return v_dotprod_expand(a, b); }
3250 template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
3251 const v_reg<double, n/2>& c)
3252{ return v_dotprod_expand(a, b, c); }
3253
3255
3256 inline v_reg<float, simd128_width / sizeof(float)>
3257 v_load_expand(const float16_t* ptr)
3258{
3259 v_reg<float, simd128_width / sizeof(float)> v;
3260 for( int i = 0; i < v.nlanes; i++ )
3261 {
3262 v.s[i] = ptr[i];
3263 }
3264 return v;
3265}
3266 #if CV_SIMD256
3267 inline v_reg<float, simd256_width / sizeof(float)>
3268v256_load_expand(const float16_t* ptr)
3269{
3270 v_reg<float, simd256_width / sizeof(float)> v;
3271 for (int i = 0; i < v.nlanes; i++)
3272 {
3273 v.s[i] = ptr[i];
3274 }
3275 return v;
3276}
3277 #endif
3278 #if CV_SIMD512
3279 inline v_reg<float, simd512_width / sizeof(float)>
3280v512_load_expand(const float16_t* ptr)
3281{
3282 v_reg<float, simd512_width / sizeof(float)> v;
3283 for (int i = 0; i < v.nlanes; i++)
3284 {
3285 v.s[i] = ptr[i];
3286 }
3287 return v;
3288}
3289 #endif
3290
3291 template<int n> inline void
3292v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
3293{
3294 for( int i = 0; i < v.nlanes; i++ )
3295 {
3296 ptr[i] = float16_t(v.s[i]);
3297 }
3298}
3299
// Cleanup hooks for the 128/256/512-bit variants. All three bodies are empty here:
// this implementation has nothing to reset.
3300 inline void v_cleanup() {}
3301 #if CV_SIMD256
3302 inline void v256_cleanup() {}
3303 #endif
3304 #if CV_SIMD512
3305 inline void v512_cleanup() {}
3306 #endif
3307
3309
3310 #ifndef CV_DOXYGEN
3311CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3312 #endif
3313}
3314
3315 #if !defined(CV_DOXYGEN)
3316 #undef CV_SIMD256
3317 #undef CV_SIMD512
3318 #endif
3319
3320 #endif
CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element maximum of two arrays or an array and a scalar.
CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst)
Calculates a square root of array elements.
CV_EXPORTS_W void exp(InputArray src, OutputArray dst)
Calculates the exponent of every array element.
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element minimum of two arrays or an array and a scalar.
CV_EXPORTS_W void log(InputArray src, OutputArray dst)
Calculates the natural logarithm of every array element.
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix)
Helper macro
Definition: intrin_cpp.hpp:2830
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2932
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2915
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:2989
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix, opA, opB)
Bitwise shift left
Definition: intrin_cpp.hpp:1280
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op)
Helper macro
Definition: intrin_cpp.hpp:851
OPENCV_HAL_IMPL_MATH_FUNC(v_abs,(typename V_TypeTraits< _Tp >::abs_type) std::abs, typename V_TypeTraits< _Tp >::abs_type) static const unsigned char popCountTable[]
Square root of elements
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp)
Helper macro
Definition: intrin_cpp.hpp:2898
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix)
Helper macro
Definition: intrin_cpp.hpp:2784
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:3054
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix)
Helper macro
Definition: intrin_cpp.hpp:2876
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:2954
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2)
Helper macro
Definition: intrin_cpp.hpp:913
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast)
Helper macro
Definition: intrin_cpp.hpp:3024
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op)
Helper macro
Definition: intrin_cpp.hpp:1259
bool v_check_any(const v_reg< _Tp, n > &a)
Check if any of packed values is less than zero
Definition: intrin_cpp.hpp:1436
v_reg< _Tp, n > v_combine_high(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Combine vector from last elements of two vectors
Definition: intrin_cpp.hpp:2307
v_reg< float, n > v_matmul(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication
Definition: intrin_cpp.hpp:3196
v_reg< int, n > v_round(const v_reg< float, n > &a)
Round elements
Definition: intrin_cpp.hpp:2427
CV_INLINE v_reg< _Tp, n > operator|(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise OR
v_reg< schar, 16 > v_int8x16
Sixteen 8-bit signed integer values
Definition: intrin_cpp.hpp:490
v_reg< uchar, 16 > v_uint8x16
Sixteen 8-bit unsigned integer values
Definition: intrin_cpp.hpp:488
void v_store_high(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (higher half)
Definition: intrin_cpp.hpp:2236
int v_signmask(const v_reg< _Tp, n > &a)
Get negative values mask
Definition: intrin_cpp.hpp:1395
void v_zip(const v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1)
Interleave two vectors
Definition: intrin_cpp.hpp:1557
v_reg< int64, 2 > v_int64x2
Two 64-bit signed integer values
Definition: intrin_cpp.hpp:506
void v_store(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory
Definition: intrin_cpp.hpp:2193
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements and expand
Definition: intrin_cpp.hpp:1145
V_TypeTraits< typename V_TypeTraits< _Tp >::abs_type >::sum_type v_reduce_sad(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Sum absolute differences of values
Definition: intrin_cpp.hpp:1377
v_reg< int, n > v_ceil(const v_reg< float, n > &a)
Ceil elements
Definition: intrin_cpp.hpp:2465
v_reg< ushort, 8 > v_uint16x8
Eight 16-bit unsigned integer values
Definition: intrin_cpp.hpp:492
CV_INLINE v_reg< _Tp, n > operator&(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise AND
void v_store_low(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (lower half)
Definition: intrin_cpp.hpp:2219
v_reg< int, n > v_floor(const v_reg< float, n > &a)
Floor elements
Definition: intrin_cpp.hpp:2452
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements
Definition: intrin_cpp.hpp:1080
int v_scan_forward(const v_reg< _Tp, n > &a)
Get first negative lane index
Definition: intrin_cpp.hpp:1412
v_reg< _Tp, n > v_reverse(const v_reg< _Tp, n > &a)
Vector reverse order
Definition: intrin_cpp.hpp:2346
v_reg< typename V_TypeTraits< _Tp >::w_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::w_type)> v_load_expand(const _Tp *ptr)
Load register contents from memory with double expand
Definition: intrin_cpp.hpp:1875
v_reg< int, 4 > v_int32x4
Four 32-bit signed integer values
Definition: intrin_cpp.hpp:498
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_absdiff(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Absolute difference
Definition: intrin_cpp.hpp:956
V_TypeTraits< _Tp >::sum_type v_reduce_sum(const v_reg< _Tp, n > &a)
Sum packed values
Definition: intrin_cpp.hpp:1338
v_reg< _Tp, n > v_muladd(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
A synonym for v_fma
Definition: intrin_cpp.hpp:1060
v_reg< _Tp, n > v_sqr_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Square of the magnitude
Definition: intrin_cpp.hpp:1036
v_reg< int, n > v_trunc(const v_reg< float, n > &a)
Truncate elements
Definition: intrin_cpp.hpp:2478
v_reg< unsigned, 4 > v_uint32x4
Four 32-bit unsigned integer values
Definition: intrin_cpp.hpp:496
CV_INLINE v_reg< _Tp, n > operator/(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Divide values
v_reg< _Tp, n > v_invsqrt(const v_reg< _Tp, n > &a)
Inverse square root
Definition: intrin_cpp.hpp:1010
v_reg< _Tp, n > v_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Magnitude
Definition: intrin_cpp.hpp:1023
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements and expand
Definition: intrin_cpp.hpp:1188
CV_INLINE v_reg< double,(n/2)> v_cvt_f64_high(const v_reg< int, n > &a)
Convert to double high part of vector
Definition: intrin_cpp.hpp:2587
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_low(const _Tp *ptr)
Load 64-bits of data to lower part (high part is undefined).
Definition: intrin_cpp.hpp:1702
void v_recombine(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< _Tp, n > &low, v_reg< _Tp, n > &high)
Combine two vectors from lower and higher parts of two other vectors
Definition: intrin_cpp.hpp:2325
v_reg< float, n > v_reduce_sum4(const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Sums all elements of each input vector, returns the vector of sums
Definition: intrin_cpp.hpp:1356
void v_mul_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &c, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &d)
Multiply and expand
Definition: intrin_cpp.hpp:1219
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_aligned(const _Tp *ptr)
Load register contents from memory (aligned)
Definition: intrin_cpp.hpp:1652
v_reg< _Tp, n > v_broadcast_element(const v_reg< _Tp, n > &a)
Broadcast i-th element of vector
Definition: intrin_cpp.hpp:2416
v_reg< _Tp, n > v_select(const v_reg< _Tp, n > &mask, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Per-element select (blend operation)
Definition: intrin_cpp.hpp:1454
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load(const _Tp *ptr)
Load register contents from memory
Definition: intrin_cpp.hpp:1587
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_low(const v_reg< _Tp, n > &a)
Expand lower values to the wider pack type
Definition: intrin_cpp.hpp:1499
CV_INLINE v_reg< _Tp, n > operator~(const v_reg< _Tp, n > &a)
Bitwise NOT
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double
Definition: intrin_cpp.hpp:2576
v_reg< typename V_TypeTraits< _Tp >::q_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::q_type)> v_load_expand_q(const _Tp *ptr)
Load register contents from memory with quad expand
Definition: intrin_cpp.hpp:1964
void v_expand(const v_reg< _Tp, n > &a, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b0, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b1)
Expand values to the wider pack type
Definition: intrin_cpp.hpp:1477
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
Pack boolean values (for 16-bit boolean values)
Definition: intrin_cpp.hpp:3114
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add
Definition: intrin_cpp.hpp:1049
CV_INLINE v_reg< _Tp, n > operator^(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Bitwise XOR
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition: intrin_cpp.hpp:2118
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix
Definition: intrin_cpp.hpp:2764
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference
Definition: intrin_cpp.hpp:997
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values
Definition: intrin_cpp.hpp:504
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_high(const v_reg< _Tp, n > &a)
Expand higher values to the wider pack type
Definition: intrin_cpp.hpp:1518
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements
Definition: intrin_cpp.hpp:1119
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load_halves(const _Tp *loptr, const _Tp *hiptr)
Load register contents from two memory blocks
Definition: intrin_cpp.hpp:1784
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part
Definition: intrin_cpp.hpp:1236
v_reg< _Tp, n > v_combine_low(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Combine vector from first elements of two vectors
Definition: intrin_cpp.hpp:2285
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition: intrin_cpp.hpp:500
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float
Definition: intrin_cpp.hpp:2537
bool v_check_all(const v_reg< _Tp, n > &a)
Check if all packed values are less than zero
Definition: intrin_cpp.hpp:1424
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add
Definition: intrin_cpp.hpp:3226
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract
Definition: intrin_cpp.hpp:2400
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Check elements for not being NaN (per-element mask)
Definition: intrin_cpp.hpp:893
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type
Definition: intrin_cpp.hpp:827
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition: intrin_cpp.hpp:2254
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values
Definition: intrin_cpp.hpp:494
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition: intrin_cpp.hpp:502
v_reg< _Tp, n > v_extract(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Vector extract
Definition: intrin_cpp.hpp:2374
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition: intrin_cpp.hpp:2046
softfloat abs(softfloat a)
Absolute value
Definition: softfloat.hpp:444
CV_INLINE int cvRound(double value)
Rounds floating-point number to the nearest integer
Definition: fast_math.hpp:200
CV_INLINE int cvCeil(double value)
Rounds floating-point number to the nearest integer not smaller than the original.
Definition: fast_math.hpp:254
static _Tp saturate_cast(uchar v)
Template function for accurate conversion from one primitive type to another.
Definition: saturate.hpp:80
CV_INLINE int cvFloor(double value)
Rounds floating-point number to the nearest integer not larger than the original.
Definition: fast_math.hpp:234
static bool isAligned(const T &data)
Alignment check of passed values
Definition: utility.hpp:517
#define CV_Assert(expr)
Checks a condition at runtime and throws exception if it fails
Definition: base.hpp:342
#define CV_DbgAssert(expr)
Definition: base.hpp:375
Quat< T > cos(const Quat< T > &q)
Quat< T > sin(const Quat< T > &q)
cv
"black box" representation of the file storage associated with a file on disk.
Definition: aruco.hpp:75
Definition: intrin.hpp:104
Definition: intrin_cpp.hpp:369
_Tp get0() const
Access first value
Definition: intrin_cpp.hpp:436
v_reg(const v_reg< _Tp, n > &r)
Copy constructor
Definition: intrin_cpp.hpp:421
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7, _Tp s8, _Tp s9, _Tp s10, _Tp s11, _Tp s12, _Tp s13, _Tp s14, _Tp s15)
Constructor
Definition: intrin_cpp.hpp:404
v_reg(const _Tp *ptr)
Constructor
Definition: intrin_cpp.hpp:379
v_reg()
Default constructor
Definition: intrin_cpp.hpp:418
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7)
Constructor
Definition: intrin_cpp.hpp:394
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3)
Constructor
Definition: intrin_cpp.hpp:389
v_reg(_Tp s0, _Tp s1)
Constructor
Definition: intrin_cpp.hpp:384