#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP

namespace cv { namespace cuda { namespace device
{
 
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        // ... (portable fallback elided in this listing; a scalar reference follows this function)
    #endif

        return r;
    }
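
    // For reference, the value vabsdiff2 must produce, spelled out in plain
    // scalar code. vabsdiff2_reference is a hypothetical name; the header's
    // real fallback is branch-free, but the per-lane semantics are these.
    static __device__ __forceinline__ unsigned int vabsdiff2_reference(unsigned int a, unsigned int b)
    {
        int lo = (int)(a & 0xffff) - (int)(b & 0xffff);
        int hi = (int)(a >> 16)    - (int)(b >> 16);
        if (lo < 0) lo = -lo;   // |a.lo - b.lo|
        if (hi < 0) hi = -hi;   // |a.hi - b.hi|
        return ((unsigned int)hi << 16) | (unsigned int)lo;
    }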
 
    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        // ... (branch-free body elided in this listing; a sketch follows this function)
        return r;
    }
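
    // The usual branch-free formulation of the elided body is HAKMEM item 23:
    // a + b == 2*(a & b) + (a ^ b), hence (a + b) >> 1 == (a & b) + ((a ^ b) >> 1),
    // which cannot overflow. vavg2_sketch is a hypothetical name.
    static __device__ __forceinline__ unsigned int vavg2_sketch(unsigned int a, unsigned int b)
    {
        unsigned int s = a ^ b;
        unsigned int r = a & b;
        s = s & 0xfffefffe;     // clear each lane's LSB so the shift stays in-lane
        return r + (s >> 1);    // floor((a + b) / 2) per 16-bit lane
    }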
 
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing; see the widening sketch below)
    #else
        // ... (portable fallback elided)
    #endif

        return r;
    }
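
    // vcmpeq2 differs from vseteq2 only in its result encoding: 0xffff per
    // true lane instead of 0x0001. Widening relies on 32-bit wraparound so
    // each lane's borrow fills exactly its own halfword. bool2_to_mask is a
    // hypothetical helper; in effect,
    // vcmpeq2(a, b) == bool2_to_mask(vseteq2(a, b)).
    static __device__ __forceinline__ unsigned int bool2_to_mask(unsigned int b2)
    {
        return (b2 << 16) - b2; // 0x0001 -> 0x0000ffff, 0x00010000 -> 0xffff0000
    }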
 
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        // ... (portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        // ... (portable fallback elided in this listing; a select-based sketch follows this function)
    #endif

        return r;
    }
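
    // Given a full-width compare mask, max and min are a branch-free select.
    // A sketch built from the hypothetical helpers above; vmax2_sketch and
    // vmin2_sketch are not part of this header.
    static __device__ __forceinline__ unsigned int vmax2_sketch(unsigned int a, unsigned int b)
    {
        unsigned int m = bool2_to_mask(vsetge2_sketch(a, b)); // 0xffff where a >= b
        return (a & m) | (b & ~m);                            // pick a or b per lane
    }

    static __device__ __forceinline__ unsigned int vmin2_sketch(unsigned int a, unsigned int b)
    {
        unsigned int m = bool2_to_mask(vsetge2_sketch(a, b));
        return (b & m) | (a & ~m);
    }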
 
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing; a sketch follows this function)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        // ... (branch-free body elided in this listing; the two-lane average
        // sketch above carries over, with 0xfefefefe as the lane-LSB mask)
        return r;
    }
 
    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        // ... (portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        // ... (rest of the portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // ... (body elided in this listing)
    #else
        // ... (portable fallback elided)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
 
    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // ... (portable fallback elided in this listing)
    #endif

        return r;
    }
}}} // namespace cv { namespace cuda { namespace device

#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP
 