73
#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
74
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
84
namespace
cv
{
namespace
cuda {
namespace
device
// vadd2: per-halfword unsigned addition of the two 16-bit lanes of a and b.
// Emulation path wraps each lane modulo 2^16 (carries do not cross lanes).
// NOTE(review): asm path uses the upstream ".sat" variant — confirm against upstream that
// lane semantics match the wrapping emulation.
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s;
    s = a ^ b;          // sum bits
    r = a + b;          // actual sum
    s = s ^ r;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high halfword (= carry-out from low halfword)
    r = r - s;          // subtract out carry-out from low halfword
#endif

    return r;
}
// vsub2: per-halfword unsigned subtraction of the two 16-bit lanes of a and b.
// Emulation path wraps each lane modulo 2^16 (borrows do not cross lanes).
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s;
    s = a ^ b;          // sum bits
    r = a - b;          // actual difference
    s = s ^ r;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow into high halfword (= borrow-out from low halfword)
    r = r + s;          // compensate for borrow from low halfword
#endif

    return r;
}
// vabsdiff2: per-halfword absolute difference |a - b| of the two 16-bit lanes.
// Emulation computes max(a,b) - min(a,b) per lane via masked halfword extraction.
static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // ::max/::min are the CUDA device integer overloads.
    unsigned int s, t, u, v;
    s = a & 0x0000ffff; // extract low halfword of a
    r = b & 0x0000ffff; // extract low halfword of b
    u = ::max(r, s);    // maximum of low halfwords
    v = ::min(r, s);    // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword of a
    r = b & 0xffff0000; // extract high halfword of b
    t = ::max(r, s);    // maximum of high halfwords
    s = ::min(r, s);    // minimum of high halfwords
    r = u | t;          // combine halfword maximums
    s = v | s;          // combine halfword minimums
    r = r - s;          // |a - b| = max(a,b) - min(a,b)
#endif

    return r;
}
// vavg2: per-halfword unsigned average (a + b) / 2, truncated, for both 16-bit lanes.
// Pure SWAR implementation — no carry can be lost, so no asm path is needed.
static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
{
    unsigned int r, s;

    // HAKMEM #23: a average b = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfffefffe; // ensure the shift does not cross halfword boundaries
    s = s >> 1;
    s = r + s;

    return s;
}
// vavrg2: per-halfword unsigned rounded average (a + b + 1) / 2 for both 16-bit lanes.
static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a rounded-average b = (a | b) - ((a ^ b) >> 1)
    unsigned int s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfffefffe; // ensure the shift does not cross halfword boundaries
    s = s >> 1;
    r = r - s;
#endif

    return r;
}
// vseteq2: per-halfword equality test; each 16-bit lane of the result is
// 0x0001 where the lanes of a and b are equal, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    unsigned int c;
    r = a ^ b;          // 0x0000 iff the halfwords are equal
    c = r | 0x80008000; // set msbs, to catch carry-out
    r = r ^ c;          // extract msbs; msb = 1 if halfword of r < 0x8000
    c = c - 0x00010001; // msb = 0 if halfword of r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1 iff halfword of r was 0x0000
    r = c >> 15;        // move msbs down to per-lane bools
#endif

    return r;
}
// vcmpeq2: per-halfword equality mask; each 16-bit lane of the result is
// 0xffff where the lanes of a and b are equal, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vseteq2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks (lane-wise multiply by 0xffff)
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x0000 iff the halfwords are equal
    c = r | 0x80008000; // set msbs, to catch carry-out
    r = r ^ c;          // extract msbs; msb = 1 if halfword of r < 0x8000
    c = c - 0x00010001; // msb = 0 if halfword of r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1 iff halfword of r was 0x0000
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vsetge2: per-halfword unsigned a >= b test; each 16-bit lane of the result
// is 0x0001 where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2; per-lane carry-out iff a >= b
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to per-lane bools
#endif

    return r;
}
// vcmpge2: per-halfword unsigned a >= b mask; each 16-bit lane of the result
// is 0xffff where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetge2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2; per-lane carry-out iff a >= b
    c = c & 0x80008000; // msb = carry-outs
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vsetgt2: per-halfword unsigned a > b test; each 16-bit lane of the result
// is 0x0001 where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b - 1) / 2; per-lane carry-out iff a > b
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to per-lane bools
#endif

    return r;
}
// vcmpgt2: per-halfword unsigned a > b mask; each 16-bit lane of the result
// is 0xffff where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetgt2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2; per-lane carry-out iff a > b
    c = c & 0x80008000; // msb = carry-outs
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vsetle2: per-halfword unsigned a <= b test; each 16-bit lane of the result
// is 0x0001 where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2; per-lane carry-out iff b >= a
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to per-lane bools
#endif

    return r;
}
// vcmple2: per-halfword unsigned a <= b mask; each 16-bit lane of the result
// is 0xffff where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetle2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2; per-lane carry-out iff b >= a
    c = c & 0x80008000; // msb = carry-outs
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vsetlt2: per-halfword unsigned a < b test; each 16-bit lane of the result
// is 0x0001 where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a - 1) / 2; per-lane carry-out iff b > a
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to per-lane bools
#endif

    return r;
}
// vcmplt2: per-halfword unsigned a < b mask; each 16-bit lane of the result
// is 0xffff where the comparison holds, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetlt2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2; per-lane carry-out iff b > a
    c = c & 0x80008000; // msb = carry-outs
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vsetne2: per-halfword inequality test; each 16-bit lane of the result is
// 0x0001 where the lanes of a and b differ, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    unsigned int c;
    r = a ^ b;          // 0x0000 iff the halfwords are equal
    c = r | 0x80008000; // set msbs, to catch carry-out
    c = c - 0x00010001; // msb = 0 if halfword of r was 0x0000 or 0x8000
    c = r | c;          // msb = 1 iff halfword of r was non-zero
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to per-lane bools
#endif

    return r;
}
// vcmpne2: per-halfword inequality mask; each 16-bit lane of the result is
// 0xffff where the lanes of a and b differ, 0x0000 otherwise.
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetne2(a, b);  // per-lane bools (0 or 1)
    c = r << 16;        // expand the bools
    r = c - r;          //  into 0xffff masks
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x0000 iff the halfwords are equal
    c = r | 0x80008000; // set msbs, to catch carry-out
    c = c - 0x00010001; // msb = 0 if halfword of r was 0x0000 or 0x8000
    c = r | c;          // msb = 1 iff halfword of r was non-zero
    c = c & 0x80008000; // extract msbs
    c = c >> 15;        // per-lane bools
    r = (c << 16) - c;  // expand bools into 0xffff masks
#endif

    return r;
}
// vmax2: per-halfword unsigned maximum of the two 16-bit lanes of a and b.
static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // ::max is the CUDA device integer overload; halfwords are compared in place
    // (masked out of the word), which preserves their relative order.
    unsigned int s, t, u;
    r = a & 0x0000ffff; // extract low halfword of a
    s = b & 0x0000ffff; // extract low halfword of b
    t = ::max(r, s);    // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword of a
    s = b & 0xffff0000; // extract high halfword of b
    u = ::max(r, s);    // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif

    return r;
}
// vmin2: per-halfword unsigned minimum of the two 16-bit lanes of a and b.
static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // ::min is the CUDA device integer overload; halfwords are compared in place
    // (masked out of the word), which preserves their relative order.
    unsigned int s, t, u;
    r = a & 0x0000ffff; // extract low halfword of a
    s = b & 0x0000ffff; // extract low halfword of b
    t = ::min(r, s);    // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword of a
    s = b & 0xffff0000; // extract high halfword of b
    u = ::min(r, s);    // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif

    return r;
}
// vadd4: per-byte unsigned addition of the four 8-bit lanes of a and b.
// Emulation path wraps each lane modulo 2^8 (carries do not cross lanes).
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs so carries cannot cross byte boundaries
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add the low 7 bits of each byte; carry-ins land in msbs
    r = r ^ s;          // fold in msb sum bits without generating carry-out
#endif

    return r;
}
// vsub4: per-byte unsigned subtraction of the four 8-bit lanes of a and b.
// Emulation path wraps each lane modulo 2^8 (borrows do not cross lanes).
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s, t;
    s = a ^ ~b;         // inverted difference bits
    r = a | 0x80808080; // set msbs so borrows cannot cross byte boundaries
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb difference bits
    r = r - t;          // subtract the low 7 bits of each byte; borrow-ins land in msbs
    r = r ^ s;          // fold in msb difference bits
#endif

    return r;
}
// vavg4: per-byte unsigned average (a + b) / 2, truncated, for all four 8-bit lanes.
// Pure SWAR implementation — no carry can be lost, so no asm path is needed.
static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
{
    unsigned int r, s;

    // HAKMEM #23: a average b = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfefefefe; // ensure the shift does not cross byte boundaries
    s = s >> 1;
    s = r + s;

    return s;
}
// vavrg4: per-byte unsigned rounded average (a + b + 1) / 2 for all four 8-bit lanes.
static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a rounded-average b = (a | b) - ((a ^ b) >> 1)
    unsigned int s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfefefefe; // ensure the shift does not cross byte boundaries
    s = s >> 1;
    r = r - s;
#endif

    return r;
}
// vseteq4: per-byte equality test; each 8-bit lane of the result is
// 0x01 where the lanes of a and b are equal, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    unsigned int c;
    r = a ^ b;          // 0x00 iff the bytes are equal
    c = r | 0x80808080; // set msbs, to catch carry-out
    r = r ^ c;          // extract msbs; msb = 1 if byte of r < 0x80
    c = c - 0x01010101; // msb = 0 if byte of r was 0x00 or 0x80
    c = r & ~c;         // msb = 1 iff byte of r was 0x00
    r = c >> 7;         // move msbs down to per-lane bools
#endif

    return r;
}
// vcmpeq4: per-byte equality mask; each 8-bit lane of the result is
// 0xff where the lanes of a and b are equal, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vseteq4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks (lane-wise multiply by 0xff)
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 iff the bytes are equal
    c = r | 0x80808080; // set msbs, to catch carry-out
    r = r ^ c;          // extract msbs; msb = 1 if byte of r < 0x80
    c = c - 0x01010101; // msb = 0 if byte of r was 0x00 or 0x80
    c = r & ~c;         // msb = 1 iff byte of r was 0x00
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vsetle4: per-byte unsigned a <= b test; each 8-bit lane of the result
// is 0x01 where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2; per-lane carry-out iff b >= a
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to per-lane bools
#endif

    return r;
}
// vcmple4: per-byte unsigned a <= b mask; each 8-bit lane of the result
// is 0xff where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetle4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2; per-lane carry-out iff b >= a
    c = c & 0x80808080; // msb = carry-outs
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vsetlt4: per-byte unsigned a < b test; each 8-bit lane of the result
// is 0x01 where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a - 1) / 2; per-lane carry-out iff b > a
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to per-lane bools
#endif

    return r;
}
// vcmplt4: per-byte unsigned a < b mask; each 8-bit lane of the result
// is 0xff where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetlt4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2; per-lane carry-out iff b > a
    c = c & 0x80808080; // msb = carry-outs
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vsetge4: per-byte unsigned a >= b test; each 8-bit lane of the result
// is 0x01 where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2; per-lane carry-out iff a >= b
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to per-lane bools
#endif

    return r;
}
// vcmpge4: per-byte unsigned a >= b mask; each 8-bit lane of the result
// is 0xff where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetge4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2; per-lane carry-out iff a >= b
    c = c & 0x80808080; // msb = carry-outs
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vsetgt4: per-byte unsigned a > b test; each 8-bit lane of the result
// is 0x01 where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b - 1) / 2; per-lane carry-out iff a > b
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to per-lane bools
#endif

    return r;
}
// vcmpgt4: per-byte unsigned a > b mask; each 8-bit lane of the result
// is 0xff where the comparison holds, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetgt4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2; per-lane carry-out iff a > b
    c = c & 0x80808080; // msb = carry-outs
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vsetne4: per-byte inequality test; each 8-bit lane of the result is
// 0x01 where the lanes of a and b differ, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    unsigned int c;
    r = a ^ b;          // 0x00 iff the bytes are equal
    c = r | 0x80808080; // set msbs, to catch carry-out
    c = c - 0x01010101; // msb = 0 if byte of r was 0x00 or 0x80
    c = r | c;          // msb = 1 iff byte of r was non-zero
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to per-lane bools
#endif

    return r;
}
// vcmpne4: per-byte inequality mask; each 8-bit lane of the result is
// 0xff where the lanes of a and b differ, 0x00 otherwise.
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
    unsigned int r, c;

#if __CUDA_ARCH__ >= 300
    r = vsetne4(a, b);  // per-lane bools (0 or 1)
    c = r << 8;         // expand the bools
    r = c - r;          //  into 0xff masks
#else
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 iff the bytes are equal
    c = r | 0x80808080; // set msbs, to catch carry-out
    c = c - 0x01010101; // msb = 0 if byte of r was 0x00 or 0x80
    c = r | c;          // msb = 1 iff byte of r was non-zero
    c = c & 0x80808080; // extract msbs
    c = c >> 7;         // per-lane bools
    r = (c << 8) - c;   // expand bools into 0xff masks
#endif

    return r;
}
// vabsdiff4: per-byte absolute difference |a - b| of the four 8-bit lanes.
// Emulation computes max(a,b) - min(a,b) per lane via a vcmpge4 select mask.
static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s;
    s = vcmpge4(a, b);  // mask = 0xff in bytes where a >= b
    r = a ^ b;          // bits in which a and b differ
    s = (r & s) ^ b;    // per-byte max(a, b)
    r = s ^ r;          // per-byte min(a, b)
    r = s - r;          // |a - b| = max(a,b) - min(a,b)
#endif

    return r;
}
// vmax4: per-byte unsigned maximum of the four 8-bit lanes of a and b.
static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s;
    s = vcmpge4(a, b);  // mask = 0xff in bytes where a >= b
    r = a & s;          // select a where a >= b
    s = b & ~s;         // select b where b > a
    r = r | s;          // combine per-byte selections
#endif

    return r;
}
// vmin4: per-byte unsigned minimum of the four 8-bit lanes of a and b.
static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
{
    unsigned int r = 0;

#if __CUDA_ARCH__ >= 300
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
    asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    unsigned int s;
    s = vcmpge4(b, a);  // mask = 0xff in bytes where b >= a
    r = a & s;          // select a where a <= b
    s = b & ~s;         // select b where a > b
    r = r | s;          // combine per-byte selections
#endif

    return r;
}
CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element maximum of two arrays or an array and a scalar.
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
Calculates per-element minimum of two arrays or an array and a scalar.
"black box" representation of the file storage associated with a file on disk.
Definition:
aruco.hpp:75