43
#ifndef OPENCV_CUDA_SCAN_HPP
 
44
#define OPENCV_CUDA_SCAN_HPP
 
57
namespace
cv
{
namespace
cuda {
namespace
device
 
59
enum
ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 };
 
61
template
<ScanKind Kind,
typename
T,
typename
F>
struct
WarpScan
 
63
__device__ __forceinline__ WarpScan() {}
 
64
__device__ __forceinline__ WarpScan(
const
WarpScan& other) { CV_UNUSED(other); }
 
66
__device__ __forceinline__ T operator()(
volatile
T *ptr ,
const
unsigned
int
idx)
 
68
const
unsigned
int
lane = idx & 31;
 
71
if
( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
 
72
if
( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
 
73
if
( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
 
74
if
( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
 
75
if
( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
 
77
if( Kind == INCLUSIVE )
 
80
return
(lane > 0) ? ptr [idx - 1] : 0;
 
83
__device__ __forceinline__
unsigned
int
index(
const
unsigned
int
tid)
 
88
__device__ __forceinline__
void
init(
volatile
T *ptr){}
 
90
static
const
int
warp_offset      = 0;
 
92
typedef
WarpScan<INCLUSIVE, T, F>
merge;
 
95
template
<ScanKind Kind ,
typename
T,
typename
F>
struct
WarpScanNoComp
 
97
__device__ __forceinline__ WarpScanNoComp() {}
 
98
__device__ __forceinline__ WarpScanNoComp(
const
WarpScanNoComp& other) { CV_UNUSED(other); }
 
100
__device__ __forceinline__ T operator()(
volatile
T *ptr ,
const
unsigned
int
idx)
 
102
const
unsigned
int
lane = threadIdx.x & 31;
 
105
ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
 
106
ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
 
107
ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
 
108
ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
 
109
ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
 
111
if( Kind == INCLUSIVE )
 
114
return
(lane > 0) ? ptr [idx - 1] : 0;
 
117
__device__ __forceinline__
unsigned
int
index(
const
unsigned
int
tid)
 
119
return
(tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
 
122
__device__ __forceinline__
void
init(
volatile
T *ptr)
 
124
ptr[threadIdx.x] = 0;
 
127
static
const
int
warp_smem_stride = 32 + 16 + 1;
 
128
static
const
int
warp_offset      = 16;
 
129
static
const
int
warp_log         = 5;
 
130
static
const
int
warp_mask        = 31;
 
132
typedef
WarpScanNoComp<INCLUSIVE, T, F>
merge;
 
135
template
<ScanKind Kind ,
typename
T,
typename
Sc,
typename
F>
struct
BlockScan
 
137
__device__ __forceinline__ BlockScan() {}
 
138
__device__ __forceinline__ BlockScan(
const
BlockScan& other) { CV_UNUSED(other); }
 
140
__device__ __forceinline__ T operator()(
volatile
T *ptr)
 
142
const
unsigned
int
tid  = threadIdx.x;
 
143
const
unsigned
int
lane = tid & warp_mask;
 
144
const
unsigned
int
warp = tid >> warp_log;
 
148
const
unsigned
int
idx = scan.index(tid);
 
150
T val = scan(ptr, idx);
 
158
ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
 
162
merge_scan(ptr, idx);
 
166
val = ptr [scan.warp_offset + warp - 1] + val;
 
175
static
const
int
warp_log  = 5;
 
176
static
const
int
warp_mask = 31;
 
179
template
<
typename
T>
 
180
__device__ T warpScanInclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
 
182
#if __CUDA_ARCH__ >= 300
 
183
const
unsigned
int
laneId = cv::cuda::device::Warp::laneId();
 
187
for
(
int
i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
 
189
const
T n = cv::cuda::device::shfl_up(idata, i);
 
196
unsigned
int
pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
 
198
pos += OPENCV_CUDA_WARP_SIZE;
 
201
s_Data[pos] += s_Data[pos - 1];
 
202
s_Data[pos] += s_Data[pos - 2];
 
203
s_Data[pos] += s_Data[pos - 4];
 
204
s_Data[pos] += s_Data[pos - 8];
 
205
s_Data[pos] += s_Data[pos - 16];
 
211
template
<
typename
T>
 
212
__device__ __forceinline__ T warpScanExclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
 
214
return
warpScanInclusive(idata, s_Data, tid) - idata;
 
217
template
<
int
tiNumScanThreads,
typename
T>
 
218
__device__ T blockScanInclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
 
220
if
(tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
 
223
T warpResult = warpScanInclusive(idata, s_Data, tid);
 
228
if
((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
 
230
s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
 
236
if
(tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
 
241
s_Data[tid] = warpScanExclusive(val, s_Data, tid);
 
247
return
warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
 
251
return
warpScanInclusive(idata, s_Data, tid);
 
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
 
"black box" representation of the file storage associated with a file on disk.
Definition:
aruco.hpp:75