43
#ifndef OPENCV_CUDA_SCAN_HPP
44
#define OPENCV_CUDA_SCAN_HPP
57
namespace
cv
{
namespace
cuda {
namespace
device
59
enum
ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
61
template
<ScanKind Kind,
typename
T,
typename
F>
struct
WarpScan
63
__device__ __forceinline__ WarpScan() {}
64
__device__ __forceinline__ WarpScan(
const
WarpScan& other) { CV_UNUSED(other); }
66
__device__ __forceinline__ T operator()(
volatile
T *ptr ,
const
unsigned
int
idx)
68
const
unsigned
int
lane = idx & 31;
71
if
( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
72
if
( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
73
if
( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
74
if
( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
75
if
( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
77
if( Kind == INCLUSIVE )
80
return
(lane > 0) ? ptr [idx - 1] : 0;
83
__device__ __forceinline__
unsigned
int
index(
const
unsigned
int
tid)
88
__device__ __forceinline__
void
init(
volatile
T *ptr){}
90
static
const
int
warp_offset = 0;
92
typedef
WarpScan<INCLUSIVE, T, F>
merge;
95
template
<ScanKind Kind ,
typename
T,
typename
F>
struct
WarpScanNoComp
97
__device__ __forceinline__ WarpScanNoComp() {}
98
__device__ __forceinline__ WarpScanNoComp(
const
WarpScanNoComp& other) { CV_UNUSED(other); }
100
__device__ __forceinline__ T operator()(
volatile
T *ptr ,
const
unsigned
int
idx)
102
const
unsigned
int
lane = threadIdx.x & 31;
105
ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
106
ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
107
ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
108
ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
109
ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
111
if( Kind == INCLUSIVE )
114
return
(lane > 0) ? ptr [idx - 1] : 0;
117
__device__ __forceinline__
unsigned
int
index(
const
unsigned
int
tid)
119
return
(tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
122
__device__ __forceinline__
void
init(
volatile
T *ptr)
124
ptr[threadIdx.x] = 0;
127
static
const
int
warp_smem_stride = 32 + 16 + 1;
128
static
const
int
warp_offset = 16;
129
static
const
int
warp_log = 5;
130
static
const
int
warp_mask = 31;
132
typedef
WarpScanNoComp<INCLUSIVE, T, F>
merge;
135
template
<ScanKind Kind ,
typename
T,
typename
Sc,
typename
F>
struct
BlockScan
137
__device__ __forceinline__ BlockScan() {}
138
__device__ __forceinline__ BlockScan(
const
BlockScan& other) { CV_UNUSED(other); }
140
__device__ __forceinline__ T operator()(
volatile
T *ptr)
142
const
unsigned
int
tid = threadIdx.x;
143
const
unsigned
int
lane = tid & warp_mask;
144
const
unsigned
int
warp = tid >> warp_log;
148
const
unsigned
int
idx = scan.index(tid);
150
T val = scan(ptr, idx);
158
ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
162
merge_scan(ptr, idx);
166
val = ptr [scan.warp_offset + warp - 1] + val;
175
static
const
int
warp_log = 5;
176
static
const
int
warp_mask = 31;
179
template
<
typename
T>
180
__device__ T warpScanInclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
182
#if __CUDA_ARCH__ >= 300
183
const
unsigned
int
laneId = cv::cuda::device::Warp::laneId();
187
for
(
int
i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
189
const
T n = cv::cuda::device::shfl_up(idata, i);
196
unsigned
int
pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
198
pos += OPENCV_CUDA_WARP_SIZE;
201
s_Data[pos] += s_Data[pos - 1];
202
s_Data[pos] += s_Data[pos - 2];
203
s_Data[pos] += s_Data[pos - 4];
204
s_Data[pos] += s_Data[pos - 8];
205
s_Data[pos] += s_Data[pos - 16];
211
template
<
typename
T>
212
__device__ __forceinline__ T warpScanExclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
214
return
warpScanInclusive(idata, s_Data, tid) - idata;
217
template
<
int
tiNumScanThreads,
typename
T>
218
__device__ T blockScanInclusive(T idata,
volatile
T* s_Data,
unsigned
int
tid)
220
if
(tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
223
T warpResult = warpScanInclusive(idata, s_Data, tid);
228
if
((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
230
s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
236
if
(tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
241
s_Data[tid] = warpScanExclusive(val, s_Data, tid);
247
return
warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
251
return
warpScanInclusive(idata, s_Data, tid);
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
"black box" representation of the file storage associated with a file on disk.
Definition:
aruco.hpp:75