#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
#define OPENCV_CUDA_DEVICE_BLOCK_HPP
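
// Block bundles block-wide cooperative primitives: all threads of the CUDA
// block walk a range together via a block-stride loop (fill, copy,
// transform), and the reduce helpers combine one value per thread through
// a shared-memory buffer.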
 
namespace cv { namespace cuda { namespace device
{
    struct Block
    {
 
        static __device__ __forceinline__ unsigned int id()
        {
            return blockIdx.x;
        }
 
        static __device__ __forceinline__ unsigned int stride()
        {
            return blockDim.x * blockDim.y * blockDim.z;
        }
 
        static __device__ __forceinline__ void sync()
        {
            __syncthreads();
        }
 
        static __device__ __forceinline__ int flattenedThreadId()
        {
            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }
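
        // The two helpers above yield the block-stride loop pattern used by
        // every primitive below (illustrative sketch; 'data', 'n' and
        // 'process' are hypothetical names):
        //
        //   for (int i = flattenedThreadId(); i < (int)n; i += stride())
        //       process(data[i]);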
 
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            int STRIDE = stride();
            It t = beg + flattenedThreadId();

            for(; t < end; t += STRIDE)
                *t = value;
        }
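
        // Usage sketch (assumed kernel context): zero a shared scratch buffer.
        //
        //   __shared__ float smem[256];
        //   Block::fill(smem, smem + 256, 0.f);
        //   __syncthreads();   // before any thread reads another thread's slot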
 
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            int STRIDE = stride();
            int tid = flattenedThreadId();
            value += tid;

            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                *t = value;
        }
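
        // 'yota' is a block-wide std::iota: it writes value, value+1, ...
        // across [beg, end). Usage sketch ('idx' is a hypothetical buffer):
        //
        //   __shared__ int idx[128];
        //   Block::yota(idx, idx + 128, 0);   // idx[i] == i after a sync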
 
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            int STRIDE = stride();
            InIt  t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = *t;
        }
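
        // Usage sketch: stage a tile from global into shared memory
        // ('gptr', 'smem' and 'n' are hypothetical names):
        //
        //   Block::copy(gptr, gptr + n, smem);
        //   __syncthreads();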
 
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
        {
            int STRIDE = stride();
            InIt  t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = op(*t);
        }
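
        // Usage sketch: apply a unary functor elementwise, here in place
        // ('sqr' is a hypothetical functor with a __device__ operator()):
        //
        //   Block::transform(smem, smem + n, smem, sqr());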
 
        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
        {
            int STRIDE = stride();
            InIt1 t1 = beg1 + flattenedThreadId();
            InIt2 t2 = beg2 + flattenedThreadId();
            OutIt o  = out + (t1 - beg1);

            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                *o = op(*t1, *t2);
        }
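
        // Usage sketch: combine two ranges elementwise ('a', 'b', 'dst' and
        // 'plus' are hypothetical; 'plus' needs a __device__ operator()):
        //
        //   Block::transform(a, a + n, b, dst, plus());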
 
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid];

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
            }
        }
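
        // Usage sketch: block-wide sum of one float per thread in a
        // 256-thread block; the result lands in buffer[0]. 'plus' and
        // 'threadVal' are hypothetical; the __syncthreads() before the call
        // is required because reduce immediately reads buffer[tid].
        //
        //   __shared__ float smem[256];
        //   smem[Block::flattenedThreadId()] = threadVal;
        //   __syncthreads();
        //   Block::reduce<256>(smem, plus());
        //   __syncthreads();   // before other warps read smem[0]
        //
        // Note: the final warp-synchronous steps rely on 'volatile'; under
        // independent thread scheduling (Volta and newer) this classic
        // pattern generally needs explicit __syncwarp() calls as well.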
 
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid] = init;
            __syncthreads();

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
            }
            __syncthreads();

            return buffer[0];
        }
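
        // Usage sketch: same reduction, but each thread passes its value as
        // 'init' and every thread gets the result back ('plus' and
        // 'threadVal' are hypothetical):
        //
        //   float total = Block::reduce<256>(smem, threadVal, plus());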
 
        template <typename T, class BinOp>
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);

                __syncthreads();

                n = sft;
            }

            while (n > 1)
            {
                unsigned int half = n/2;

                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);

                __syncthreads();

                n = n - half;
            }
        }
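
        // Usage sketch: reduce_n handles element counts that are not tied to
        // the block size ('smem' holding 'n' valid elements and 'plus' are
        // hypothetical):
        //
        //   Block::reduce_n(smem, n, plus());
        //   // smem[0] now holds the result (reduce_n syncs internally)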
 
    };
}}} // namespace cv { namespace cuda { namespace device

#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */