#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
#define OPENCV_CUDA_DEVICE_BLOCK_HPP
namespace cv { namespace cuda { namespace device
{
    // Block-level utilities: every routine below assumes all threads of the
    // block participate in the call.
    struct Block
    {
        static __device__ __forceinline__ unsigned int id()
        {
            return blockIdx.x;
        }
        static __device__ __forceinline__ unsigned int stride()
        {
            return blockDim.x * blockDim.y * blockDim.z;
        }
        static __device__ __forceinline__ void sync()
        {
            __syncthreads();
        }
        static __device__ __forceinline__ int flattenedThreadId()
        {
            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }
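        // Worked example (illustrative numbers, not from this header): for a
        // 16x8x2 block (blockDim = {16, 8, 2}), the thread at
        // threadIdx = (3, 5, 1) gets flattenedThreadId() = 1*16*8 + 5*16 + 3 = 211,
        // and stride() returns 16*8*2 = 256.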
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            int STRIDE = stride();
            It t = beg + flattenedThreadId();

            for(; t < end; t += STRIDE)
                *t = value;
        }
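        // Usage sketch (hypothetical kernel; the kernel name and buffer size
        // are assumptions, not part of this header): every thread cooperates
        // in the fill, striding by the block's total thread count.
        //
        //   __global__ void zeroSmem()
        //   {
        //       __shared__ float smem[256];
        //       cv::cuda::device::Block::fill(smem, smem + 256, 0.f);
        //       cv::cuda::device::Block::sync(); // make the fill visible to all threads
        //   }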
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            int STRIDE = stride();
            int tid = flattenedThreadId();
            value += tid;

            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                *t = value;
        }
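        // Usage sketch (buffer size is an assumption): yota() is an
        // iota-style sequence fill, so after the call idx[i] == i.
        //
        //   __shared__ int idx[128];
        //   Block::yota(idx, idx + 128, 0);
        //   Block::sync();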
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = *t;
        }
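        // Usage sketch (pointer names and tile size are assumptions): stage
        // one 256-element tile per block from global into shared memory.
        //
        //   __shared__ float tile[256];
        //   const float* src = in + blockIdx.x * 256; // 'in' is an assumed global pointer
        //   Block::copy(src, src + 256, tile);
        //   Block::sync();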
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = op(*t);
        }
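        // Usage sketch (the functor and buffer size are assumptions): square
        // every element of a shared buffer in place.
        //
        //   struct Square { __device__ float operator()(float x) const { return x * x; } };
        //
        //   Block::transform(smem, smem + 256, smem, Square());
        //   Block::sync();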
        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
        {
            int STRIDE = stride();
            InIt1 t1 = beg1 + flattenedThreadId();
            InIt2 t2 = beg2 + flattenedThreadId();
            OutIt o = out + (t1 - beg1);

            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                *o = op(*t1, *t2);
        }
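        // Usage sketch (functor name and sizes are assumptions): elementwise
        // sum of two shared buffers into a third.
        //
        //   struct Sum { __device__ float operator()(float a, float b) const { return a + b; } };
        //
        //   Block::transform(a, a + 256, b, c, Sum());
        //   Block::sync();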
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid];

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                // Warp-synchronous tail: no __syncthreads() between steps.
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }
        }
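        // Usage sketch (CTA_SIZE must equal the block's actual thread count;
        // 'partial' and the Sum functor from the transform sketch above are
        // assumptions). Each thread deposits one value, the block reduces in
        // place, and the result lands in buffer[0]. Note that the
        // warp-synchronous tail relies on lock-step warp execution; on Volta
        // and newer (independent thread scheduling) this pattern generally
        // needs __syncwarp() between steps.
        //
        //   __shared__ volatile float smem[256];
        //   smem[Block::flattenedThreadId()] = partial; // computed per thread
        //   Block::sync();
        //   Block::reduce<256>(smem, Sum());
        //   Block::sync();
        //   float total = smem[0];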
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid] = init;
            __syncthreads();

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                // Warp-synchronous tail: no __syncthreads() between steps.
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }

            __syncthreads();
            return buffer[0];
        }
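        // Usage sketch ('partial' and Sum as above): this overload seeds the
        // buffer from each thread's init value and hands the final result
        // back to every thread of the block.
        //
        //   __shared__ volatile float smem[256];
        //   float total = Block::reduce<256>(smem, partial, Sum());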
        template <typename T, class BinOp>
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            // First, fold everything beyond the block's thread count down
            // into the leading sft elements.
            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);

                __syncthreads();

                n = sft;
            }

            // Then halve the range until a single element remains in data[0].
            while (n > 1)
            {
                unsigned int half = n/2;

                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);

                __syncthreads();

                n = n - half;
            }
        }
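        // Usage sketch (sizes and Sum as above): unlike reduce<CTA_SIZE>(),
        // reduce_n() copes with an element count that is neither a power of
        // two nor tied to the block size; the result is left in data[0].
        //
        //   __shared__ float data[300]; // filled cooperatively beforehand
        //   Block::sync();
        //   Block::reduce_n(data, 300u, Sum());
        //   Block::sync();
        //   float total = data[0];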
    };
}}}

#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */