#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
#define OPENCV_CUDA_DEVICE_BLOCK_HPP
namespace cv { namespace cuda { namespace device
{
    // Cooperative block-level primitives: every thread of the launching
    // block is expected to call these together.
    struct Block
    {
        static __device__ __forceinline__ unsigned int id()
        {
            return blockIdx.x;
        }
        static __device__ __forceinline__ unsigned int stride()
        {
            return blockDim.x * blockDim.y * blockDim.z;
        }
        static __device__ __forceinline__ void sync()
        {
            __syncthreads();
        }
        static __device__ __forceinline__ int flattenedThreadId()
        {
            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }
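
        // For a 16x16x1 block, flattenedThreadId() enumerates threads 0..255
        // in row-major order and stride() == 256.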
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            int STRIDE = stride();
            It t = beg + flattenedThreadId();

            for(; t < end; t += STRIDE)
                *t = value;
        }
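
        // Usage sketch (smem hypothetical): cooperatively zero shared memory,
        // then synchronize before reading it back:
        //   __shared__ float smem[256];
        //   Block::fill(smem, smem + 256, 0.0f);
        //   Block::sync();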
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            int STRIDE = stride();
            int tid = flattenedThreadId();
            value += tid;

            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                *t = value;
        }
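
        // Behaves like std::iota striped across the block: yota(buf, buf + n, T(10))
        // leaves buf holding 10, 11, 12, ... once all threads have finished.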
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = *t;
        }
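
        // e.g. staging a tile from global into shared memory (names hypothetical):
        //   Block::copy(gmem_tile, gmem_tile + TILE_SIZE, smem_tile);
        //   Block::sync();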
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = op(*t);
        }
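
        // Computes out[i] = op(in[i]) with iterations striped across threads,
        // e.g. Block::transform(in, in + n, out, op) for a unary functor op.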
        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
        {
            int STRIDE = stride();
            InIt1 t1 = beg1 + flattenedThreadId();
            InIt2 t2 = beg2 + flattenedThreadId();
            OutIt o = out + (t1 - beg1);

            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                *o = op(*t1, *t2);
        }
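
        // The binary overload zips two ranges: out[i] = op(beg1[i], beg2[i]),
        // e.g. an elementwise sum of two shared-memory arrays.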
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid];

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                // Warp-synchronous tail: no __syncthreads() within a single warp.
                if (CTA_SIZE >=  64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >=  32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >=  16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=   8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=   4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=   2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }
        }
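
        // Usage sketch (illustrative; smem and myVal are hypothetical):
        // block-wide sum for a 256-thread block, result left in smem[0]:
        //   __shared__ volatile int smem[256];
        //   smem[Block::flattenedThreadId()] = myVal;
        //   Block::sync();
        //   Block::reduce<256>(smem, plus<int>());  // plus<T> as in functional.hpp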
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid] = init;
            __syncthreads();

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >=  64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >=  32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >=  16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=   8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=   4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=   2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }
            __syncthreads();
            return buffer[0];
        }
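
        // Unlike the overload above, this one seeds the buffer from 'init' and
        // broadcasts: every thread returns the fully reduced value, e.g.
        //   int total = Block::reduce<256>(smem, myVal, op);  // names hypothetical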
        template <typename T, class BinOp>
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            // Fold the tail so at most stride() elements remain.
            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);
                __syncthreads();
                n = sft;
            }

            // Halve the range until a single element is left in data[0].
            while (n > 1)
            {
                unsigned int half = n/2;
                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);
                __syncthreads();
                n = n - half;
            }
        }
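
        // reduce_n handles sizes that are not powers of two: with a 256-thread
        // block, Block::reduce_n(data, 1000u, op) folds all 1000 elements into
        // data[0] (data and op hypothetical).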
    };
}}} // namespace cv { namespace cuda { namespace device

#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */