43
#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
44
#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
46
#include "../common.hpp"
47
#include "../vec_traits.hpp"
48
#include "../functional.hpp"
52
namespace
cv
{
namespace
cuda {
namespace
device
54
namespace
transform_detail
58
//! Vectorized I/O traits for unary transforms.
//! Maps the source/destination element types to the vector types used for a
//! single coalesced read/write of `shift` consecutive elements per thread.
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
    typedef typename TypeVec<T, shift>::vec_type read_type;   // one vectorized load from the source row
    typedef typename TypeVec<D, shift>::vec_type write_type;  // one vectorized store to the destination row
};
//! Vectorized I/O traits for binary transforms.
//! Same idea as UnaryReadWriteTraits, but with two source element types.
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
    typedef typename TypeVec<T1, shift>::vec_type read_type1;  // vectorized load from the first source row
    typedef typename TypeVec<T2, shift>::vec_type read_type2;  // vectorized load from the second source row
    typedef typename TypeVec<D, shift>::vec_type write_type;   // vectorized store to the destination row
};
//! Applies an operator element-wise to the components of a loaded vector,
//! honoring the per-pixel mask. Primary template; specialized below for each
//! supported vector width (1, 2, 3, 4 and 8 elements per thread).
template <int shift> struct OpUnroller;
//! Width-1 specialization: one element per thread.
template <> struct OpUnroller<1>
{
    // Unary form: dst.x = op(src.x) where the mask allows it.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
    }

    // Binary form: dst.x = op(src1.x, src2.x) where the mask allows it.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
    }
};
//! Width-2 specialization: two elements per thread, mask checked per element.
template <> struct OpUnroller<2>
{
    // Unary form over the .x/.y components of a 2-element vector.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
    }

    // Binary form over the .x/.y components of two 2-element vectors.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
    }
};
//! Width-3 specialization: three elements per thread, mask checked per element.
template <> struct OpUnroller<3>
{
    // Unary form over the .x/.y/.z components of a 3-element vector.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src.z);
    }

    // Binary form over the .x/.y/.z components of two 3-element vectors.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src1.z, src2.z);
    }
};
//! Width-4 specialization: four elements per thread, mask checked per element.
template <> struct OpUnroller<4>
{
    // Unary form over the .x/.y/.z/.w components of a 4-element vector.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src.z);
        if (mask(y, x_shifted + 3))
            dst.w = op(src.w);
    }

    // Binary form over the .x/.y/.z/.w components of two 4-element vectors.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.x = op(src1.x, src2.x);
        if (mask(y, x_shifted + 1))
            dst.y = op(src1.y, src2.y);
        if (mask(y, x_shifted + 2))
            dst.z = op(src1.z, src2.z);
        if (mask(y, x_shifted + 3))
            dst.w = op(src1.w, src2.w);
    }
};
//! Width-8 specialization: eight elements per thread; the vector type exposes
//! fields a0..a7, each guarded by its own mask test.
template <> struct OpUnroller<8>
{
    // Unary form over the a0..a7 fields of an 8-element aggregate.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.a0 = op(src.a0);
        if (mask(y, x_shifted + 1))
            dst.a1 = op(src.a1);
        if (mask(y, x_shifted + 2))
            dst.a2 = op(src.a2);
        if (mask(y, x_shifted + 3))
            dst.a3 = op(src.a3);
        if (mask(y, x_shifted + 4))
            dst.a4 = op(src.a4);
        if (mask(y, x_shifted + 5))
            dst.a5 = op(src.a5);
        if (mask(y, x_shifted + 6))
            dst.a6 = op(src.a6);
        if (mask(y, x_shifted + 7))
            dst.a7 = op(src.a7);
    }

    // Binary form over the a0..a7 fields of two 8-element aggregates.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
            dst.a0 = op(src1.a0, src2.a0);
        if (mask(y, x_shifted + 1))
            dst.a1 = op(src1.a1, src2.a1);
        if (mask(y, x_shifted + 2))
            dst.a2 = op(src1.a2, src2.a2);
        if (mask(y, x_shifted + 3))
            dst.a3 = op(src1.a3, src2.a3);
        if (mask(y, x_shifted + 4))
            dst.a4 = op(src1.a4, src2.a4);
        if (mask(y, x_shifted + 5))
            dst.a5 = op(src1.a5, src2.a5);
        if (mask(y, x_shifted + 6))
            dst.a6 = op(src1.a6, src2.a6);
        if (mask(y, x_shifted + 7))
            dst.a7 = op(src1.a7, src2.a7);
    }
};
//! Vectorized unary transform kernel.
//! Each thread handles ft::smart_shift consecutive elements of one row: when
//! the whole vector lies inside the row it is moved with a single coalesced
//! load/store; the row tail falls back to scalar element accesses.
//! Precondition (enforced by TransformDispatcher<true>): src_/dst_ data
//! pointers and steps are aligned to ft::smart_shift * sizeof(element).
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
    typedef TransformFunctorTraits<UnOp> ft;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
    typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // index of this thread's first element

    if (y < src_.rows)
    {
        const T* src = src_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src_.cols)
        {
            // Fast path: the full vector fits in the row — one coalesced
            // read/write, masking handled per element by OpUnroller.
            const read_type src_n_el = ((const read_type*)src)[x];
            OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
        }
        else
        {
            // Row tail: process the remaining elements one at a time.
            for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src[real_x]);
            }
        }
    }
}
//! Scalar unary transform kernel: one element per thread with full bounds and
//! mask checks. Used when alignment rules out the vectorized kernel.
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < src.cols && y < src.rows && mask(y, x))
    {
        dst.ptr(y)[x] = op(src.ptr(y)[x]);
    }
}
//! Vectorized binary transform kernel.
//! Same strategy as the unary transformSmart, with two source rows: a single
//! coalesced load per source (and one store) when the whole vector fits in
//! the row, scalar fallback for the row tail.
//! Precondition (enforced by TransformDispatcher<true>): all data pointers
//! and steps are aligned to ft::smart_shift * sizeof(element).
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
    const Mask mask, const BinOp op)
{
    typedef TransformFunctorTraits<BinOp> ft;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
    typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int x_shifted = x * ft::smart_shift;  // index of this thread's first element

    if (y < src1_.rows)
    {
        const T1* src1 = src1_.ptr(y);
        const T2* src2 = src2_.ptr(y);
        D* dst = dst_.ptr(y);

        if (x_shifted + ft::smart_shift - 1 < src1_.cols)
        {
            // Fast path: one coalesced read per source and one coalesced write.
            const read_type1 src1_n_el = ((const read_type1*)src1)[x];
            const read_type2 src2_n_el = ((const read_type2*)src2)[x];

            OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
        }
        else
        {
            // Row tail: process the remaining elements one at a time.
            for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
            {
                if (mask(y, real_x))
                    dst[real_x] = op(src1[real_x], src2[real_x]);
            }
        }
    }
}
//! Scalar binary transform kernel: one element per thread with full bounds
//! and mask checks. Used when alignment rules out the vectorized kernel.
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
    const Mask mask, const BinOp op)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < src1.cols && y < src1.rows && mask(y, x))
    {
        const T1 src1_data = src1.ptr(y)[x];
        const T2 src2_data = src2.ptr(y)[x];
        dst.ptr(y)[x] = op(src1_data, src2_data);
    }
}
//! Host-side launcher selecting between the simple (scalar) kernels and the
//! smart (vectorized) kernels. Specialized below on UseSmart.
template <bool UseSmart> struct TransformDispatcher;
//! Simple dispatcher: always launches the scalar one-element-per-thread
//! kernels. No alignment requirements on the inputs.
template<> struct TransformDispatcher<false>
{
    // Launch the unary scalar kernel on `stream` (0 = default stream, which
    // additionally blocks until completion).
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

        transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        // Only synchronize on the default stream; user streams stay async.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    // Launch the binary scalar kernel on `stream`.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

        transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        // Only synchronize on the default stream; user streams stay async.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
//! Smart dispatcher: launches the vectorized kernels when every buffer's data
//! pointer and row step are aligned to the vector width; otherwise falls back
//! to the simple scalar dispatcher.
template<> struct TransformDispatcher<true>
{
    // Launch the unary vectorized kernel, or fall back on misalignment.
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        // A smart_shift of 1 means the functor gains nothing from
        // vectorization — such functors must use TransformDispatcher<false>.
        CV_StaticAssert(ft::smart_shift != 1, "");

        if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            // Vectorized loads would be misaligned: use the scalar path.
            TransformDispatcher<false>::call(src, dst, op, mask, stream);
            return;
        }

        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        // Each thread covers smart_shift columns, so shrink the grid width.
        const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

        transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        // Only synchronize on the default stream; user streams stay async.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    // Launch the binary vectorized kernel, or fall back on misalignment.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        CV_StaticAssert(ft::smart_shift != 1, "");

        if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
            !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
            !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
        {
            // Vectorized loads would be misaligned: use the scalar path.
            TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
            return;
        }

        const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        // Each thread covers smart_shift columns, so shrink the grid width.
        const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

        transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        // Only synchronize on the default stream; user streams stay async.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
static int divUp(int a, unsigned int b)
Integer division with result round up.
Definition:
utility.hpp:482
static bool isAligned(const T &data)
Alignment check of passed values
Definition:
utility.hpp:517
"black box" representation of the file storage associated with a file on disk.
Definition:
aruco.hpp:75