#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
#define OPENCV_CUDA_REDUCE_DETAIL_HPP

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace reduce_detail
    {
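        // GetType strips the pointer/reference qualifiers off the storage
        // types used below (T*, volatile T*, T&), recovering the underlying
        // value type T for register temporaries.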
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };
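
        // For<I, N> is a compile-time loop over the elements of a
        // thrust::tuple: each member applies its operation to element I and
        // recurses on I + 1. The For<N, N> specialization below terminates
        // the recursion with empty bodies.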
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };
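
        // Shared-memory load/store helpers. The scalar overloads copy a
        // single value; the thrust::tuple overloads fan the copy out across
        // all tuple elements via For<0, tuple_size>.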
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }
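
        // merge combines the value held by thread tid with its partner at
        // tid + delta and writes the result back to both val and smem.
        // mergeShfl performs the same exchange with a warp shuffle
        // (shfl_down), avoiding the shared-memory round trip entirely.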
        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
template
<
typename
P0,
typename
P1,
typename
P2,
typename
P3,
typename
P4,
typename
P5,
typename
P6,
typename
P7,
typename
P8,
typename
P9,
167
typename
R0,
typename
R1,
typename
R2,
typename
R3,
typename
R4,
typename
R5,
typename
R6,
typename
R7,
typename
R8,
typename
R9,
168
class
Op0,
class
Op1,
class
Op2,
class
Op3,
class
Op4,
class
Op5,
class
Op6,
class
Op7,
class
Op8,
class
Op9>
169
__device__ __forceinline__
void
merge(
const
thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
170
const
thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
173
const
thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
175
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>
::merge(smem, val, tid, delta, op);
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }
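
        // Generic<N> is the fallback strategy: a classic shared-memory tree
        // reduction. Each pass halves the stride, with __syncthreads()
        // separating passes while more than one warp is still active; the
        // final strides (16 down to 1) run within a single warp and rely on
        // warp-synchronous execution over volatile shared memory.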
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };
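
        // Unroll<I> emits a fully unrolled sequence of merge steps with
        // strides I, I/2, ..., 1; the Unroll<0, ...> specialization stops the
        // recursion. loopShfl is the shuffle-based variant, loop the
        // shared-memory variant.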
        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };
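
        // WarpOptimized<N> handles power-of-two block sizes that fit in a
        // single warp (N <= 32). On compute capability >= 3.0 the whole
        // reduction runs in registers via warp shuffles and shared memory is
        // never touched; older architectures fall back to the shared-memory
        // loop.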
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                CV_UNUSED(smem);
                CV_UNUSED(tid);

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };
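
        // GenericOptimized32<N> is a two-stage reduction for power-of-two
        // block sizes up to 1024: each warp first reduces its own 32 values
        // (via shuffles on CC >= 3.0, shared memory otherwise), lane 0 of
        // every warp writes its partial result to smem[tid / 32], and the
        // first warp then reduces those M = N / 32 partials.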
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };
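
        // Compile-time plumbing for strategy selection: StaticIf chooses
        // between two types on a boolean condition (the C++03 equivalent of
        // std::conditional), and IsPowerOf2 uses the N & (N - 1) bit trick.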
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };
        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
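
        // Usage sketch (hypothetical; the supported entry point is the
        // reduce<N>() wrapper in ../reduce.hpp, which forwards here): a
        // kernel with power-of-two block size N picks its strategy through
        // Dispatcher and reduces one value per thread, leaving the block's
        // result in thread 0's val:
        //
        //   template <unsigned int N, typename T, class Op>
        //   __global__ void reduceKernel(const T* src, T* dst, Op op)
        //   {
        //       __shared__ T smem[N];
        //       T val = src[blockIdx.x * N + threadIdx.x];
        //       reduce_detail::Dispatcher<N>::reductor::template
        //           reduce<volatile T*, T&, const Op&>(smem, val, threadIdx.x, op);
        //       if (threadIdx.x == 0)
        //           dst[blockIdx.x] = val;
        //   }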
    }
}}}

#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP