#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP

#include <thrust/tuple.h>

#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace cuda { namespace device
{
    namespace reduce_key_val_detail
    {
//////////////////////////////////////////////////////
// GetType

// Strips pointer/reference qualification from a type:
// GetType<T*>::type, GetType<volatile T*>::type and GetType<T&>::type are all T.
// Used below to declare register temporaries matching a tuple element's value type.
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
    typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
    typedef T type;
};
template <typename T> struct GetType<T&>
{
    typedef T type;
};
// Compile-time loop over the elements of a thrust::tuple: each static method
// processes element I and recurses as For<I + 1, N>; the For<N, N>
// specialization below terminates the recursion.
template <unsigned int I, unsigned int N>
struct For
{
    // Store element I of the data tuple into slot tid of its shared-memory array.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(smem)[tid] = thrust::get<I>(data);

        For<I + 1, N>::loadToSmem(smem, data, tid);
    }
    // Load element I from shared memory back into the data tuple.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(data) = thrust::get<I>(smem)[tid];

        For<I + 1, N>::loadFromSmem(smem, data, tid);
    }

    // Replace element I with the value held by the lane `delta` lanes below (warp shuffle).
    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
    {
        thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

        For<I + 1, N>::copyShfl(val, delta, width);
    }
    // Copy element I from shared-memory slot tid + delta into slot tid (and into val).
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
    {
        thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

        For<I + 1, N>::copy(svals, val, tid, delta);
    }

    // Shuffle-based merge of key/value element I: the neighbour's key (and its
    // value) win when comparator element I prefers the neighbour's key.
    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
    {
        typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(key) = reg;
            thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
        }

        For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
    }
    // Shared-memory merge of key/value element I between slots tid and tid + delta.
    // NOTE(review): the garbled source dropped the `const CmpTuple& cmp` parameter
    // even though the body uses it; restored here.
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                 const ValPointerTuple& svals, const ValReferenceTuple& val,
                                 const CmpTuple& cmp,
                                 unsigned int tid, unsigned int delta)
    {
        typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
            thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
        }

        For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
    }
};
// Recursion terminator: when I == N every operation is a no-op.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }

    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
    {
    }

    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
    {
    }
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                 const ValPointerTuple&, const ValReferenceTuple&,
                                 const CmpTuple&,
                                 unsigned int, unsigned int)
    {
    }
};
//////////////////////////////////////////////////////
// loadToSmem / loadFromSmem

// Store a single value into shared memory at slot tid.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
{
    smem[tid] = data;
}
// Load a single value from shared-memory slot tid into data.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
{
    data = smem[tid];
}
// Tuple overload: stores every element of `data` into its matching
// shared-memory array. (Restored the `unsigned int tid` parameter the
// garbled source dropped.)
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                           const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
}
// Tuple overload: loads every element of `data` back from its matching
// shared-memory array.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
}
//////////////////////////////////////////////////////
// copyVals

// Pull the value held by the lane `delta` lanes below the current one (warp shuffle).
template <typename V>
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
{
    val = shfl_down(val, delta, width);
}
// Copy the value stored at shared-memory slot tid + delta into slot tid
// (and into the register copy val).
template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
{
    svals[tid] = val = svals[tid + delta];
}
// Tuple overload: shuffles every element of the value tuple down by `delta` lanes.
// (Restored the delta/width parameters the garbled source dropped.)
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int delta,
                                             int width)
{
    For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
}
// Tuple overload: copies every element from slot tid + delta to slot tid.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                         const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                         unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
}
//////////////////////////////////////////////////////
// merge

// Compare this lane's key against the key `delta` lanes below; adopt the
// neighbour's key and value when the comparator prefers the neighbour.
// (Restored the `if (cmp(reg, key))` guard the garbled source dropped.)
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
{
    K reg = shfl_down(key, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        copyValsShfl(val, delta, width);
    }
}
// Shared-memory merge: compare the key at slot tid + delta with this thread's
// key; when the comparator prefers the neighbour, adopt its key and value and
// write both back to slot tid.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K reg = skeys[tid + delta];

    if (cmp(reg, key))
    {
        skeys[tid] = key = reg;
        copyVals(svals, val, tid, delta);
    }
}
// Scalar key + tuple of values, shuffle variant. (Restored the `class Cmp`
// template parameter and `const Cmp& cmp` argument the garbled source dropped.)
template <typename K,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const Cmp& cmp,
                                          unsigned int delta, int width)
{
    K reg = shfl_down(key, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        copyValsShfl(val, delta, width);
    }
}
// Scalar key + tuple of values, shared-memory variant. (Restored the
// `class Cmp` template parameter the garbled source dropped.)
template <typename K,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K reg = skeys[tid + delta];

    if (cmp(reg, key))
    {
        skeys[tid] = key = reg;
        copyVals(svals, val, tid, delta);
    }
}
// Fully tuple-typed shuffle merge: a per-element comparator decides each
// key/value slot independently; delegates to the compile-time For loop.
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
}
// Fully tuple-typed shared-memory merge: per-element comparators, key and
// value arrays all supplied as tuples; delegates to the compile-time For loop.
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
          typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                      const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                      unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
//////////////////////////////////////////////////////
// Generic

// Fallback tree reduction for an arbitrary block size N. Each stanza halves
// the number of active threads and synchronizes; the final sub-warp steps
// (delta <= 16) need no barrier because they stay within one warp.
// NOTE(review): the source called the undefined `loadValsToSmem`; replaced
// with `loadToSmem`, matching the overload set defined above and the usage in
// WarpOptimized / GenericOptimized32 — confirm against upstream.
template <unsigned int N> struct Generic
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(skeys, key, svals, val, cmp, tid, 1024);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(skeys, key, svals, val, cmp, tid, 512);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(skeys, key, svals, val, cmp, tid, 256);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(skeys, key, svals, val, cmp, tid, 128);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(skeys, key, svals, val, cmp, tid, 64);

            __syncthreads();
        }
        if (N >= 64)
        {
            if (tid < 32)
                merge(skeys, key, svals, val, cmp, tid, 32);

            __syncthreads();
        }

        if (tid < 16)
        {
            merge(skeys, key, svals, val, cmp, tid, 16);
            merge(skeys, key, svals, val, cmp, tid, 8);
            merge(skeys, key, svals, val, cmp, tid, 4);
            merge(skeys, key, svals, val, cmp, tid, 2);
            merge(skeys, key, svals, val, cmp, tid, 1);
        }
    }
};
// Compile-time-unrolled halving loop: performs merges with deltas
// I, I/2, ..., 1; terminated by the Unroll<0, ...> specialization.
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
struct Unroll
{
    // Shuffle-based variant operating on a warp of logical width N.
    static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
    {
        mergeShfl(key, val, cmp, I, N);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
    }
    // Shared-memory variant.
    static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        merge(skeys, key, svals, val, cmp, tid, I);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
    }
};
// Unroll terminator: delta 0 means the reduction is finished; both loops no-op.
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
    static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
    {
    }
    static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
    {
    }
};
// Reduction for N <= 32 threads (a single warp, N a power of two): no
// __syncthreads() is needed. On SM30+ the shuffle path keeps all data in
// registers; otherwise data round-trips through shared memory.
template <unsigned int N> struct WarpOptimized
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
    #if __CUDA_ARCH__ >= 300
        (void) skeys; // unused on the shuffle path
        (void) svals;
        (void) tid;

        Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
    #else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
    #endif
    }
};
// Reduction for power-of-two N in (32, 1024]: first reduce each warp
// independently, stash one partial per warp in shared memory, then have the
// first warp reduce the M = N/32 partials.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };

    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        const unsigned int laneId = Warp::laneId();

    #if __CUDA_ARCH__ >= 300
        // Intra-warp reduction entirely via shuffles; lane 0 publishes the
        // warp's partial at slot tid / 32.
        Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (laneId == 0)
            Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

        __syncthreads();

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #endif

        __syncthreads();

        loadFromSmem(skeys, key, tid);

        // First warp reduces the per-warp partials.
        if (tid < 32)
        {
        #if __CUDA_ARCH__ >= 300
            loadFromSmem(svals, val, tid);

            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
        #else
            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        #endif
        }
    }
};
// Compile-time type selector: StaticIf<cond, T1, T2>::type is T1 when cond
// is true, T2 otherwise (pre-C++11 equivalent of std::conditional).
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
    typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
    typedef T2 type;
};
// Compile-time power-of-two test (N != 0 and N has a single set bit).
template <unsigned int N> struct IsPowerOf2
{
    enum { value = ((N != 0) && !(N & (N - 1))) };
};
// Picks the best reduction strategy for a block of N threads:
//   N <= 32 and power of two   -> WarpOptimized (single warp, no barriers)
//   N <= 1024 and power of two -> GenericOptimized32 (per-warp partials)
//   otherwise                  -> Generic (barrier-based tree)
// NOTE(review): the tail of this typedef was truncated in the source;
// reconstructed from the visible StaticIf nesting — confirm against upstream.
template <unsigned int N> struct Dispatcher
{
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        typename StaticIf<
            (N <= 1024) && IsPowerOf2<N>::value,
            GenericOptimized32<N>,
            Generic<N>
        >::type
    >::type reductor;
};
    }
}}}

#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP