43
#ifndef OPENCV_CUDA_WARP_SHUFFLE_HPP
 
44
#define OPENCV_CUDA_WARP_SHUFFLE_HPP
 
52
namespace
cv
{
namespace
cuda {
namespace
device
 
// Compatibility shims for CUDA 9 and newer: map the legacy mask-less shuffle
// names used below onto the *_sync intrinsics. The mask 0xFFFFFFFF names all
// 32 lanes of the warp as participants, so every lane of the warp is expected
// to reach the call.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
#  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif
 
/**
 * Generic warp shuffle: forwards to __shfl so the calling lane receives the
 * copy of `val` held by lane `srcLane`.
 *
 * @param val     value contributed by the calling lane
 * @param srcLane lane index to read from
 * @param width   size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or a value-initialized T when the code is not
 *         compiled for compute capability 3.0 or higher
 */
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return __shfl(val, srcLane, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return T();
#endif
}
 
/**
 * Warp shuffle overload for unsigned int. The value is cast to int for the
 * __shfl call and the result is cast back to unsigned int.
 *
 * @param val     value contributed by the calling lane
 * @param srcLane lane index to read from
 * @param width   size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0 when the code is not compiled for compute
 *         capability 3.0 or higher
 */
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return (unsigned int) __shfl((int) val, srcLane, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0;
#endif
}
 
/**
 * Warp shuffle overload for double. The value is split into its low and high
 * 32-bit halves, each half is shuffled separately as an int, and the two
 * results are recombined into a double.
 *
 * @param val     value contributed by the calling lane
 * @param srcLane lane index to read from
 * @param width   size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0.0 when the code is not compiled for
 *         compute capability 3.0 or higher
 */
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    // Both halves use the same srcLane/width, so they come from the same lane.
    lo = __shfl(lo, srcLane, width);
    hi = __shfl(hi, srcLane, width);

    return __hiloint2double(hi, lo);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0.0;
#endif
}
 
/**
 * Generic downward warp shuffle: forwards to __shfl_down so the calling lane
 * receives the copy of `val` held by the lane `delta` positions above it.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_down
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or a value-initialized T when the code is not
 *         compiled for compute capability 3.0 or higher
 */
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return __shfl_down(val, delta, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return T();
#endif
}
 
/**
 * Downward warp shuffle overload for unsigned int. The value is cast to int
 * for the __shfl_down call and the result is cast back to unsigned int.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_down
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0 when the code is not compiled for compute
 *         capability 3.0 or higher
 */
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return (unsigned int) __shfl_down((int) val, delta, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0;
#endif
}
 
/**
 * Downward warp shuffle overload for double. The value is split into its low
 * and high 32-bit halves, each half is shuffled separately as an int, and the
 * two results are recombined into a double.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_down
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0.0 when the code is not compiled for
 *         compute capability 3.0 or higher
 */
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    // Both halves use the same delta/width, so they come from the same lane.
    lo = __shfl_down(lo, delta, width);
    hi = __shfl_down(hi, delta, width);

    return __hiloint2double(hi, lo);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0.0;
#endif
}
 
/**
 * Generic upward warp shuffle: forwards to __shfl_up so the calling lane
 * receives the copy of `val` held by the lane `delta` positions below it.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_up
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or a value-initialized T when the code is not
 *         compiled for compute capability 3.0 or higher
 */
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return __shfl_up(val, delta, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return T();
#endif
}
 
/**
 * Upward warp shuffle overload for unsigned int. The value is cast to int for
 * the __shfl_up call and the result is cast back to unsigned int.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_up
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0 when the code is not compiled for compute
 *         capability 3.0 or higher
 */
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return (unsigned int) __shfl_up((int) val, delta, width);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0;
#endif
}
 
/**
 * Upward warp shuffle overload for double. The value is split into its low
 * and high 32-bit halves, each half is shuffled separately as an int, and the
 * two results are recombined into a double.
 *
 * @param val   value contributed by the calling lane
 * @param delta lane offset passed to __shfl_up
 * @param width size of the shuffle sub-group (defaults to warpSize)
 * @return the shuffled value, or 0.0 when the code is not compiled for
 *         compute capability 3.0 or higher
 */
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    // Both halves use the same delta/width, so they come from the same lane.
    lo = __shfl_up(lo, delta, width);
    hi = __shfl_up(hi, delta, width);

    return __hiloint2double(hi, lo);
#else
    // Shuffle path is compiled out; give every path a defined return value.
    return 0.0;
#endif
}
 
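// NOTE(review): the text below is documentation-tooltip residue that does not
// belong to this header; kept as a comment so it is not parsed as code.
//   "black box" representation of the file storage associated with a file on disk.
//   Definition: aruco.hpp:75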