OpenCV 4.5.3(日本語機械翻訳)
scan.hpp
[詳解]
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #ifndef OPENCV_CUDA_SCAN_HPP
44 #define OPENCV_CUDA_SCAN_HPP
45
50
56
57 namespace cv { namespace cuda { namespace device
58{
59 enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
60
61 template <ScanKind Kind, typename T, typename F> struct WarpScan
62 {
63 __device__ __forceinline__ WarpScan() {}
64 __device__ __forceinline__ WarpScan(const WarpScan& other) { CV_UNUSED(other); }
65
66 __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
67 {
68 const unsigned int lane = idx & 31;
69 F op;
70
71 if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
72 if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
73 if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
74 if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
75 if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
76
77 if( Kind == INCLUSIVE )
78 return ptr [idx];
79 else
80 return (lane > 0) ? ptr [idx - 1] : 0;
81 }
82
83 __device__ __forceinline__ unsigned int index(const unsigned int tid)
84 {
85 return tid;
86 }
87
88 __device__ __forceinline__ void init(volatile T *ptr){}
89
90 static const int warp_offset = 0;
91
92 typedef WarpScan<INCLUSIVE, T, F> merge;
93 };
94
95 template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
96 {
97 __device__ __forceinline__ WarpScanNoComp() {}
98 __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { CV_UNUSED(other); }
99
100 __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
101 {
102 const unsigned int lane = threadIdx.x & 31;
103 F op;
104
105 ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
106 ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
107 ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
108 ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
109 ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
110
111 if( Kind == INCLUSIVE )
112 return ptr [idx];
113 else
114 return (lane > 0) ? ptr [idx - 1] : 0;
115 }
116
117 __device__ __forceinline__ unsigned int index(const unsigned int tid)
118 {
119 return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
120 }
121
122 __device__ __forceinline__ void init(volatile T *ptr)
123 {
124 ptr[threadIdx.x] = 0;
125 }
126
127 static const int warp_smem_stride = 32 + 16 + 1;
128 static const int warp_offset = 16;
129 static const int warp_log = 5;
130 static const int warp_mask = 31;
131
132 typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
133 };
134
135 template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
136 {
137 __device__ __forceinline__ BlockScan() {}
138 __device__ __forceinline__ BlockScan(const BlockScan& other) { CV_UNUSED(other); }
139
140 __device__ __forceinline__ T operator()(volatile T *ptr)
141 {
142 const unsigned int tid = threadIdx.x;
143 const unsigned int lane = tid & warp_mask;
144 const unsigned int warp = tid >> warp_log;
145
146 Sc scan;
147 typename Sc::merge merge_scan;
148 const unsigned int idx = scan.index(tid);
149
150 T val = scan(ptr, idx);
151 __syncthreads ();
152
153 if( warp == 0)
154 scan.init(ptr);
155 __syncthreads ();
156
157 if( lane == 31 )
158 ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
159 __syncthreads ();
160
161 if( warp == 0 )
162 merge_scan(ptr, idx);
163 __syncthreads();
164
165 if ( warp > 0)
166 val = ptr [scan.warp_offset + warp - 1] + val;
167 __syncthreads ();
168
169 ptr[idx] = val;
170 __syncthreads ();
171
172 return val ;
173 }
174
175 static const int warp_log = 5;
176 static const int warp_mask = 31;
177 };
178
179 template <typename T>
180 __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
181 {
182 #if __CUDA_ARCH__ >= 300
183 const unsigned int laneId = cv::cuda::device::Warp::laneId();
184
185 // scan on shuffl functions
186 #pragma unroll
187 for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
188 {
189 const T n = cv::cuda::device::shfl_up(idata, i);
190 if (laneId >= i)
191 idata += n;
192 }
193
194 return idata;
195 #else
196 unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
197 s_Data[pos] = 0;
198 pos += OPENCV_CUDA_WARP_SIZE;
199 s_Data[pos] = idata;
200
201 s_Data[pos] += s_Data[pos - 1];
202 s_Data[pos] += s_Data[pos - 2];
203 s_Data[pos] += s_Data[pos - 4];
204 s_Data[pos] += s_Data[pos - 8];
205 s_Data[pos] += s_Data[pos - 16];
206
207 return s_Data[pos];
208 #endif
209 }
210
211 template <typename T>
212 __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
213 {
214 return warpScanInclusive(idata, s_Data, tid) - idata;
215 }
216
217 template <int tiNumScanThreads, typename T>
218 __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
219 {
220 if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
221 {
222 //Bottom-level inclusive warp scan
223 T warpResult = warpScanInclusive(idata, s_Data, tid);
224
225 //Save top elements of each warp for exclusive warp scan
226 //sync to wait for warp scans to complete (because s_Data is being overwritten)
227 __syncthreads();
228 if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
229 {
230 s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
231 }
232
233 //wait for warp scans to complete
234 __syncthreads();
235
236 if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
237 {
238 //grab top warp elements
239 T val = s_Data[tid];
240 //calculate exclusive scan and write back to shared memory
241 s_Data[tid] = warpScanExclusive(val, s_Data, tid);
242 }
243
244 //return updated warp scans with exclusive scan results
245 __syncthreads();
246
247 return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
248 }
249 else
250 {
251 return warpScanInclusive(idata, s_Data, tid);
252 }
253 }
254}}}
255
257
258 #endif // OPENCV_CUDA_SCAN_HPP
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
cv
"black box" representation of the file storage associated with a file on disk.
Definition: aruco.hpp:75