1 | /* |
2 | * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. |
3 | * |
4 | * NOTICE TO LICENSEE: |
5 | * |
6 | * This source code and/or documentation ("Licensed Deliverables") are |
7 | * subject to NVIDIA intellectual property rights under U.S. and |
8 | * international Copyright laws. |
9 | * |
10 | * These Licensed Deliverables contained herein is PROPRIETARY and |
11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and |
12 | * conditions of a form of NVIDIA software license agreement by and |
13 | * between NVIDIA and Licensee ("License Agreement") or electronically |
14 | * accepted by Licensee. Notwithstanding any terms or conditions to |
15 | * the contrary in the License Agreement, reproduction or disclosure |
16 | * of the Licensed Deliverables to any third party without the express |
17 | * written consent of NVIDIA is prohibited. |
18 | * |
19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE |
21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS |
22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. |
23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED |
24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, |
25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. |
26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY |
28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY |
29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
32 | * OF THESE LICENSED DELIVERABLES. |
33 | * |
34 | * U.S. Government End Users. These Licensed Deliverables are a |
35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT |
36 | * 1995), consisting of "commercial computer software" and "commercial |
37 | * computer software documentation" as such terms are used in 48 |
38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government |
39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and |
40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all |
41 | * U.S. Government End Users acquire the Licensed Deliverables with |
42 | * only those rights set forth herein. |
43 | * |
44 | * Any use of the Licensed Deliverables in individual and commercial |
45 | * software must include, in the user documentation and internal |
46 | * comments to the code, the above Disclaimer and U.S. Government End |
47 | * Users Notice. |
48 | */ |
49 | |
50 | #ifndef __cuda_cuda_h__ |
51 | #define __cuda_cuda_h__ |
52 | |
53 | #include <stdlib.h> |
/*
 * Fixed-width unsigned integer aliases used throughout this header.
 * Toolchains other than MSVC take them from <stdint.h>; MSVC builds use
 * the compiler's built-in sized integer keywords instead.
 */
#if !defined(_MSC_VER)
#include <stdint.h>
typedef uint32_t cuuint32_t;
typedef uint64_t cuuint64_t;
#else
typedef unsigned __int32 cuuint32_t;
typedef unsigned __int64 cuuint64_t;
#endif
62 | |
/*
 * __CUDA_DEPRECATED: decoration placed on deprecated declarations.
 * It expands to nothing when any of __CUDA_API_VERSION_INTERNAL,
 * __DOXYGEN_ONLY__ or CUDA_ENABLE_DEPRECATED is defined; otherwise it
 * expands to the compiler's "deprecated" attribute when one is known.
 */
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#  define __CUDA_DEPRECATED
#else
#  if defined(_MSC_VER)
#    define __CUDA_DEPRECATED __declspec(deprecated)
#  elif defined(__GNUC__)
#    define __CUDA_DEPRECATED __attribute__((deprecated))
#  else
     /* Unknown compiler: no deprecation attribute available. */
#    define __CUDA_DEPRECATED
#  endif
#endif
72 | |
/* Stop the build if the caller still defines CUDA_FORCE_API_VERSION. */
#ifdef CUDA_FORCE_API_VERSION
#  error "CUDA_FORCE_API_VERSION is no longer supported."
#endif
76 | |
/*
 * Symbol-name decorators for the per-thread default stream variants.
 *
 * Without __CUDA_API_VERSION_INTERNAL and CUDA_API_PER_THREAD_DEFAULT_STREAM
 * both wrappers return their argument unchanged. With either one defined,
 * __CUDA_API_PER_THREAD_DEFAULT_STREAM is defined and the wrappers paste a
 * "_ptds" or "_ptsz" suffix onto the given name.
 */
#if !defined(__CUDA_API_VERSION_INTERNAL) && !defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#  define __CUDA_API_PTDS(api) api
#  define __CUDA_API_PTSZ(api) api
#else
#  define __CUDA_API_PER_THREAD_DEFAULT_STREAM
#  define __CUDA_API_PTDS(api) api ## _ptds
#  define __CUDA_API_PTSZ(api) api ## _ptsz
#endif
85 | |
/*
 * Versioned-symbol mapping.
 *
 * Each macro below rewrites an unversioned API name to a specific versioned
 * symbol name (_v2 / _v3). Entries wrapped in __CUDA_API_PTDS() or
 * __CUDA_API_PTSZ() additionally receive a "_ptds" / "_ptsz" suffix when
 * __CUDA_API_PER_THREAD_DEFAULT_STREAM is in effect, and are left as the
 * plain versioned name otherwise.
 */
#define cuDeviceTotalMem cuDeviceTotalMem_v2
#define cuCtxCreate cuCtxCreate_v2
/* Self-referential: expands to itself, so no renaming occurs; the name is
 * merely visible to #ifdef / defined(). */
#define cuCtxCreate_v3 cuCtxCreate_v3
#define cuModuleGetGlobal cuModuleGetGlobal_v2
#define cuMemGetInfo cuMemGetInfo_v2
#define cuMemAlloc cuMemAlloc_v2
#define cuMemAllocPitch cuMemAllocPitch_v2
#define cuMemFree cuMemFree_v2
#define cuMemGetAddressRange cuMemGetAddressRange_v2
#define cuMemAllocHost cuMemAllocHost_v2
#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
/* Memory copy and memset entry points: versioned and PTDS/PTSZ-decorated. */
#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)
#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)
#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2)
#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2)
#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2)
#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2)
#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2)
#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2)
#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2)
#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2)
#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2)
#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2)
#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2)
#define cuArrayCreate cuArrayCreate_v2
#define cuArrayGetDescriptor cuArrayGetDescriptor_v2
#define cuArray3DCreate cuArray3DCreate_v2
#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2
#define cuTexRefSetAddress cuTexRefSetAddress_v2
#define cuTexRefGetAddress cuTexRefGetAddress_v2
#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2
#define cuCtxDestroy cuCtxDestroy_v2
#define cuCtxPopCurrent cuCtxPopCurrent_v2
#define cuCtxPushCurrent cuCtxPushCurrent_v2
#define cuStreamDestroy cuStreamDestroy_v2
#define cuEventDestroy cuEventDestroy_v2
/* Note: this entry maps to _v3, unlike its neighbours. */
#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3
#define cuLinkCreate cuLinkCreate_v2
#define cuLinkAddData cuLinkAddData_v2
#define cuLinkAddFile cuLinkAddFile_v2
#define cuMemHostRegister cuMemHostRegister_v2
#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2
#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2
#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
/* Self-referential: expands to itself (see cuCtxCreate_v3 above in this list). */
#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2
#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2
#define cuGraphInstantiate cuGraphInstantiate_v2
146 | |
/*
 * Per-thread default stream redirection.
 *
 * Only active when __CUDA_API_PER_THREAD_DEFAULT_STREAM is defined. Each
 * macro rewrites an API name to the same name with a "_ptds" (via
 * __CUDA_API_PTDS) or "_ptsz" (via __CUDA_API_PTSZ) suffix. The name inside
 * the wrapper is not re-expanded because it is an operand of the ## paste.
 */
#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
/* Memory copies and prefetch */
#define cuMemcpy __CUDA_API_PTDS(cuMemcpy)
#define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync)
#define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer)
#define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync)
#define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer)
#define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
#define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync)

/* Asynchronous memsets */
#define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async)
#define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async)
#define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async)
#define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async)
#define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async)
#define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async)

/* Stream, event, launch and graphics-interop entry points */
#define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority)
#define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags)
#define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx)
#define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent)
#define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture)
#define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing)
#define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo)
#define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
#define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
#define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback)
#define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync)
#define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery)
#define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize)
#define cuEventRecord __CUDA_API_PTSZ(cuEventRecord)
#define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags)
#define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel)
#define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc)
#define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources)
#define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources)

/* Stream memory operations */
#define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32)
#define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32)
#define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64)
#define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64)
#define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp)

#define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel)

/* External semaphore operations */
#define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
#define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)

/* Graph upload/launch, stream attributes and array mapping */
#define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload)
#define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch)
#define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes)
#define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute)
#define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute)
#define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync)

/* Asynchronous allocation and free */
#define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync)
#define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync)
#define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
#endif
205 | |
206 | /** |
207 | * \file cuda.h |
208 | * \brief Header file for the CUDA Toolkit application programming interface. |
209 | * |
210 | * \file cudaGL.h |
211 | * \brief Header file for the OpenGL interoperability functions of the |
212 | * low-level CUDA driver application programming interface. |
213 | * |
214 | * \file cudaD3D9.h |
215 | * \brief Header file for the Direct3D 9 interoperability functions of the |
216 | * low-level CUDA driver application programming interface. |
217 | */ |
218 | |
219 | /** |
220 | * \defgroup CUDA_TYPES Data types used by CUDA driver |
221 | * @{ |
222 | */ |
223 | |
/**
 * CUDA API version number
 *
 * NOTE(review): 11040 presumably encodes release 11.4 as
 * (major * 1000 + minor * 10) -- confirm against the release notes.
 */
#define CUDA_VERSION 11040
228 | |
229 | #ifdef __cplusplus |
230 | extern "C" { |
231 | #endif |
232 | |
/**
 * CUDA device pointer
 *
 * CUdeviceptr is an unsigned integer type, not a C pointer. It is
 * `unsigned long long` when _WIN64 or __LP64__ is defined and
 * `unsigned int` otherwise, so that its size follows the pointer size of
 * the target platform.
 */
#if !defined(_WIN64) && !defined(__LP64__)
typedef unsigned int CUdeviceptr_v2;
#else
typedef unsigned long long CUdeviceptr_v2;
#endif
typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
243 | |
/* ---- Handles that are plain integer values ---- */
typedef int                CUdevice_v1;       /**< CUDA device */
typedef CUdevice_v1        CUdevice;          /**< CUDA device */
typedef unsigned long long CUtexObject_v1;    /**< An opaque value that represents a CUDA texture object */
typedef CUtexObject_v1     CUtexObject;       /**< An opaque value that represents a CUDA texture object */
typedef unsigned long long CUsurfObject_v1;   /**< An opaque value that represents a CUDA surface object */
typedef CUsurfObject_v1    CUsurfObject;      /**< An opaque value that represents a CUDA surface object */

/* ---- Handles that are pointers to incomplete (opaque) struct types ---- */
typedef struct CUctx_st              *CUcontext;            /**< CUDA context */
typedef struct CUmod_st              *CUmodule;             /**< CUDA module */
typedef struct CUfunc_st             *CUfunction;           /**< CUDA function */
typedef struct CUarray_st            *CUarray;              /**< CUDA array */
typedef struct CUmipmappedArray_st   *CUmipmappedArray;     /**< CUDA mipmapped array */
typedef struct CUtexref_st           *CUtexref;             /**< CUDA texture reference */
typedef struct CUsurfref_st          *CUsurfref;            /**< CUDA surface reference */
typedef struct CUevent_st            *CUevent;              /**< CUDA event */
typedef struct CUstream_st           *CUstream;             /**< CUDA stream */
typedef struct CUgraphicsResource_st *CUgraphicsResource;   /**< CUDA graphics interop resource */
typedef struct CUextMemory_st        *CUexternalMemory;     /**< CUDA external memory */
typedef struct CUextSemaphore_st     *CUexternalSemaphore;  /**< CUDA external semaphore */
typedef struct CUgraph_st            *CUgraph;              /**< CUDA graph */
typedef struct CUgraphNode_st        *CUgraphNode;          /**< CUDA graph node */
typedef struct CUgraphExec_st        *CUgraphExec;          /**< CUDA executable graph */
typedef struct CUmemPoolHandle_st    *CUmemoryPool;         /**< CUDA memory pool */
typedef struct CUuserObject_st       *CUuserObject;         /**< CUDA user object for graphs */
267 | |
/*
 * CUDA definition of UUID: 16 raw bytes.
 * Guarded by CU_UUID_HAS_BEEN_DEFINED so the type is only declared once,
 * even if another header has already provided it.
 */
#if !defined(CU_UUID_HAS_BEEN_DEFINED)
#define CU_UUID_HAS_BEEN_DEFINED
struct CUuuid_st {
    char bytes[16];
};
typedef struct CUuuid_st CUuuid;
#endif
274 | |
/**
 * Size in bytes of the reserved payload carried by the CUDA IPC handles
 */
#define CU_IPC_HANDLE_SIZE 64

/**
 * CUDA IPC event handle: CU_IPC_HANDLE_SIZE reserved bytes
 */
struct CUipcEventHandle_st {
    char reserved[CU_IPC_HANDLE_SIZE];
};
typedef struct CUipcEventHandle_st CUipcEventHandle_v1;
typedef CUipcEventHandle_v1 CUipcEventHandle;

/**
 * CUDA IPC memory handle: CU_IPC_HANDLE_SIZE reserved bytes
 */
struct CUipcMemHandle_st {
    char reserved[CU_IPC_HANDLE_SIZE];
};
typedef struct CUipcMemHandle_st CUipcMemHandle_v1;
typedef CUipcMemHandle_v1 CUipcMemHandle;
295 | |
/**
 * CUDA IPC memory flags
 */
typedef enum CUipcMem_flags_enum {
    /** Automatically enable peer access between remote devices as needed */
    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 1
} CUipcMem_flags;
302 | |
303 | |
/**
 * CUDA memory attach flags
 */
typedef enum CUmemAttach_flags_enum {
    CU_MEM_ATTACH_GLOBAL = 1 << 0, /**< Memory can be accessed by any stream on any device */
    CU_MEM_ATTACH_HOST   = 1 << 1, /**< Memory cannot be accessed by any stream on any device */
    CU_MEM_ATTACH_SINGLE = 1 << 2  /**< Memory can only be accessed by a single stream on the associated device */
} CUmemAttach_flags;
312 | |
/**
 * Flags accepted at context creation
 */
typedef enum CUctx_flags_enum {
    CU_CTX_SCHED_AUTO          = 0,  /**< Automatic scheduling */
    CU_CTX_SCHED_SPIN          = 1,  /**< Set spin as default scheduling */
    CU_CTX_SCHED_YIELD         = 2,  /**< Set yield as default scheduling */
    CU_CTX_SCHED_BLOCKING_SYNC = 4,  /**< Set blocking synchronization as default scheduling */
    CU_CTX_BLOCKING_SYNC       = CU_CTX_SCHED_BLOCKING_SYNC,
                                     /**< Set blocking synchronization as default scheduling
                                      *  \deprecated This flag was deprecated as of CUDA 4.0
                                      *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
    CU_CTX_SCHED_MASK          = CU_CTX_SCHED_SPIN | CU_CTX_SCHED_YIELD | CU_CTX_SCHED_BLOCKING_SYNC,
                                     /**< Mask covering the three scheduling bits (value 0x07) */
    CU_CTX_MAP_HOST            = 8,  /**< \deprecated This flag was deprecated as of CUDA 11.0
                                      *  and it no longer has any effect. All contexts
                                      *  as of CUDA 3.2 behave as though the flag is enabled. */
    CU_CTX_LMEM_RESIZE_TO_MAX  = 16, /**< Keep local memory allocation after launch */
    CU_CTX_FLAGS_MASK          = CU_CTX_SCHED_MASK | CU_CTX_MAP_HOST | CU_CTX_LMEM_RESIZE_TO_MAX
                                     /**< Mask covering every flag bit in this enum (value 0x1f) */
} CUctx_flags;
331 | |
/**
 * Flags accepted at stream creation
 */
typedef enum CUstream_flags_enum {
    CU_STREAM_DEFAULT      = 0, /**< Default stream flag */
    CU_STREAM_NON_BLOCKING = 1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
} CUstream_flags;
339 | |
/**
 * Legacy stream handle
 *
 * Stream handle that can be passed as a CUstream to use an implicit stream
 * with legacy synchronization behavior.
 *
 * Defined as the literal value 0x1 cast to CUstream, i.e. a fixed constant
 * rather than a handle obtained at run time.
 *
 * See details of the \link_sync_behavior
 */
#define CU_STREAM_LEGACY ((CUstream)0x1)

/**
 * Per-thread stream handle
 *
 * Stream handle that can be passed as a CUstream to use an implicit stream
 * with per-thread synchronization behavior.
 *
 * Defined as the literal value 0x2 cast to CUstream, i.e. a fixed constant
 * rather than a handle obtained at run time.
 *
 * See details of the \link_sync_behavior
 */
#define CU_STREAM_PER_THREAD ((CUstream)0x2)
359 | |
/**
 * Flags accepted at event creation
 */
typedef enum CUevent_flags_enum {
    CU_EVENT_DEFAULT        = 0,      /**< Default event flag */
    CU_EVENT_BLOCKING_SYNC  = 1 << 0, /**< Event uses blocking synchronization */
    CU_EVENT_DISABLE_TIMING = 1 << 1, /**< Event will not record timing data */
    CU_EVENT_INTERPROCESS   = 1 << 2  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
} CUevent_flags;
369 | |
/**
 * Flags for recording an event
 */
typedef enum CUevent_record_flags_enum {
    /** Default event record flag */
    CU_EVENT_RECORD_DEFAULT  = 0,
    /** When using stream capture, create an event record node
     *  instead of the default behavior. This flag is invalid
     *  when used outside of capture. */
    CU_EVENT_RECORD_EXTERNAL = 1
} CUevent_record_flags;
379 | |
/**
 * Flags for waiting on an event
 */
typedef enum CUevent_wait_flags_enum {
    /** Default event wait flag */
    CU_EVENT_WAIT_DEFAULT  = 0,
    /** When using stream capture, create an event wait node
     *  instead of the default behavior. This flag is invalid
     *  when used outside of capture. */
    CU_EVENT_WAIT_EXTERNAL = 1
} CUevent_wait_flags;
389 | |
/**
 * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
 */
typedef enum CUstreamWaitValue_flags_enum {
    /** Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
     *  values). Note this is a cyclic comparison which ignores wraparound.
     *  (Default behavior.) */
    CU_STREAM_WAIT_VALUE_GEQ   = 0,

    /** Wait until *addr == value. */
    CU_STREAM_WAIT_VALUE_EQ    = 1,

    /** Wait until (*addr & value) != 0. */
    CU_STREAM_WAIT_VALUE_AND   = 2,

    /** Wait until ~(*addr | value) != 0. Support for this operation can be
     *  queried with ::cuDeviceGetAttribute() and
     *  ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. */
    CU_STREAM_WAIT_VALUE_NOR   = 3,

    /** Follow the wait operation with a flush of outstanding remote writes. This
     *  means that, if a remote write operation is guaranteed to have reached the
     *  device before the wait can be satisfied, that write is guaranteed to be
     *  visible to downstream device work. The device is permitted to reorder
     *  remote writes internally. For example, this flag would be required if
     *  two remote writes arrive in a defined order, the wait is satisfied by the
     *  second write, and downstream work needs to observe the first write.
     *  Support for this operation is restricted to selected platforms and can be
     *  queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH. */
    CU_STREAM_WAIT_VALUE_FLUSH = 0x40000000
} CUstreamWaitValue_flags;
412 | |
/**
 * Flags for ::cuStreamWriteValue32
 */
typedef enum CUstreamWriteValue_flags_enum {
    /** Default behavior */
    CU_STREAM_WRITE_VALUE_DEFAULT           = 0,

    /** Permits the write to be reordered with writes which were issued
     *  before it, as a performance optimization. Normally,
     *  ::cuStreamWriteValue32 will provide a memory fence before the
     *  write, which has similar semantics to
     *  __threadfence_system() but is scoped to the stream
     *  rather than a CUDA thread. */
    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 1
} CUstreamWriteValue_flags;
425 | |
/**
 * Operations for ::cuStreamBatchMemOp
 *
 * Enumerators are listed in ascending numeric order; the values themselves
 * are fixed and must not be renumbered.
 */
typedef enum CUstreamBatchMemOpType_enum {
    CU_STREAM_MEM_OP_WAIT_VALUE_32       = 0x1, /**< Represents a ::cuStreamWaitValue32 operation */
    CU_STREAM_MEM_OP_WRITE_VALUE_32      = 0x2, /**< Represents a ::cuStreamWriteValue32 operation */
    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 0x3, /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
                                                     standalone operation. */
    CU_STREAM_MEM_OP_WAIT_VALUE_64       = 0x4, /**< Represents a ::cuStreamWaitValue64 operation */
    CU_STREAM_MEM_OP_WRITE_VALUE_64      = 0x5  /**< Represents a ::cuStreamWriteValue64 operation */
} CUstreamBatchMemOpType;
437 | |
/**
 * Per-operation parameters for ::cuStreamBatchMemOp
 *
 * A union with one variant per kind of operation. Every variant starts with
 * a ::CUstreamBatchMemOpType field named \p operation, so the top-level
 * \p operation member overlays the \p operation field of each variant.
 */
typedef union CUstreamBatchMemOpParams_union {
    CUstreamBatchMemOpType operation;   /* operation kind; overlays the first field of every variant below */
    /* Variant for wait-value operations. */
    struct CUstreamMemOpWaitValueParams_st {
        CUstreamBatchMemOpType operation;
        CUdeviceptr address;
        /* Comparison operand; the two members share storage.
         * NOTE(review): presumably `value` is used for the 32-bit operation
         * and `value64` for the 64-bit one -- confirm. */
        union {
            cuuint32_t value;
            cuuint64_t value64;
        };
        unsigned int flags;
        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
    } waitValue;
    /* Variant for write-value operations; same field layout as waitValue. */
    struct CUstreamMemOpWriteValueParams_st {
        CUstreamBatchMemOpType operation;
        CUdeviceptr address;
        /* Value to write; the two members share storage.
         * NOTE(review): presumably `value` is used for the 32-bit operation
         * and `value64` for the 64-bit one -- confirm. */
        union {
            cuuint32_t value;
            cuuint64_t value64;
        };
        unsigned int flags;
        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
    } writeValue;
    /* Variant for the flush-remote-writes operation. */
    struct CUstreamMemOpFlushRemoteWritesParams_st {
        CUstreamBatchMemOpType operation;
        unsigned int flags;
    } flushRemoteWrites;
    /* Six 64-bit words (48 bytes): forces a minimum size for the union
     * regardless of the size of the variants above. */
    cuuint64_t pad[6];
} CUstreamBatchMemOpParams_v1;
typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams;
470 | |
/**
 * Flags for the occupancy calculator
 */
typedef enum CUoccupancy_flags_enum {
    /** Default behavior */
    CU_OCCUPANCY_DEFAULT                  = 0,
    /** Assume global caching is enabled and cannot be automatically turned off */
    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 1
} CUoccupancy_flags;
478 | |
/**
 * Flags for ::cuStreamUpdateCaptureDependencies
 */
typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
    /** Add new nodes to the dependency set */
    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0,
    /** Replace the dependency set with the new nodes */
    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 1
} CUstreamUpdateCaptureDependencies_flags;
486 | |
/**
 * Element formats for CUDA arrays
 */
typedef enum CUarray_format_enum {
    /* Unsigned integer formats */
    CU_AD_FORMAT_UNSIGNED_INT8  = 1,   /**< Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 2,   /**< Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 3,   /**< Unsigned 32-bit integers */
    /* Signed integer formats */
    CU_AD_FORMAT_SIGNED_INT8    = 8,   /**< Signed 8-bit integers */
    CU_AD_FORMAT_SIGNED_INT16   = 9,   /**< Signed 16-bit integers */
    CU_AD_FORMAT_SIGNED_INT32   = 10,  /**< Signed 32-bit integers */
    /* Floating-point formats */
    CU_AD_FORMAT_HALF           = 16,  /**< 16-bit floating point */
    CU_AD_FORMAT_FLOAT          = 32,  /**< 32-bit floating point */
    /* NV12 format (value 0xb0); not described further in this header. */
    CU_AD_FORMAT_NV12           = 176
} CUarray_format;
501 | |
/**
 * Addressing modes for texture references
 */
typedef enum CUaddress_mode_enum {
    /** Wrapping address mode */
    CU_TR_ADDRESS_MODE_WRAP   = 0x0,
    /** Clamp to edge address mode */
    CU_TR_ADDRESS_MODE_CLAMP  = 0x1,
    /** Mirror address mode */
    CU_TR_ADDRESS_MODE_MIRROR = 0x2,
    /** Border address mode */
    CU_TR_ADDRESS_MODE_BORDER = 0x3
} CUaddress_mode;
511 | |
/**
 * Filtering modes for texture references
 */
typedef enum CUfilter_mode_enum {
    /** Point filter mode */
    CU_TR_FILTER_MODE_POINT  = 0x0,
    /** Linear filter mode */
    CU_TR_FILTER_MODE_LINEAR = 0x1
} CUfilter_mode;
519 | |
520 | /** |
521 | * Device properties |
522 | */ |
523 | typedef enum CUdevice_attribute_enum { |
524 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ |
525 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ |
526 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ |
527 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ |
528 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ |
529 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ |
530 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ |
531 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ |
532 | CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ |
533 | CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ |
534 | CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ |
535 | CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ |
536 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ |
537 | CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ |
538 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ |
539 | CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ |
540 | CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ |
541 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ |
542 | CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ |
543 | CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ |
544 | CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ |
545 | CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ |
546 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ |
547 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ |
548 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ |
549 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ |
550 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ |
551 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ |
552 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ |
553 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ |
554 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ |
555 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ |
556 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ |
557 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ |
558 | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ |
559 | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ |
560 | CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ |
561 | CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ |
562 | CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ |
563 | CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ |
564 | CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ |
565 | CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ |
566 | CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ |
567 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ |
568 | CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ |
569 | CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ |
570 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ |
571 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ |
572 | CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ |
573 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ |
574 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ |
575 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ |
576 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ |
577 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ |
578 | CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ |
579 | CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ |
580 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ |
581 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ |
582 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ |
583 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ |
584 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ |
585 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ |
586 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ |
587 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ |
588 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ |
589 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ |
590 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ |
591 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ |
592 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ |
593 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ |
594 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ |
595 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ |
596 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ |
597 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ |
598 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ |
599 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ |
600 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ |
601 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ |
602 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ |
603 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ |
604 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ |
605 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ |
606 | CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ |
607 | CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ |
608 | CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ |
609 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ |
610 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ |
611 | CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ |
612 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ |
613 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ |
614 | CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ |
615 | CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ |
616 | CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ |
617 | CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ |
618 | CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ |
619 | CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ |
620 | CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ |
621 | CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ |
622 | CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ |
623 | CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ |
624 | CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ |
625 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ |
626 | CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ |
627 | CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ |
628 | CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ |
629 | CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ |
630 | CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ |
631 | CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ |
632 | CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ |
633 | CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ |
634 | CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested ::cuMemCreate */ |
635 | CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ |
636 | CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ |
637 | CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ |
638 | CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ |
639 | CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ |
640 | CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ |
641 | CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ |
642 | CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ |
643 | CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ |
644 | CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ |
645 | CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ |
646 | CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ |
647 | CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */ |
648 | CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ |
649 | CU_DEVICE_ATTRIBUTE_MAX |
650 | } CUdevice_attribute; |
651 | |
/**
 * Legacy device properties.
 *
 * All members are plain ints; member order and types match the original
 * declaration, so the layout is unchanged.
 */
struct CUdevprop_st {
    int maxThreadsPerBlock;   /**< Upper bound on the number of threads in one block */
    int maxThreadsDim[3];     /**< Upper bound for each of the three block dimensions */
    int maxGridSize[3];       /**< Upper bound for each of the three grid dimensions */
    int sharedMemPerBlock;    /**< Bytes of shared memory available to one block */
    int totalConstantMemory;  /**< Bytes of constant memory available on the device */
    int SIMDWidth;            /**< Number of threads in a warp */
    int memPitch;             /**< Largest pitch, in bytes, that memory copies allow */
    int regsPerBlock;         /**< Number of 32-bit registers available to one block */
    int clockRate;            /**< Clock frequency, in kilohertz */
    int textureAlign;         /**< Alignment required for textures */
};

/** Versioned name of the legacy device-properties structure. */
typedef struct CUdevprop_st CUdevprop_v1;

/** Unversioned alias of ::CUdevprop_v1. */
typedef CUdevprop_v1 CUdevprop;
668 | |
/**
 * Pointer information
 *
 * Enumerators are numbered consecutively from 1 to 17.
 */
typedef enum CUpointer_attribute_enum {
    CU_POINTER_ATTRIBUTE_CONTEXT                    =  1, /**< ::CUcontext on which the pointer was allocated or registered */
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE                =  2, /**< ::CUmemorytype giving the physical location of the pointer */
    CU_POINTER_ATTRIBUTE_DEVICE_POINTER             =  3, /**< Address at which the pointer's memory may be accessed on the device */
    CU_POINTER_ATTRIBUTE_HOST_POINTER               =  4, /**< Address at which the pointer's memory may be accessed on the host */
    CU_POINTER_ATTRIBUTE_P2P_TOKENS                 =  5, /**< Pair of tokens for use with the nv-p2p.h Linux kernel interface */
    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS                =  6, /**< Synchronize every synchronous memory operation initiated on this region */
    CU_POINTER_ATTRIBUTE_BUFFER_ID                  =  7, /**< Process-wide unique ID of an allocated memory region */
    CU_POINTER_ATTRIBUTE_IS_MANAGED                 =  8, /**< Whether the pointer points to managed memory */
    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL             =  9, /**< Ordinal of a device on which the pointer was allocated or registered */
    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if the pointer maps to an allocation suitable for ::cudaIpcGetMemHandle, 0 otherwise */
    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR           = 11, /**< Starting address for the requested pointer */
    CU_POINTER_ATTRIBUTE_RANGE_SIZE                 = 12, /**< Size of the address range for the requested pointer */
    CU_POINTER_ATTRIBUTE_MAPPED                     = 13, /**< 1 if the pointer lies in a valid address range mapped to a backing allocation, 0 otherwise */
    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES       = 14, /**< Bitmask of the ::CUmemAllocationHandleType values allowed for this allocation */
    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory referenced by the pointer can be used with the GPUDirect RDMA API */
    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS               = 16, /**< Access flags that the device associated with the current context has on the memory referenced by the given pointer */
    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE             = 17  /**< Mempool handle of the allocation if it was allocated from a mempool; NULL otherwise */
} CUpointer_attribute;
691 | |
/**
 * Function properties
 */
typedef enum CUfunction_attribute_enum {
    /**
     * Largest number of threads per block with which the function can be
     * launched; a launch exceeding it would fail. The value depends on both
     * the function and the device on which the function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * Bytes of statically-allocated shared memory the function requires.
     * Dynamically-allocated shared memory requested by the user at runtime
     * is not included.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * Bytes of user-allocated constant memory the function requires.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * Bytes of local memory used by each thread of the function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * Number of registers used by each thread of the function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    /**
     * PTX virtual architecture version the function was compiled for,
     * encoded as major PTX version * 10 + minor PTX version (a PTX
     * version 1.3 function yields 13). For cubins compiled prior to
     * CUDA 3.0 this may return the undefined value of 0.
     */
    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

    /**
     * Binary architecture version the function was compiled for, encoded
     * as major binary version * 10 + minor binary version (a binary
     * version 1.3 function yields 13). Legacy cubins without a
     * properly-encoded binary architecture version report 10.
     */
    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

    /**
     * Indicates whether the function was compiled with the user specified
     * option "-Xptxas --dlcm=ca" set.
     */
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,

    /**
     * Largest size, in bytes, of dynamically-allocated shared memory this
     * function can use. A launch whose user-specified dynamic shared memory
     * size exceeds this value will fail.
     * See ::cuFuncSetAttribute
     */
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,

    /**
     * Shared memory carveout preference, in percent of the total shared
     * memory, for devices where the L1 cache and shared memory use the same
     * hardware resources.
     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
     * The value is only a hint; the driver can choose a different ratio if
     * that is required to execute the function.
     * See ::cuFuncSetAttribute
     */
    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,

    /** Trailing enumerator; implicitly one past the last attribute above (10). */
    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;
769 | |
/**
 * Function cache configurations
 */
typedef enum CUfunc_cache_enum {
    CU_FUNC_CACHE_PREFER_NONE   = 0, /**< no preference between shared memory and L1 (default) */
    CU_FUNC_CACHE_PREFER_SHARED = 1, /**< favor a larger shared memory and a smaller L1 cache */
    CU_FUNC_CACHE_PREFER_L1     = 2, /**< favor a larger L1 cache and a smaller shared memory */
    CU_FUNC_CACHE_PREFER_EQUAL  = 3  /**< favor an equally sized L1 cache and shared memory */
} CUfunc_cache;
779 | |
/**
 * Shared memory configurations
 */
typedef enum CUsharedconfig_enum {
    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0, /**< use the default shared memory bank size */
    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 1, /**< use a shared memory bank width of four bytes */
    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 2  /**< use a shared memory bank width of eight bytes */
} CUsharedconfig;
788 | |
/**
 * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute
 *
 * Enumerators are listed in ascending numeric order; every value is explicit.
 */
typedef enum CUshared_carveout_enum {
    CU_SHAREDMEM_CARVEOUT_DEFAULT    = -1,  /**< No preference between shared memory and L1 (default) */
    CU_SHAREDMEM_CARVEOUT_MAX_L1     = 0,   /**< Prefer the maximum available L1 cache and the minimum shared memory */
    CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100  /**< Prefer the maximum available shared memory and the minimum L1 cache */
} CUshared_carveout;
797 | |
/**
 * Memory types
 *
 * Numbering starts at 1; no enumerator has the value 0.
 */
typedef enum CUmemorytype_enum {
    CU_MEMORYTYPE_HOST    = 1, /**< Memory on the host */
    CU_MEMORYTYPE_DEVICE  = 2, /**< Memory on the device */
    CU_MEMORYTYPE_ARRAY   = 3, /**< Array memory */
    CU_MEMORYTYPE_UNIFIED = 4  /**< Unified memory (device or host) */
} CUmemorytype;
807 | |
/**
 * Compute Modes
 *
 * The value 1 is not assigned to any enumerator of this enum.
 */
typedef enum CUcomputemode_enum {
    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default mode: multiple contexts are allowed per device */
    CU_COMPUTEMODE_PROHIBITED        = 2, /**< Compute-prohibited mode: no contexts can be created on this device at this time */
    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode: only one context, used by a single process, can be present on this device at a time */
} CUcomputemode;
816 | |
/**
 * Memory advise values
 *
 * Each SET_* value is immediately followed by the UNSET_* value that
 * reverses it.
 */
typedef enum CUmem_advise_enum {
    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1, /**< The data will mostly be read and written to only occasionally */
    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2, /**< Reverts the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3, /**< Makes the specified device the preferred location for the data */
    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clears the preferred location for the data */
    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5, /**< The specified device will access the data, so page faults should be prevented as much as possible */
    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6  /**< Leaves the page faulting policy for the specified device to the Unified Memory subsystem */
} CUmem_advise;
828 | |
/**
 * Memory range attributes
 */
typedef enum CUmem_range_attribute_enum {
    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY            = 1, /**< Whether the range will mostly be read and written to only occasionally */
    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION     = 2, /**< Preferred location of the range */
    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY            = 3, /**< The range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for the specified device */
    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4  /**< Last location to which the range was prefetched */
} CUmem_range_attribute;
835 | |
/**
 * Online compiler and linker options
 *
 * Every option is written with its numeric value; the values are exactly
 * those produced by consecutive numbering starting at 0. The trailing
 * ::CU_JIT_NUM_OPTIONS is left implicit so that it follows the last option.
 */
typedef enum CUjit_option_enum
{
    /**
     * Maximum number of registers a thread may use.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Minimum number of threads per block for which compilation should
     * be targeted\n
     * OUT: Number of threads the compiler actually targeted.
     * The compiler's resource utilization (e.g. max registers) is restricted
     * so that a block with the given number of threads should be able to
     * launch based on register limitations. Note that other resource
     * limitations, such as shared memory utilization, are not currently
     * taken into account by this option.\n
     * Cannot be combined with ::CU_JIT_TARGET.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_THREADS_PER_BLOCK = 1,

    /**
     * The option value is overwritten with the total wall clock time, in
     * milliseconds, spent in the compiler and linker\n
     * Option type: float\n
     * Applies to: compiler and linker
     */
    CU_JIT_WALL_TIME = 2,

    /**
     * Pointer to a buffer that receives any log messages of an
     * informational nature (its size is given through option
     * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER = 3,

    /**
     * IN: Size of the log buffer in bytes. Log messages are capped at this
     * size (including null terminator)\n
     * OUT: Amount of the log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,

    /**
     * Pointer to a buffer that receives any log messages reflecting errors
     * (its size is given through option
     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER = 5,

    /**
     * IN: Size of the log buffer in bytes. Log messages are capped at this
     * size (including null terminator)\n
     * OUT: Amount of the log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,

    /**
     * Optimization level applied to generated code (0 - 4); 4 is both the
     * default and the highest level of optimizations.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_OPTIMIZATION_LEVEL = 7,

    /**
     * Takes no option value. The target is determined from the current
     * attached context (default)\n
     * Option type: No option value needed\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET_FROM_CUCONTEXT = 8,

    /**
     * The target is chosen according to the supplied ::CUjit_target. Cannot
     * be combined with ::CU_JIT_THREADS_PER_BLOCK.\n
     * Option type: unsigned int for enumerated type ::CUjit_target\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET = 9,

    /**
     * Selects the fallback strategy used when a matching cubin is not
     * found. The choice is given by the supplied ::CUjit_fallback. This
     * option cannot be used with cuLink* APIs, because the linker requires
     * exact matches.\n
     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
     * Applies to: compiler only
     */
    CU_JIT_FALLBACK_STRATEGY = 10,

    /**
     * Whether debug information is created in the output (-g)
     * (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_GENERATE_DEBUG_INFO = 11,

    /**
     * Produce verbose log messages (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_LOG_VERBOSE = 12,

    /**
     * Produce line number information (-lineinfo) (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler only
     */
    CU_JIT_GENERATE_LINE_INFO = 13,

    /**
     * Whether caching is enabled explicitly (-dlcm) \n
     * The choice is given by the supplied ::CUjit_cacheMode_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
     * Applies to: compiler only
     */
    CU_JIT_CACHE_MODE = 14,

    /**
     * Used for internal purposes only, in this version of CUDA
     */
    CU_JIT_NEW_SM3X_OPT = 15,

    /**
     * Used for internal purposes only, in this version of CUDA
     */
    CU_JIT_FAST_COMPILE = 16,

    /**
     * Array of device symbol names to be relocated to the corresponding
     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
     * While loading a device module, the driver relocates all encountered
     * unresolved symbols to the host addresses.\n
     * Only symbols that correspond to unresolved global variables may be
     * registered.\n
     * Registering the same device symbol at multiple addresses is illegal.\n
     * Option type: const char **\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_NAMES = 17,

    /**
     * Array of host addresses used to relocate the corresponding device
     * symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
     * Option type: void **\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18,

    /**
     * Number of entries in the ::CU_JIT_GLOBAL_SYMBOL_NAMES and
     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
     * Option type: unsigned int\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_COUNT = 19,

    /**
     * Turn on link-time optimization (-dlto) for device code
     * (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_LTO = 20,

    /**
     * Controls single-precision denormals (-ftz) support
     * (0: false, default).
     *     1 : flushes denormal values to zero
     *     0 : preserves denormal values
     * Option type: int\n
     * Applies to: link-time optimization specified with CU_JIT_LTO
     */
    CU_JIT_FTZ = 21,

    /**
     * Controls single-precision floating-point division and reciprocals
     * (-prec-div) support (1: true, default).
     *     1 : Enables the IEEE round-to-nearest mode
     *     0 : Enables the fast approximation mode
     * Option type: int\n
     * Applies to: link-time optimization specified with CU_JIT_LTO
     */
    CU_JIT_PREC_DIV = 22,

    /**
     * Controls single-precision floating-point square root
     * (-prec-sqrt) support (1: true, default).
     *     1 : Enables the IEEE round-to-nearest mode
     *     0 : Enables the fast approximation mode
     * Option type: int\n
     * Applies to: link-time optimization specified with CU_JIT_LTO
     */
    CU_JIT_PREC_SQRT = 23,

    /**
     * Enables or disables contracting floating-point multiplies and
     * adds/subtracts into floating-point multiply-add (-fma) operations
     * (1: Enable, default; 0: Disable).
     * Option type: int\n
     * Applies to: link-time optimization specified with CU_JIT_LTO
     */
    CU_JIT_FMA = 24,

    /** Trailing enumerator; implicitly one past the last option above (25). */
    CU_JIT_NUM_OPTIONS

} CUjit_option;
1055 | |
/**
 * Online compilation targets
 *
 * Each enumerator's value is the two-digit number in its name
 * (e.g. ::CU_TARGET_COMPUTE_75 is 75 and denotes device class 7.5).
 */
typedef enum CUjit_target_enum
{
    CU_TARGET_COMPUTE_20 = 20, /**< Device class 2.0 */
    CU_TARGET_COMPUTE_21 = 21, /**< Device class 2.1 */
    CU_TARGET_COMPUTE_30 = 30, /**< Device class 3.0 */
    CU_TARGET_COMPUTE_32 = 32, /**< Device class 3.2 */
    CU_TARGET_COMPUTE_35 = 35, /**< Device class 3.5 */
    CU_TARGET_COMPUTE_37 = 37, /**< Device class 3.7 */
    CU_TARGET_COMPUTE_50 = 50, /**< Device class 5.0 */
    CU_TARGET_COMPUTE_52 = 52, /**< Device class 5.2 */
    CU_TARGET_COMPUTE_53 = 53, /**< Device class 5.3 */
    CU_TARGET_COMPUTE_60 = 60, /**< Device class 6.0 */
    CU_TARGET_COMPUTE_61 = 61, /**< Device class 6.1 */
    CU_TARGET_COMPUTE_62 = 62, /**< Device class 6.2 */
    CU_TARGET_COMPUTE_70 = 70, /**< Device class 7.0 */
    CU_TARGET_COMPUTE_72 = 72, /**< Device class 7.2 */
    CU_TARGET_COMPUTE_75 = 75, /**< Device class 7.5 */
    CU_TARGET_COMPUTE_80 = 80, /**< Device class 8.0 */
    CU_TARGET_COMPUTE_86 = 86  /**< Device class 8.6 */
} CUjit_target;
1079 | |
/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum
{
    CU_PREFER_PTX    = 0, /**< When no exact binary match is found, prefer compiling ptx */
    CU_PREFER_BINARY = 1  /**< When no exact match is found, prefer falling back to compatible binary code */
} CUjit_fallback;
1090 | |
/**
 * Caching modes for dlcm
 */
typedef enum CUjit_cacheMode_enum
{
    CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile without specifying any -dlcm flag */
    CU_JIT_CACHE_OPTION_CG   = 1, /**< Compile with the L1 cache disabled */
    CU_JIT_CACHE_OPTION_CA   = 2  /**< Compile with the L1 cache enabled */
} CUjit_cacheMode;
1100 | |
/**
 * Device code formats
 *
 * Input types carry explicit values equal to those of consecutive numbering
 * from 0; the trailing ::CU_JIT_NUM_INPUT_TYPES is left implicit.
 */
typedef enum CUjitInputType_enum
{
    /**
     * Device code compiled for a specific device class\n
     * Applicable options: none
     */
    CU_JIT_INPUT_CUBIN = 0,

    /**
     * Source code in PTX form\n
     * Applicable options: PTX compiler options
     */
    CU_JIT_INPUT_PTX = 1,

    /**
     * Bundle holding multiple cubins and/or PTX of some device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_FATBINARY = 2,

    /**
     * Host object that embeds device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_OBJECT = 3,

    /**
     * Archive of host objects that embed device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_LIBRARY = 4,

    /**
     * High-level intermediate code used for link-time optimization\n
     * Applicable options: NVVM compiler options, PTX compiler options
     */
    CU_JIT_INPUT_NVVM = 5,

    /** Trailing enumerator; implicitly one past the last input type above (6). */
    CU_JIT_NUM_INPUT_TYPES
} CUjitInputType;
1144 | |
/**
 * Pointer to a struct CUlinkState_st, which is only forward-declared here.
 * NOTE(review): presumably the state handle used by the cuLink* APIs - confirm.
 */
struct CUlinkState_st;
typedef struct CUlinkState_st *CUlinkState;
1146 | |
/**
 * Flags to register a graphics resource
 *
 * Apart from the zero value, each flag occupies its own bit.
 */
typedef enum CUgraphicsRegisterFlags_enum {
    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0,        /**< No flag set */
    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = (1 << 0), /**< Bit 0 (0x01) */
    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = (1 << 1), /**< Bit 1 (0x02) */
    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = (1 << 2), /**< Bit 2 (0x04) */
    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = (1 << 3)  /**< Bit 3 (0x08) */
} CUgraphicsRegisterFlags;
1157 | |
/**
 * Flags for mapping and unmapping interop resources
 *
 * Apart from the zero value, each flag occupies its own bit.
 */
typedef enum CUgraphicsMapResourceFlags_enum {
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0,        /**< No flag set */
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = (1 << 0), /**< Bit 0 (0x01) */
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = (1 << 1)  /**< Bit 1 (0x02) */
} CUgraphicsMapResourceFlags;
1166 | |
/**
 * Array indices for cube faces
 *
 * For each axis (X, Y, Z in that order) the positive face comes first and
 * the negative face second; indices run from 0 to 5.
 */
typedef enum CUarray_cubemap_face_enum {
    CU_CUBEMAP_FACE_POSITIVE_X = 0, /**< Cubemap face on the positive X side */
    CU_CUBEMAP_FACE_NEGATIVE_X = 1, /**< Cubemap face on the negative X side */
    CU_CUBEMAP_FACE_POSITIVE_Y = 2, /**< Cubemap face on the positive Y side */
    CU_CUBEMAP_FACE_NEGATIVE_Y = 3, /**< Cubemap face on the negative Y side */
    CU_CUBEMAP_FACE_POSITIVE_Z = 4, /**< Cubemap face on the positive Z side */
    CU_CUBEMAP_FACE_NEGATIVE_Z = 5  /**< Cubemap face on the negative Z side */
} CUarray_cubemap_face;
1178 | |
/**
 * Limits
 */
typedef enum CUlimit_enum {
    CU_LIMIT_STACK_SIZE                       = 0, /**< Stack size of a GPU thread */
    CU_LIMIT_PRINTF_FIFO_SIZE                 = 1, /**< Size of the GPU printf FIFO */
    CU_LIMIT_MALLOC_HEAP_SIZE                 = 2, /**< Size of the GPU malloc heap */
    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 3, /**< Launch synchronize depth of the GPU device runtime */
    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 4, /**< Pending launch count of the GPU device runtime */
    CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 5, /**< Value between 0 and 128 giving the maximum fetch granularity of L2 (in Bytes). This is a hint */
    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 6, /**< Size, in bytes, of the L2 persisting lines cache */
    CU_LIMIT_MAX                                   /**< Trailing enumerator; implicitly one past the last limit above (7) */
} CUlimit;
1192 | |
/**
 * Resource types.
 */
typedef enum CUresourcetype_enum {
    /** Array resource */
    CU_RESOURCE_TYPE_ARRAY           = 0,
    /** Mipmapped array resource */
    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 1,
    /** Linear resource */
    CU_RESOURCE_TYPE_LINEAR          = 2,
    /** Pitch 2D resource */
    CU_RESOURCE_TYPE_PITCH2D         = 3
} CUresourcetype;
1202 | |
/* CUDA_CB decorates callback function pointer types: it expands to
 * __stdcall when _WIN32 is defined and to nothing otherwise. */
#if defined(_WIN32)
#  define CUDA_CB __stdcall
#else
#  define CUDA_CB
#endif
1208 | |
1209 | /** |
1210 | * CUDA host function |
1211 | * \param userData Argument value passed to the function |
1212 | */ |
1213 | typedef void (CUDA_CB *CUhostFn)(void *userData); |
1214 | |
/**
 * Performance hint used for the hitProp and missProp members of
 * ::CUaccessPolicyWindow.
 */
typedef enum CUaccessProperty_enum {
    /** Normal cache persistence. */
    CU_ACCESS_PROPERTY_NORMAL     = 0,
    /** Streaming access is less likely to persist in cache. */
    CU_ACCESS_PROPERTY_STREAMING  = 1,
    /** Persisting access is more likely to persist in cache. */
    CU_ACCESS_PROPERTY_PERSISTING = 2
} CUaccessProperty;
1223 | |
1224 | /** |
1225 | * Specifies an access policy for a window, a contiguous extent of memory |
1226 | * beginning at base_ptr and ending at base_ptr + num_bytes. |
1227 | * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. |
1228 | * Partition into many segments and assign segments such that: |
1229 | * sum of "hit segments" / window == approx. ratio. |
1230 | * sum of "miss segments" / window == approx 1-ratio. |
1231 | * Segments and ratio specifications are fitted to the capabilities of |
1232 | * the architecture. |
1233 | * Accesses in a hit segment apply the hitProp access policy. |
1234 | * Accesses in a miss segment apply the missProp access policy. |
1235 | */ |
1236 | typedef struct CUaccessPolicyWindow_st { |
1237 | void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ |
1238 | size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ |
1239 | float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ |
1240 | CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ |
1241 | CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ |
1242 | } CUaccessPolicyWindow_v1; |
1243 | typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; |
1244 | |
1245 | /** |
1246 | * GPU kernel node parameters |
1247 | */ |
1248 | typedef struct CUDA_KERNEL_NODE_PARAMS_st { |
1249 | CUfunction func; /**< Kernel to launch */ |
1250 | unsigned int gridDimX; /**< Width of grid in blocks */ |
1251 | unsigned int gridDimY; /**< Height of grid in blocks */ |
1252 | unsigned int gridDimZ; /**< Depth of grid in blocks */ |
1253 | unsigned int blockDimX; /**< X dimension of each thread block */ |
1254 | unsigned int blockDimY; /**< Y dimension of each thread block */ |
1255 | unsigned int blockDimZ; /**< Z dimension of each thread block */ |
1256 | unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ |
1257 | void **kernelParams; /**< Array of pointers to kernel parameters */ |
1258 | void **; /**< Extra options */ |
1259 | } CUDA_KERNEL_NODE_PARAMS_v1; |
1260 | typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; |
1261 | |
1262 | /** |
1263 | * Memset node parameters |
1264 | */ |
1265 | typedef struct CUDA_MEMSET_NODE_PARAMS_st { |
1266 | CUdeviceptr dst; /**< Destination device pointer */ |
1267 | size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ |
1268 | unsigned int value; /**< Value to be set */ |
1269 | unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ |
1270 | size_t width; /**< Width of the row in elements */ |
1271 | size_t height; /**< Number of rows */ |
1272 | } CUDA_MEMSET_NODE_PARAMS_v1; |
1273 | typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; |
1274 | |
1275 | /** |
1276 | * Host node parameters |
1277 | */ |
1278 | typedef struct CUDA_HOST_NODE_PARAMS_st { |
1279 | CUhostFn fn; /**< The function to call when the node executes */ |
1280 | void* userData; /**< Argument to pass to the function */ |
1281 | } CUDA_HOST_NODE_PARAMS_v1; |
1282 | typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; |
1283 | |
/**
 * Graph node types.
 */
typedef enum CUgraphNodeType_enum {
    /** GPU kernel node */
    CU_GRAPH_NODE_TYPE_KERNEL           = 0,
    /** Memcpy node */
    CU_GRAPH_NODE_TYPE_MEMCPY           = 1,
    /** Memset node */
    CU_GRAPH_NODE_TYPE_MEMSET           = 2,
    /** Host (executable) node */
    CU_GRAPH_NODE_TYPE_HOST             = 3,
    /** Node which executes an embedded graph */
    CU_GRAPH_NODE_TYPE_GRAPH            = 4,
    /** Empty (no-op) node */
    CU_GRAPH_NODE_TYPE_EMPTY            = 5,
    /** External event wait node */
    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6,
    /** External event record node */
    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7,
    /** External semaphore signal node */
    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8,
    /** External semaphore wait node */
    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9,
    /** Memory allocation node */
    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,
    /** Memory free node */
    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11
} CUgraphNodeType;
1301 | |
/**
 * Synchronization policies. Numbering starts at 1; no enumerator has the
 * value 0.
 */
typedef enum CUsynchronizationPolicy_enum {
    CU_SYNC_POLICY_AUTO          = 0x1,
    CU_SYNC_POLICY_SPIN          = 0x2,
    CU_SYNC_POLICY_YIELD         = 0x3,
    CU_SYNC_POLICY_BLOCKING_SYNC = 0x4
} CUsynchronizationPolicy;
1308 | |
/**
 * Graph kernel node attributes.
 */
typedef enum CUkernelNodeAttrID_enum {
    /** Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */
    CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 0x1,
    /** Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */
    CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE          = 0x2
} CUkernelNodeAttrID;
1316 | |
1317 | /** |
1318 | * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute |
1319 | */ |
1320 | typedef union CUkernelNodeAttrValue_union { |
1321 | CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ |
1322 | int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ |
1323 | } CUkernelNodeAttrValue_v1; |
1324 | typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; |
1325 | |
/**
 * Possible stream capture statuses returned by ::cuStreamIsCapturing.
 */
typedef enum CUstreamCaptureStatus_enum {
    /** Stream is not capturing */
    CU_STREAM_CAPTURE_STATUS_NONE        = 0x0,
    /** Stream is actively capturing */
    CU_STREAM_CAPTURE_STATUS_ACTIVE      = 0x1,
    /** Stream is part of a capture sequence that has been invalidated,
     *  but not terminated */
    CU_STREAM_CAPTURE_STATUS_INVALIDATED = 0x2
} CUstreamCaptureStatus;
1335 | |
/**
 * Possible modes for stream capture thread interactions.
 * For more details see ::cuStreamBeginCapture and
 * ::cuThreadExchangeStreamCaptureMode.
 */
typedef enum CUstreamCaptureMode_enum {
    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0x0,
    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 0x1,
    CU_STREAM_CAPTURE_MODE_RELAXED      = 0x2
} CUstreamCaptureMode;
1345 | |
/**
 * Stream attributes. The identifiers declared here are 1 and 3; the value 2
 * is not assigned in this enum.
 */
typedef enum CUstreamAttrID_enum {
    /** Identifier for ::CUstreamAttrValue::accessPolicyWindow. */
    CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 0x1,
    /** ::CUsynchronizationPolicy for work queued up in this stream */
    CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 0x3
} CUstreamAttrID;
1353 | |
1354 | /** |
1355 | * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute |
1356 | */ |
1357 | typedef union CUstreamAttrValue_union { |
1358 | CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ |
1359 | CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ |
1360 | } CUstreamAttrValue_v1; |
1361 | typedef CUstreamAttrValue_v1 CUstreamAttrValue; |
1362 | |
/**
 * Flags to specify search options. For more details see ::cuGetProcAddress.
 */
typedef enum CUdriverProcAddress_flags_enum {
    /** Default search mode for driver symbols. */
    CU_GET_PROC_ADDRESS_DEFAULT                   = 0x0,
    /** Search for legacy versions of driver symbols. */
    CU_GET_PROC_ADDRESS_LEGACY_STREAM             = 0x1,
    /** Search for per-thread versions of driver symbols. */
    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 0x2
} CUdriverProcAddress_flags;
1371 | |
/**
 * Execution affinity types.
 * ::CU_EXEC_AFFINITY_TYPE_MAX carries no explicit value; it takes the value
 * following the last explicitly numbered type.
 */
typedef enum CUexecAffinityType_enum {
    /** Create a context with limited SMs. */
    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0x0,
    CU_EXEC_AFFINITY_TYPE_MAX
} CUexecAffinityType;
1379 | |
/**
 * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT.
 */
typedef struct CUexecAffinitySmCount_st {
    /** The number of SMs the context is limited to use. */
    unsigned int val;
} CUexecAffinitySmCount_v1;
typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount;
1387 | |
1388 | /** |
1389 | * Execution Affinity Parameters |
1390 | */ |
1391 | typedef struct CUexecAffinityParam_st { |
1392 | CUexecAffinityType type; |
1393 | union { |
1394 | CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ |
1395 | } param; |
1396 | } CUexecAffinityParam_v1; |
1397 | typedef CUexecAffinityParam_v1 CUexecAffinityParam; |
1398 | |
/**
 * Error codes
 */
typedef enum cudaError_enum {
    /**
     * The API call returned with no errors. For query calls this also means
     * that the operation being queried is complete (see ::cuEventQuery()
     * and ::cuStreamQuery()).
     */
    CUDA_SUCCESS                              = 0,

    /**
     * One or more of the parameters passed to the API call is not within an
     * acceptable range of values.
     */
    CUDA_ERROR_INVALID_VALUE                  = 1,

    /**
     * The API call failed because it was unable to allocate enough memory
     * to perform the requested operation.
     */
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,

    /**
     * The CUDA driver has not been initialized with ::cuInit(), or
     * initialization has failed.
     */
    CUDA_ERROR_NOT_INITIALIZED                = 3,

    /**
     * The CUDA driver is in the process of shutting down.
     */
    CUDA_ERROR_DEINITIALIZED                  = 4,

    /**
     * The profiler is not initialized for this run. This can happen when
     * the application is running with external profiling tools like visual
     * profiler.
     */
    CUDA_ERROR_PROFILER_DISABLED              = 5,

    /**
     * \deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an
     * error to attempt to enable/disable the profiling via
     * ::cuProfilerStart or ::cuProfilerStop without initialization.
     */
    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,

    /**
     * \deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an
     * error to call cuProfilerStart() when profiling is already enabled.
     */
    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,

    /**
     * \deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an
     * error to call cuProfilerStop() when profiling is already disabled.
     */
    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,

    /**
     * The CUDA driver that the application has loaded is a stub library.
     * Applications that run with the stub rather than a real driver loaded
     * will result in CUDA API returning this error.
     */
    CUDA_ERROR_STUB_LIBRARY                   = 34,

    /**
     * No CUDA-capable devices were detected by the installed CUDA driver.
     */
    CUDA_ERROR_NO_DEVICE                      = 100,

    /**
     * The device ordinal supplied by the user does not correspond to a
     * valid CUDA device, or the action requested is invalid for the
     * specified device.
     */
    CUDA_ERROR_INVALID_DEVICE                 = 101,

    /**
     * The Grid license is not applied.
     */
    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,

    /**
     * The device kernel image is invalid. This can also indicate an invalid
     * CUDA module.
     */
    CUDA_ERROR_INVALID_IMAGE                  = 200,

    /**
     * Most frequently, there is no context bound to the current thread.
     * This can also be returned if the context passed to an API call is not
     * a valid handle (such as a context that has had ::cuCtxDestroy()
     * invoked on it), or if a user mixes different API versions (i.e. 3010
     * context with 3020 API calls).
     * See ::cuCtxGetApiVersion() for more details.
     */
    CUDA_ERROR_INVALID_CONTEXT                = 201,

    /**
     * This indicated that the context being supplied as a parameter to the
     * API call was already the active context.
     * \deprecated
     * This error return is deprecated as of CUDA 3.2. It is no longer an
     * error to attempt to push the active context via ::cuCtxPushCurrent().
     */
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,

    /**
     * A map or register operation has failed.
     */
    CUDA_ERROR_MAP_FAILED                     = 205,

    /**
     * An unmap or unregister operation has failed.
     */
    CUDA_ERROR_UNMAP_FAILED                   = 206,

    /**
     * The specified array is currently mapped and thus cannot be destroyed.
     */
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,

    /**
     * The resource is already mapped.
     */
    CUDA_ERROR_ALREADY_MAPPED                 = 208,

    /**
     * There is no kernel image available that is suitable for the device.
     * This can occur when a user specifies code generation options for a
     * particular CUDA source file that do not include the corresponding
     * device configuration.
     */
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,

    /**
     * A resource has already been acquired.
     */
    CUDA_ERROR_ALREADY_ACQUIRED               = 210,

    /**
     * A resource is not mapped.
     */
    CUDA_ERROR_NOT_MAPPED                     = 211,

    /**
     * A mapped resource is not available for access as an array.
     */
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,

    /**
     * A mapped resource is not available for access as a pointer.
     */
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,

    /**
     * An uncorrectable ECC error was detected during execution.
     */
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,

    /**
     * The ::CUlimit passed to the API call is not supported by the active
     * device.
     */
    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,

    /**
     * The ::CUcontext passed to the API call can only be bound to a single
     * CPU thread at a time but is already bound to a CPU thread.
     */
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,

    /**
     * Peer access is not supported across the given devices.
     */
    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,

    /**
     * A PTX JIT compilation failed.
     */
    CUDA_ERROR_INVALID_PTX                    = 218,

    /**
     * An error with OpenGL or DirectX context.
     */
    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,

    /**
     * An uncorrectable NVLink error was detected during the execution.
     */
    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,

    /**
     * The PTX JIT compiler library was not found.
     */
    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,

    /**
     * The provided PTX was compiled with an unsupported toolchain.
     */
    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,

    /**
     * The PTX JIT compilation was disabled.
     */
    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,

    /**
     * The ::CUexecAffinityType passed to the API call is not supported by
     * the active device.
     */
    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,

    /**
     * The device kernel source is invalid.
     */
    CUDA_ERROR_INVALID_SOURCE                 = 300,

    /**
     * The file specified was not found.
     */
    CUDA_ERROR_FILE_NOT_FOUND                 = 301,

    /**
     * A link to a shared object failed to resolve.
     */
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    /**
     * Initialization of a shared object failed.
     */
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,

    /**
     * An OS call failed.
     */
    CUDA_ERROR_OPERATING_SYSTEM               = 304,

    /**
     * A resource handle passed to the API call was not valid. Resource
     * handles are opaque types like ::CUstream and ::CUevent.
     */
    CUDA_ERROR_INVALID_HANDLE                 = 400,

    /**
     * A resource required by the API call is not in a valid state to
     * perform the requested operation.
     */
    CUDA_ERROR_ILLEGAL_STATE                  = 401,

    /**
     * A named symbol was not found. Examples of symbols are global/constant
     * variable names, driver function names, texture names, and surface
     * names.
     */
    CUDA_ERROR_NOT_FOUND                      = 500,

    /**
     * Asynchronous operations issued previously have not completed yet.
     * This result is not actually an error, but must be indicated
     * differently than ::CUDA_SUCCESS (which indicates completion). Calls
     * that may return this value include ::cuEventQuery() and
     * ::cuStreamQuery().
     */
    CUDA_ERROR_NOT_READY                      = 600,

    /**
     * While executing a kernel, the device encountered a load or store
     * instruction on an invalid memory address.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,

    /**
     * A launch did not occur because it did not have appropriate resources.
     * This error usually indicates that the user has attempted to pass too
     * many arguments to the device kernel, or the kernel launch specifies
     * too many threads for the kernel's register count. Passing arguments
     * of the wrong size (i.e. a 64-bit pointer when a 32-bit int is
     * expected) is equivalent to passing too many arguments and can also
     * result in this error.
     */
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,

    /**
     * The device kernel took too long to execute. This can only occur if
     * timeouts are enabled - see the device attribute
     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,

    /**
     * A kernel launch uses an incompatible texturing mode.
     */
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,

    /**
     * A call to ::cuCtxEnablePeerAccess() is trying to re-enable peer
     * access to a context which has already had peer access to it enabled.
     */
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,

    /**
     * ::cuCtxDisablePeerAccess() is trying to disable peer access which has
     * not been enabled yet via ::cuCtxEnablePeerAccess().
     */
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,

    /**
     * The primary context for the specified device has already been
     * initialized.
     */
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,

    /**
     * The context current to the calling thread has been destroyed using
     * ::cuCtxDestroy, or is a primary context which has not yet been
     * initialized.
     */
    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,

    /**
     * A device-side assert triggered during kernel execution. The context
     * cannot be used anymore, and must be destroyed. All existing device
     * memory allocations from this context are invalid and must be
     * reconstructed if the program is to continue using CUDA.
     */
    CUDA_ERROR_ASSERT                         = 710,

    /**
     * The hardware resources required to enable peer access have been
     * exhausted for one or more of the devices passed to
     * ::cuCtxEnablePeerAccess().
     */
    CUDA_ERROR_TOO_MANY_PEERS                 = 711,

    /**
     * The memory range passed to ::cuMemHostRegister() has already been
     * registered.
     */
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,

    /**
     * The pointer passed to ::cuMemHostUnregister() does not correspond to
     * any currently registered memory region.
     */
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,

    /**
     * While executing a kernel, the device encountered a stack error.
     * This can be due to stack corruption or exceeding the stack size
     * limit.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,

    /**
     * While executing a kernel, the device encountered an illegal
     * instruction.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,

    /**
     * While executing a kernel, the device encountered a load or store
     * instruction on a memory address which is not aligned.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,

    /**
     * While executing a kernel, the device encountered an instruction which
     * can only operate on memory locations in certain address spaces
     * (global, shared, or local), but was supplied a memory address not
     * belonging to an allowed address space.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,

    /**
     * While executing a kernel, the device program counter wrapped its
     * address space.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_INVALID_PC                     = 718,

    /**
     * An exception occurred on the device while executing a kernel. Common
     * causes include dereferencing an invalid device pointer and accessing
     * out of bounds shared memory. Less common cases can be system
     * specific - more information about these cases can be found in the
     * system specific user guide.
     * This leaves the process in an inconsistent state and any further CUDA
     * work will return the same error. To continue using CUDA, the process
     * must be terminated and relaunched.
     */
    CUDA_ERROR_LAUNCH_FAILED                  = 719,

    /**
     * The number of blocks launched per grid for a kernel that was launched
     * via either ::cuLaunchCooperativeKernel or
     * ::cuLaunchCooperativeKernelMultiDevice exceeds the maximum number of
     * blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor or
     * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the
     * number of multiprocessors as specified by the device attribute
     * ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
     */
    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,

    /**
     * The attempted operation is not permitted.
     */
    CUDA_ERROR_NOT_PERMITTED                  = 800,

    /**
     * The attempted operation is not supported on the current system or
     * device.
     */
    CUDA_ERROR_NOT_SUPPORTED                  = 801,

    /**
     * The system is not yet ready to start any CUDA work. To continue using
     * CUDA, verify the system configuration is in a valid state and all
     * required driver daemons are actively running.
     * More information about this error can be found in the system specific
     * user guide.
     */
    CUDA_ERROR_SYSTEM_NOT_READY               = 802,

    /**
     * There is a mismatch between the versions of the display driver and
     * the CUDA driver. Refer to the compatibility documentation for
     * supported versions.
     */
    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,

    /**
     * The system was upgraded to run with forward compatibility but the
     * visible hardware detected by CUDA does not support this
     * configuration. Refer to the compatibility documentation for the
     * supported hardware matrix or ensure that only supported hardware is
     * visible during initialization via the CUDA_VISIBLE_DEVICES
     * environment variable.
     */
    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,

    /**
     * The MPS client failed to connect to the MPS control daemon or the MPS
     * server.
     */
    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,

    /**
     * The remote procedural call between the MPS server and the MPS client
     * failed.
     */
    CUDA_ERROR_MPS_RPC_FAILURE                = 806,

    /**
     * The MPS server is not ready to accept new MPS client requests.
     * This error can be returned when the MPS server is in the process of
     * recovering from a fatal failure.
     */
    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,

    /**
     * The hardware resources required to create MPS client have been
     * exhausted.
     */
    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,

    /**
     * The hardware resources required to support device connections have
     * been exhausted.
     */
    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,

    /**
     * The operation is not permitted when the stream is capturing.
     */
    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,

    /**
     * The current capture sequence on the stream has been invalidated due
     * to a previous error.
     */
    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,

    /**
     * The operation would have resulted in a merge of two independent
     * capture sequences.
     */
    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,

    /**
     * The capture was not initiated in this stream.
     */
    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,

    /**
     * The capture sequence contains a fork that was not joined to the
     * primary stream.
     */
    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,

    /**
     * A dependency would have been created which crosses the capture
     * sequence boundary. Only implicit in-stream ordering dependencies are
     * allowed to cross the boundary.
     */
    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,

    /**
     * A disallowed implicit dependency on a current capture sequence from
     * cudaStreamLegacy.
     */
    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,

    /**
     * The operation is not permitted on an event which was last recorded in
     * a capturing stream.
     */
    CUDA_ERROR_CAPTURED_EVENT                 = 907,

    /**
     * A stream capture sequence not initiated with the
     * ::CU_STREAM_CAPTURE_MODE_RELAXED argument to ::cuStreamBeginCapture
     * was passed to ::cuStreamEndCapture in a different thread.
     */
    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,

    /**
     * The timeout specified for the wait operation has lapsed.
     */
    CUDA_ERROR_TIMEOUT                        = 909,

    /**
     * The graph update was not performed because it included changes which
     * violated constraints specific to instantiated graph update.
     */
    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,

    /**
     * An async error has occurred in a device outside of CUDA. If CUDA was
     * waiting for an external device's signal before consuming shared data,
     * the external device signaled an error indicating that the data is not
     * valid for consumption. This leaves the process in an inconsistent
     * state and any further CUDA work will return the same error. To
     * continue using CUDA, the process must be terminated and relaunched.
     */
    CUDA_ERROR_EXTERNAL_DEVICE                = 911,

    /**
     * An unknown internal error has occurred.
     */
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;
1976 | |
/**
 * Peer-to-peer (P2P) attributes
 */
typedef enum CUdevice_P2PAttribute_enum
{
    /** A relative value indicating the performance of the link between two devices */
    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK            = 0x01,

    /** P2P access is enabled */
    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED            = 0x02,

    /** Atomic operation over the link supported */
    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED     = 0x03,

    /** \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED     = 0x04,

    /** Accessing CUDA arrays over the link supported (same value as the deprecated name above) */
    CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04
} CUdevice_P2PAttribute;
1987 | |
1988 | /** |
1989 | * CUDA stream callback |
1990 | * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. |
1991 | * \param status ::CUDA_SUCCESS or any persistent error on the stream. |
1992 | * \param userData User parameter provided at registration. |
1993 | */ |
1994 | typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); |
1995 | |
1996 | /** |
1997 | * Block size to per-block dynamic shared memory mapping for a certain |
1998 | * kernel \param blockSize Block size of the kernel. |
1999 | * |
2000 | * \return The dynamic shared memory needed by a block. |
2001 | */ |
2002 | typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); |
2003 | |
/**
 * Flag for ::cuMemHostAlloc():
 * when set, host memory is portable between CUDA contexts.
 */
#define CU_MEMHOSTALLOC_PORTABLE        0x01

/**
 * Flag for ::cuMemHostAlloc():
 * when set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 */
#define CU_MEMHOSTALLOC_DEVICEMAP       0x02

/**
 * Flag for ::cuMemHostAlloc():
 * when set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
2024 | |
/**
 * Flag for ::cuMemHostRegister():
 * when set, host memory is portable between CUDA contexts.
 */
#define CU_MEMHOSTREGISTER_PORTABLE     0x01

/**
 * Flag for ::cuMemHostRegister():
 * when set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 */
#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02

/**
 * Flag for ::cuMemHostRegister():
 * when set, the passed memory pointer is treated as pointing to some
 * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
 *
 * Per-platform behavior:
 * - Windows: the flag is a no-op.
 * - Linux: that memory is marked as non cache-coherent for the GPU and
 *   is expected to be physically contiguous. It may return
 *   CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
 *   CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
 * - All other platforms: it is not supported and CUDA_ERROR_NOT_SUPPORTED
 *   is returned.
 */
#define CU_MEMHOSTREGISTER_IOMEMORY     0x04

/**
 * Flag for ::cuMemHostRegister():
 * when set, the passed memory pointer is treated as pointing to memory that
 * is considered read-only by the device.
 *
 * On platforms without
 * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag
 * is required in order to register memory mapped to the CPU as read-only.
 * Support for the use of this flag can be queried from the device attribute
 * CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
 * a current context associated with a device that does not have this
 * attribute set will cause ::cuMemHostRegister to error with
 * CUDA_ERROR_NOT_SUPPORTED.
 */
#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
2063 | |
2064 | /** |
2065 | * 2D memory copy parameters |
2066 | */ |
2067 | typedef struct CUDA_MEMCPY2D_st { |
2068 | size_t srcXInBytes; /**< Source X in bytes */ |
2069 | size_t srcY; /**< Source Y */ |
2070 | |
2071 | CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
2072 | const void *srcHost; /**< Source host pointer */ |
2073 | CUdeviceptr srcDevice; /**< Source device pointer */ |
2074 | CUarray srcArray; /**< Source array reference */ |
2075 | size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
2076 | |
2077 | size_t dstXInBytes; /**< Destination X in bytes */ |
2078 | size_t dstY; /**< Destination Y */ |
2079 | |
2080 | CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
2081 | void *dstHost; /**< Destination host pointer */ |
2082 | CUdeviceptr dstDevice; /**< Destination device pointer */ |
2083 | CUarray dstArray; /**< Destination array reference */ |
2084 | size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
2085 | |
2086 | size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ |
2087 | size_t Height; /**< Height of 2D memory copy */ |
2088 | } CUDA_MEMCPY2D_v2; |
2089 | typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; |
2090 | |
2091 | /** |
2092 | * 3D memory copy parameters |
2093 | */ |
2094 | typedef struct CUDA_MEMCPY3D_st { |
2095 | size_t srcXInBytes; /**< Source X in bytes */ |
2096 | size_t srcY; /**< Source Y */ |
2097 | size_t srcZ; /**< Source Z */ |
2098 | size_t srcLOD; /**< Source LOD */ |
2099 | CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
2100 | const void *srcHost; /**< Source host pointer */ |
2101 | CUdeviceptr srcDevice; /**< Source device pointer */ |
2102 | CUarray srcArray; /**< Source array reference */ |
2103 | void *reserved0; /**< Must be NULL */ |
2104 | size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
2105 | size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
2106 | |
2107 | size_t dstXInBytes; /**< Destination X in bytes */ |
2108 | size_t dstY; /**< Destination Y */ |
2109 | size_t dstZ; /**< Destination Z */ |
2110 | size_t dstLOD; /**< Destination LOD */ |
2111 | CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
2112 | void *dstHost; /**< Destination host pointer */ |
2113 | CUdeviceptr dstDevice; /**< Destination device pointer */ |
2114 | CUarray dstArray; /**< Destination array reference */ |
2115 | void *reserved1; /**< Must be NULL */ |
2116 | size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
2117 | size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
2118 | |
2119 | size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ |
2120 | size_t Height; /**< Height of 3D memory copy */ |
2121 | size_t Depth; /**< Depth of 3D memory copy */ |
2122 | } CUDA_MEMCPY3D_v2; |
2123 | typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; |
2124 | |
2125 | /** |
2126 | * 3D memory cross-context copy parameters |
2127 | */ |
2128 | typedef struct CUDA_MEMCPY3D_PEER_st { |
2129 | size_t srcXInBytes; /**< Source X in bytes */ |
2130 | size_t srcY; /**< Source Y */ |
2131 | size_t srcZ; /**< Source Z */ |
2132 | size_t srcLOD; /**< Source LOD */ |
2133 | CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
2134 | const void *srcHost; /**< Source host pointer */ |
2135 | CUdeviceptr srcDevice; /**< Source device pointer */ |
2136 | CUarray srcArray; /**< Source array reference */ |
2137 | CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ |
2138 | size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
2139 | size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
2140 | |
2141 | size_t dstXInBytes; /**< Destination X in bytes */ |
2142 | size_t dstY; /**< Destination Y */ |
2143 | size_t dstZ; /**< Destination Z */ |
2144 | size_t dstLOD; /**< Destination LOD */ |
2145 | CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
2146 | void *dstHost; /**< Destination host pointer */ |
2147 | CUdeviceptr dstDevice; /**< Destination device pointer */ |
2148 | CUarray dstArray; /**< Destination array reference */ |
2149 | CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ |
2150 | size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
2151 | size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
2152 | |
2153 | size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ |
2154 | size_t Height; /**< Height of 3D memory copy */ |
2155 | size_t Depth; /**< Depth of 3D memory copy */ |
2156 | } CUDA_MEMCPY3D_PEER_v1; |
2157 | typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; |
2158 | |
2159 | /** |
2160 | * Array descriptor |
2161 | */ |
2162 | typedef struct CUDA_ARRAY_DESCRIPTOR_st |
2163 | { |
2164 | size_t Width; /**< Width of array */ |
2165 | size_t Height; /**< Height of array */ |
2166 | |
2167 | CUarray_format Format; /**< Array format */ |
2168 | unsigned int NumChannels; /**< Channels per array element */ |
2169 | } CUDA_ARRAY_DESCRIPTOR_v2; |
2170 | typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; |
2171 | |
2172 | /** |
2173 | * 3D array descriptor |
2174 | */ |
2175 | typedef struct CUDA_ARRAY3D_DESCRIPTOR_st |
2176 | { |
2177 | size_t Width; /**< Width of 3D array */ |
2178 | size_t Height; /**< Height of 3D array */ |
2179 | size_t Depth; /**< Depth of 3D array */ |
2180 | |
2181 | CUarray_format Format; /**< Array format */ |
2182 | unsigned int NumChannels; /**< Channels per array element */ |
2183 | unsigned int Flags; /**< Flags */ |
2184 | } CUDA_ARRAY3D_DESCRIPTOR_v2; |
2185 | typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; |
2186 | |
/**
 * Set when the layered sparse CUDA array or CUDA mipmapped array has a
 * single mip tail region for all layers.
 */
#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL   0x1
2191 | |
/**
 * Sparse properties of a CUDA array
 */
typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st
{
    /** Extent of a sparse tile, in elements */
    struct
    {
        /** Width of sparse tile in elements */
        unsigned int width;
        /** Height of sparse tile in elements */
        unsigned int height;
        /** Depth of sparse tile in elements */
        unsigned int depth;
    } tileExtent;

    /** First mip level at which the mip tail begins. */
    unsigned int miptailFirstLevel;

    /** Total size of the mip tail. */
    unsigned long long miptailSize;

    /** Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL */
    unsigned int flags;

    unsigned int reserved[4];
} CUDA_ARRAY_SPARSE_PROPERTIES_v1;

/** Unversioned name for ::CUDA_ARRAY_SPARSE_PROPERTIES_v1 */
typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES;
2217 | |
2218 | /** |
2219 | * CUDA Resource descriptor |
2220 | */ |
2221 | typedef struct CUDA_RESOURCE_DESC_st |
2222 | { |
2223 | CUresourcetype resType; /**< Resource type */ |
2224 | |
2225 | union { |
2226 | struct { |
2227 | CUarray hArray; /**< CUDA array */ |
2228 | } array; |
2229 | struct { |
2230 | CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ |
2231 | } mipmap; |
2232 | struct { |
2233 | CUdeviceptr devPtr; /**< Device pointer */ |
2234 | CUarray_format format; /**< Array format */ |
2235 | unsigned int numChannels; /**< Channels per array element */ |
2236 | size_t sizeInBytes; /**< Size in bytes */ |
2237 | } linear; |
2238 | struct { |
2239 | CUdeviceptr devPtr; /**< Device pointer */ |
2240 | CUarray_format format; /**< Array format */ |
2241 | unsigned int numChannels; /**< Channels per array element */ |
2242 | size_t width; /**< Width of the array in elements */ |
2243 | size_t height; /**< Height of the array in elements */ |
2244 | size_t pitchInBytes; /**< Pitch between two rows in bytes */ |
2245 | } pitch2D; |
2246 | struct { |
2247 | int reserved[32]; |
2248 | } reserved; |
2249 | } res; |
2250 | |
2251 | unsigned int flags; /**< Flags (must be zero) */ |
2252 | } CUDA_RESOURCE_DESC_v1; |
2253 | typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; |
2254 | |
2255 | /** |
2256 | * Texture descriptor |
2257 | */ |
2258 | typedef struct CUDA_TEXTURE_DESC_st { |
2259 | CUaddress_mode addressMode[3]; /**< Address modes */ |
2260 | CUfilter_mode filterMode; /**< Filter mode */ |
2261 | unsigned int flags; /**< Flags */ |
2262 | unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ |
2263 | CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ |
2264 | float mipmapLevelBias; /**< Mipmap level bias */ |
2265 | float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ |
2266 | float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ |
2267 | float borderColor[4]; /**< Border Color */ |
2268 | int reserved[12]; |
2269 | } CUDA_TEXTURE_DESC_v1; |
2270 | typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; |
2271 | |
/**
 * Formats available for a resource view
 */
typedef enum CUresourceViewFormat_enum {
    /** No resource view format (use underlying resource format) */
    CU_RES_VIEW_FORMAT_NONE          = 0x00,

    /* Unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< Unsigned 8-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< Unsigned 8-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< Unsigned 8-bit integers, 4 channel */

    /* Signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< Signed 8-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< Signed 8-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< Signed 8-bit integers, 4 channel */

    /* Unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< Unsigned 16-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< Unsigned 16-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< Unsigned 16-bit integers, 4 channel */

    /* Signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< Signed 16-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< Signed 16-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< Signed 16-bit integers, 4 channel */

    /* Unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< Unsigned 32-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< Unsigned 32-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< Unsigned 32-bit integers, 4 channel */

    /* Signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< Signed 32-bit integers, 1 channel */
    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< Signed 32-bit integers, 2 channel */
    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< Signed 32-bit integers, 4 channel */

    /* 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 16-bit floating point, 1 channel */
    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 16-bit floating point, 2 channel */
    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 16-bit floating point, 4 channel */

    /* 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 32-bit floating point, 1 channel */
    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 32-bit floating point, 2 channel */
    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 32-bit floating point, 4 channel */

    /* Block compressed */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
} CUresourceViewFormat;
2313 | |
2314 | /** |
2315 | * Resource view descriptor |
2316 | */ |
2317 | typedef struct CUDA_RESOURCE_VIEW_DESC_st |
2318 | { |
2319 | CUresourceViewFormat format; /**< Resource view format */ |
2320 | size_t width; /**< Width of the resource view */ |
2321 | size_t height; /**< Height of the resource view */ |
2322 | size_t depth; /**< Depth of the resource view */ |
2323 | unsigned int firstMipmapLevel; /**< First defined mipmap level */ |
2324 | unsigned int lastMipmapLevel; /**< Last defined mipmap level */ |
2325 | unsigned int firstLayer; /**< First layer index */ |
2326 | unsigned int lastLayer; /**< Last layer index */ |
2327 | unsigned int reserved[16]; |
2328 | } CUDA_RESOURCE_VIEW_DESC_v1; |
2329 | typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; |
2330 | |
/**
 * Tokens for GPU Direct v3
 */
typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
{
    unsigned long long p2pToken;
    unsigned int       vaSpaceToken;
} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;

/** Unversioned name for ::CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 */
typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
2339 | |
/**
 * Access flags that specify the level of access the current context's
 * device has on the memory referenced.
 */
typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum
{
    /**
     * No access, meaning the device cannot access this memory at all, thus
     * must be staged through accessible memory in order to complete certain
     * operations
     */
    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,

    /**
     * Read-only access, meaning writes to this memory are considered invalid
     * accesses and thus return error in that case.
     */
    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,

    /**
     * Read-write access, the device has full read-write access to the memory
     */
    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3
} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
2349 | |
2350 | /** |
2351 | * Kernel launch parameters |
2352 | */ |
2353 | typedef struct CUDA_LAUNCH_PARAMS_st { |
2354 | CUfunction function; /**< Kernel to launch */ |
2355 | unsigned int gridDimX; /**< Width of grid in blocks */ |
2356 | unsigned int gridDimY; /**< Height of grid in blocks */ |
2357 | unsigned int gridDimZ; /**< Depth of grid in blocks */ |
2358 | unsigned int blockDimX; /**< X dimension of each thread block */ |
2359 | unsigned int blockDimY; /**< Y dimension of each thread block */ |
2360 | unsigned int blockDimZ; /**< Z dimension of each thread block */ |
2361 | unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ |
2362 | CUstream hStream; /**< Stream identifier */ |
2363 | void **kernelParams; /**< Array of pointers to kernel parameters */ |
2364 | } CUDA_LAUNCH_PARAMS_v1; |
2365 | typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; |
2366 | |
/**
 * Handle types for external memory
 */
typedef enum CUexternalMemoryHandleType_enum
{
    /** Handle is an opaque file descriptor */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,

    /** Handle is an opaque shared NT handle */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,

    /** Handle is an opaque, globally shared handle */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,

    /** Handle is a D3D12 heap object */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,

    /** Handle is a D3D12 committed resource */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,

    /** Handle is a shared NT handle to a D3D11 resource */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,

    /** Handle is a globally shared handle to a D3D11 resource */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,

    /** Handle is an NvSciBuf object */
    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
} CUexternalMemoryHandleType;
2404 | |
/**
 * Marks the external memory object as a dedicated resource
 */
#define CUDA_EXTERNAL_MEMORY_DEDICATED                          0x1

/**
 * Flag for the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.
 *
 * When present, it indicates that signaling an external semaphore object
 * should skip performing appropriate memory synchronization operations over
 * all the external memory objects that are imported as
 * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, which otherwise are performed by
 * default to ensure data coherency with other importers of the same NvSciBuf
 * memory objects.
 */
#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC    0x01

/**
 * Flag for the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.
 *
 * When present, it indicates that waiting on an external semaphore object
 * should skip performing appropriate memory synchronization operations over
 * all the external memory objects that are imported as
 * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, which otherwise are performed by
 * default to ensure data coherency with other importers of the same NvSciBuf
 * memory objects.
 */
#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC      0x02

/**
 * Value for \p flags of ::cuDeviceGetNvSciSyncAttributes indicating that the
 * application needs signaler specific NvSciSyncAttr to be filled by
 * ::cuDeviceGetNvSciSyncAttributes.
 */
#define CUDA_NVSCISYNC_ATTR_SIGNAL                              0x1

/**
 * Value for \p flags of ::cuDeviceGetNvSciSyncAttributes indicating that the
 * application needs waiter specific NvSciSyncAttr to be filled by
 * ::cuDeviceGetNvSciSyncAttributes.
 */
#define CUDA_NVSCISYNC_ATTR_WAIT                                0x2
2441 | /** |
2442 | * External memory handle descriptor |
2443 | */ |
2444 | typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { |
2445 | /** |
2446 | * Type of the handle |
2447 | */ |
2448 | CUexternalMemoryHandleType type; |
2449 | union { |
2450 | /** |
2451 | * File descriptor referencing the memory object. Valid |
2452 | * when type is |
2453 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD |
2454 | */ |
2455 | int fd; |
2456 | /** |
2457 | * Win32 handle referencing the semaphore object. Valid when |
2458 | * type is one of the following: |
2459 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 |
2460 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT |
2461 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP |
2462 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE |
2463 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE |
2464 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
2465 | * Exactly one of 'handle' and 'name' must be non-NULL. If |
2466 | * type is one of the following: |
2467 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT |
2468 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
2469 | * then 'name' must be NULL. |
2470 | */ |
2471 | struct { |
2472 | /** |
2473 | * Valid NT handle. Must be NULL if 'name' is non-NULL |
2474 | */ |
2475 | void *handle; |
2476 | /** |
2477 | * Name of a valid memory object. |
2478 | * Must be NULL if 'handle' is non-NULL. |
2479 | */ |
2480 | const void *name; |
2481 | } win32; |
2482 | /** |
2483 | * A handle representing an NvSciBuf Object. Valid when type |
2484 | * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF |
2485 | */ |
2486 | const void *nvSciBufObject; |
2487 | } handle; |
2488 | /** |
2489 | * Size of the memory allocation |
2490 | */ |
2491 | unsigned long long size; |
2492 | /** |
2493 | * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED |
2494 | */ |
2495 | unsigned int flags; |
2496 | unsigned int reserved[16]; |
2497 | } CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; |
2498 | typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; |
2499 | |
/**
 * Descriptor of an external memory buffer
 */
typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st
{
    /** Offset into the memory object where the buffer's base is */
    unsigned long long offset;

    /** Size of the buffer */
    unsigned long long size;

    /** Flags reserved for future use. Must be zero. */
    unsigned int flags;

    unsigned int reserved[16];
} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;

/** Unversioned name for ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 */
typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
2519 | |
2520 | /** |
2521 | * External memory mipmap descriptor |
2522 | */ |
2523 | typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { |
2524 | /** |
2525 | * Offset into the memory object where the base level of the |
2526 | * mipmap chain is. |
2527 | */ |
2528 | unsigned long long offset; |
2529 | /** |
2530 | * Format, dimension and type of base level of the mipmap chain |
2531 | */ |
2532 | CUDA_ARRAY3D_DESCRIPTOR arrayDesc; |
2533 | /** |
2534 | * Total number of levels in the mipmap chain |
2535 | */ |
2536 | unsigned int numLevels; |
2537 | unsigned int reserved[16]; |
2538 | } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; |
2539 | typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; |
2540 | |
/**
 * Handle types for external semaphores
 */
typedef enum CUexternalSemaphoreHandleType_enum
{
    /** Handle is an opaque file descriptor */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,

    /** Handle is an opaque shared NT handle */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,

    /** Handle is an opaque, globally shared handle */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,

    /** Handle is a shared NT handle referencing a D3D12 fence object */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,

    /** Handle is a shared NT handle referencing a D3D11 fence object */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,

    /** Opaque handle to NvSciSync Object */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,

    /** Handle is a shared NT handle referencing a D3D11 keyed mutex object */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,

    /** Handle is a globally shared handle referencing a D3D11 keyed mutex object */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,

    /** Handle is an opaque file descriptor referencing a timeline semaphore */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,

    /** Handle is an opaque shared NT handle referencing a timeline semaphore */
    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
} CUexternalSemaphoreHandleType;
2586 | |
2587 | /** |
2588 | * External semaphore handle descriptor |
2589 | */ |
2590 | typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { |
2591 | /** |
2592 | * Type of the handle |
2593 | */ |
2594 | CUexternalSemaphoreHandleType type; |
2595 | union { |
2596 | /** |
2597 | * File descriptor referencing the semaphore object. Valid |
2598 | * when type is one of the following: |
2599 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD |
2600 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD |
2601 | */ |
2602 | int fd; |
2603 | /** |
2604 | * Win32 handle referencing the semaphore object. Valid when |
2605 | * type is one of the following: |
2606 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 |
2607 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
2608 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE |
2609 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE |
2610 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX |
2611 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 |
2612 | * Exactly one of 'handle' and 'name' must be non-NULL. If |
2613 | * type is one of the following: |
2614 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
2615 | * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
2616 | * then 'name' must be NULL. |
2617 | */ |
2618 | struct { |
2619 | /** |
2620 | * Valid NT handle. Must be NULL if 'name' is non-NULL |
2621 | */ |
2622 | void *handle; |
2623 | /** |
2624 | * Name of a valid synchronization primitive. |
2625 | * Must be NULL if 'handle' is non-NULL. |
2626 | */ |
2627 | const void *name; |
2628 | } win32; |
2629 | /** |
2630 | * Valid NvSciSyncObj. Must be non NULL |
2631 | */ |
2632 | const void* nvSciSyncObj; |
2633 | } handle; |
2634 | /** |
2635 | * Flags reserved for the future. Must be zero. |
2636 | */ |
2637 | unsigned int flags; |
2638 | unsigned int reserved[16]; |
2639 | } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; |
2640 | typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; |
2641 | |
/**
 * Parameters for signaling an external semaphore
 */
typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st
{
    struct
    {
        /** Parameters for fence objects */
        struct
        {
            /** Value of fence to be signaled */
            unsigned long long value;
        } fence;

        union
        {
            /**
             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
             */
            void *fence;
            unsigned long long reserved;
        } nvSciSync;

        /** Parameters for keyed mutex objects */
        struct
        {
            /** Value of key to release the mutex with */
            unsigned long long key;
        } keyedMutex;

        unsigned int reserved[12];
    } params;

    /**
     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
     * signal a ::CUexternalSemaphore of type
     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates
     * that while signaling the ::CUexternalSemaphore, no memory
     * synchronization operations should be performed for any external memory
     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
     * For all other types of ::CUexternalSemaphore, flags must be zero.
     */
    unsigned int flags;

    unsigned int reserved[16];
} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;

/** Unversioned name for ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 */
typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
2689 | |
2690 | /** |
2691 | * External semaphore wait parameters |
2692 | */ |
2693 | typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { |
2694 | struct { |
2695 | /** |
2696 | * Parameters for fence objects |
2697 | */ |
2698 | struct { |
2699 | /** |
2700 | * Value of fence to be waited on |
2701 | */ |
2702 | unsigned long long value; |
2703 | } fence; |
2704 | /** |
2705 | * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType |
2706 | * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. |
2707 | */ |
2708 | union { |
2709 | void *fence; |
2710 | unsigned long long reserved; /**< Shares storage with 'fence' in this union */ |
2711 | } nvSciSync; |
2712 | /** |
2713 | * Parameters for keyed mutex objects |
2714 | */ |
2715 | struct { |
2716 | /** |
2717 | * Value of key to acquire the mutex with |
2718 | */ |
2719 | unsigned long long key; |
2720 | /** |
2721 | * Timeout in milliseconds to wait to acquire the mutex |
2722 | */ |
2723 | unsigned int timeoutMs; |
2724 | } keyedMutex; |
2725 | unsigned int reserved[10]; |
2726 | } params; |
2727 | /** |
2728 | * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on |
2729 | * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
2730 | * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC |
2731 | * which indicates that while waiting for the ::CUexternalSemaphore, no memory |
2732 | * synchronization operations should be performed for any external memory |
2733 | * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. |
2734 | * For all other types of ::CUexternalSemaphore, flags must be zero. |
2735 | */ |
2736 | unsigned int flags; |
2737 | unsigned int reserved[16]; |
2738 | } CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; |
2739 | typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; |
2740 | |
2741 | /** |
2742 | * External semaphore signal node parameters |
2743 | */ |
2744 | typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { |
2745 | CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ |
2746 | const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ |
2747 | unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ |
2748 | } CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; |
2749 | typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; |
2750 | |
2751 | /** |
2752 | * External semaphore wait node parameters |
2753 | */ |
2754 | typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { |
2755 | CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ |
2756 | const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ |
2757 | unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ |
2758 | } CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; |
2759 | typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; |
2760 | |
2761 | typedef unsigned long long CUmemGenericAllocationHandle_v1; /**< Generic memory allocation handle, stored as an unsigned long long */ |
2762 | typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; /**< Unversioned alias of ::CUmemGenericAllocationHandle_v1 */ |
2763 | |
2764 | /** |
2765 | * Flags for specifying particular handle types |
2766 | */ |
2767 | typedef enum CUmemAllocationHandleType_enum { |
2768 | CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. */ |
2769 | CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ |
2770 | CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ |
2771 | CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ |
2772 | CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF /* NOTE(review): presumably a sentinel forcing a 32-bit enum range; confirm */ |
2773 | } CUmemAllocationHandleType; |
2774 | |
2775 | /** |
2776 | * Specifies the memory protection flags for mapping. |
2777 | */ |
2778 | typedef enum CUmemAccess_flags_enum { |
2779 | CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ |
2780 | CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ |
2781 | CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ |
2782 | CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF /* NOTE(review): presumably a sentinel forcing a 32-bit enum range; confirm */ |
2783 | } CUmemAccess_flags; |
2784 | |
2785 | /** |
2786 | * Specifies the type of location |
2787 | */ |
2788 | typedef enum CUmemLocationType_enum { |
2789 | CU_MEM_LOCATION_TYPE_INVALID = 0x0, |
2790 | CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ |
2791 | CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF /* NOTE(review): presumably a sentinel forcing a 32-bit enum range; confirm */ |
2792 | } CUmemLocationType; |
2793 | |
2794 | /** |
2795 | * Defines the allocation types available |
2796 | */ |
2797 | typedef enum CUmemAllocationType_enum { |
2798 | CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, |
2799 | |
2800 | /** This allocation type is 'pinned', i.e. cannot migrate from its current |
2801 | * location while the application is actively using it |
2802 | */ |
2803 | CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, |
2804 | CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF /* NOTE(review): presumably a sentinel forcing a 32-bit enum range; confirm */ |
2805 | } CUmemAllocationType; |
2806 | |
2807 | /** |
2808 | * Flags for requesting either the minimum required or the recommended granularity for an allocation. |
2809 | */ |
2810 | typedef enum CUmemAllocationGranularity_flags_enum { |
2811 | CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ |
2812 | CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ |
2813 | } CUmemAllocationGranularity_flags; |
2814 | |
2815 | /** |
2816 | * Sparse subresource types (type of the subresourceType field of ::CUarrayMapInfo) |
2817 | */ |
2818 | typedef enum CUarraySparseSubresourceType_enum { |
2819 | CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, |
2820 | CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 |
2821 | } CUarraySparseSubresourceType; |
2822 | |
2823 | /** |
2824 | * Memory operation types (type of the memOperationType field of ::CUarrayMapInfo) |
2825 | */ |
2826 | typedef enum CUmemOperationType_enum { |
2827 | CU_MEM_OPERATION_TYPE_MAP = 1, |
2828 | CU_MEM_OPERATION_TYPE_UNMAP = 2 |
2829 | } CUmemOperationType; |
2830 | |
2831 | /** |
2832 | * Memory handle types (type of the memHandleType field of ::CUarrayMapInfo) |
2833 | */ |
2834 | typedef enum CUmemHandleType_enum { |
2835 | CU_MEM_HANDLE_TYPE_GENERIC = 0 |
2836 | } CUmemHandleType; |
2837 | |
2838 | /** |
2839 | * Specifies the CUDA array or CUDA mipmapped array memory mapping information |
2840 | */ |
2841 | typedef struct CUarrayMapInfo_st { |
2842 | CUresourcetype resourceType; /**< Resource type */ |
2843 | |
2844 | union { |
2845 | CUmipmappedArray mipmap; |
2846 | CUarray array; |
2847 | } resource; |
2848 | |
2849 | CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ |
2850 | |
2851 | union { |
2852 | struct { |
2853 | unsigned int level; /**< For CUDA mipmapped arrays must be a valid mipmap level. For CUDA arrays must be zero */ |
2854 | unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ |
2855 | unsigned int offsetX; /**< Starting X offset in elements */ |
2856 | unsigned int offsetY; /**< Starting Y offset in elements */ |
2857 | unsigned int offsetZ; /**< Starting Z offset in elements */ |
2858 | unsigned int extentWidth; /**< Width in elements */ |
2859 | unsigned int extentHeight; /**< Height in elements */ |
2860 | unsigned int extentDepth; /**< Depth in elements */ |
2861 | } sparseLevel; |
2862 | struct { |
2863 | unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ |
2864 | unsigned long long offset; /**< Offset within mip tail */ |
2865 | unsigned long long size; /**< Extent in bytes */ |
2866 | } miptail; |
2867 | } subresource; |
2868 | |
2869 | CUmemOperationType memOperationType; /**< Memory operation type */ |
2870 | CUmemHandleType memHandleType; /**< Memory handle type */ |
2871 | |
2872 | union { |
2873 | CUmemGenericAllocationHandle memHandle; |
2874 | } memHandle; |
2875 | |
2876 | unsigned long long offset; /**< Offset within the memory */ |
2877 | unsigned int deviceBitMask; /**< Device ordinal bit mask */ |
2878 | unsigned int flags; /**< flags for future use, must be zero now. */ |
2879 | unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ |
2880 | } CUarrayMapInfo_v1; |
2881 | typedef CUarrayMapInfo_v1 CUarrayMapInfo; |
2882 | |
2883 | /** |
2884 | * Specifies a memory location. |
2885 | */ |
2886 | typedef struct CUmemLocation_st { |
2887 | CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ |
2888 | int id; /**< Identifier for this location, interpreted according to its ::CUmemLocationType. */ |
2889 | } CUmemLocation_v1; |
2890 | typedef CUmemLocation_v1 CUmemLocation; |
2891 | |
2892 | /** |
2893 | * Specifies the compression attribute for an allocation. |
2894 | */ |
2895 | typedef enum CUmemAllocationCompType_enum { |
2896 | CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ |
2897 | CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ |
2898 | } CUmemAllocationCompType; |
2899 | |
2900 | /** |
2901 | * This flag, if set, indicates that the memory will be used as a tile pool. |
2902 | */ |
2903 | #define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 |
2904 | |
2905 | /** |
2906 | * Specifies the allocation properties for an allocation. |
2907 | */ |
2908 | typedef struct CUmemAllocationProp_st { |
2909 | /** Allocation type */ |
2910 | CUmemAllocationType type; |
2911 | /** requested ::CUmemAllocationHandleType */ |
2912 | CUmemAllocationHandleType requestedHandleTypes; |
2913 | /** Location of allocation */ |
2914 | CUmemLocation location; |
2915 | /** |
2916 | * Windows-specific POBJECT_ATTRIBUTES required when |
2917 | * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure |
2918 | * includes security attributes that define |
2919 | * the scope of which exported allocations may be transferred to other |
2920 | * processes. In all other cases, this field is required to be zero. |
2921 | */ |
2922 | void *win32HandleMetaData; |
2923 | struct { |
2924 | /** |
2925 | * Allocation hint for requesting compressible memory. |
2926 | * On devices that support Compute Data Compression, compressible |
2927 | * memory can be used to accelerate accesses to data with unstructured |
2928 | * sparsity and other compressible data patterns. Applications are |
2929 | * expected to query allocation property of the handle obtained with |
2930 | * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to |
2931 | * validate if the obtained allocation is compressible or not. Note that |
2932 | * compressed memory may not be mappable on all devices. |
2933 | */ |
2934 | unsigned char compressionType; |
2935 | unsigned char gpuDirectRDMACapable; |
2936 | /** Bitmask indicating intended usage for this allocation */ |
2937 | unsigned short usage; |
2938 | unsigned char reserved[4]; |
2939 | } allocFlags; |
2940 | } CUmemAllocationProp_v1; |
2941 | typedef CUmemAllocationProp_v1 CUmemAllocationProp; |
2942 | |
2943 | /** |
2944 | * Memory access descriptor |
2945 | */ |
2946 | typedef struct CUmemAccessDesc_st { |
2947 | CUmemLocation location; /**< Location on which the request is to change its accessibility */ |
2948 | CUmemAccess_flags flags; /**< ::CUmemAccess_flags accessibility flags to set on the request */ |
2949 | } CUmemAccessDesc_v1; |
2950 | typedef CUmemAccessDesc_v1 CUmemAccessDesc; |
2951 | |
2952 | typedef enum CUgraphExecUpdateResult_enum { |
2953 | CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ |
2954 | CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ |
2955 | CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ |
2956 | CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ |
2957 | CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ |
2958 | CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ |
2959 | CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ |
2960 | CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7 /**< The update failed because the function of a kernel node changed in an unsupported way */ |
2961 | } CUgraphExecUpdateResult; /**< Outcome of a graph exec update: success, or the reason the update failed */ |
2962 | |
2963 | /** |
2964 | * CUDA memory pool attributes |
2965 | */ |
2966 | typedef enum CUmemPool_attribute_enum { |
2967 | /** |
2968 | * (value type = int) |
2969 | * Allow cuMemAllocAsync to use memory asynchronously freed |
2970 | * in other streams as long as a stream ordering dependency |
2971 | * of the allocating stream on the free action exists. |
2972 | * Cuda events and null stream interactions can create the required |
2973 | * stream ordered dependencies. (default enabled) |
2974 | */ |
2975 | CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, |
2976 | |
2977 | /** |
2978 | * (value type = int) |
2979 | * Allow reuse of already completed frees when there is no dependency |
2980 | * between the free and allocation. (default enabled) |
2981 | */ |
2982 | CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, |
2983 | |
2984 | /** |
2985 | * (value type = int) |
2986 | * Allow cuMemAllocAsync to insert new stream dependencies |
2987 | * in order to establish the stream ordering required to reuse |
2988 | * a piece of memory released by cuMemFreeAsync (default enabled). |
2989 | */ |
2990 | CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, |
2991 | |
2992 | /** |
2993 | * (value type = cuuint64_t) |
2994 | * Amount of reserved memory in bytes to hold onto before trying |
2995 | * to release memory back to the OS. When more than the release |
2996 | * threshold bytes of memory are held by the memory pool, the |
2997 | * allocator will try to release memory back to the OS on the |
2998 | * next call to stream, event or context synchronize. (default 0) |
2999 | */ |
3000 | CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, |
3001 | |
3002 | /** |
3003 | * (value type = cuuint64_t) |
3004 | * Amount of backing memory currently allocated for the mempool. |
3005 | */ |
3006 | CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, |
3007 | |
3008 | /** |
3009 | * (value type = cuuint64_t) |
3010 | * High watermark of backing memory allocated for the mempool since the |
3011 | * last time it was reset. High watermark can only be reset to zero. |
3012 | */ |
3013 | CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, |
3014 | |
3015 | /** |
3016 | * (value type = cuuint64_t) |
3017 | * Amount of memory from the pool that is currently in use by the application. |
3018 | */ |
3019 | CU_MEMPOOL_ATTR_USED_MEM_CURRENT, |
3020 | |
3021 | /** |
3022 | * (value type = cuuint64_t) |
3023 | * High watermark of the amount of memory from the pool that was in use by the application since |
3024 | * the last time it was reset. High watermark can only be reset to zero. |
3025 | */ |
3026 | CU_MEMPOOL_ATTR_USED_MEM_HIGH |
3027 | } CUmemPool_attribute; |
3028 | |
3029 | /** |
3030 | * Specifies the properties of allocations made from the pool. |
3031 | */ |
3032 | typedef struct CUmemPoolProps_st { |
3033 | CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as ::CU_MEM_ALLOCATION_TYPE_PINNED */ |
3034 | CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ |
3035 | CUmemLocation location; /**< Location where allocations should reside. */ |
3036 | /** |
3037 | * Windows-specific LPSECURITYATTRIBUTES required when |
3038 | * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines |
3039 | * the scope of which exported allocations may be transferred to other |
3040 | * processes. In all other cases, this field is required to be zero. |
3041 | */ |
3042 | void *win32SecurityAttributes; |
3043 | unsigned char reserved[64]; /**< reserved for future use, must be 0 */ |
3044 | } CUmemPoolProps_v1; |
3045 | typedef CUmemPoolProps_v1 CUmemPoolProps; |
3046 | |
3047 | /** |
3048 | * Opaque data for exporting a pool allocation (a fixed 64-byte blob) |
3049 | */ |
3050 | typedef struct CUmemPoolPtrExportData_st { |
3051 | unsigned char reserved[64]; |
3052 | } CUmemPoolPtrExportData_v1; |
3053 | typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; |
3054 | |
3055 | /** |
3056 | * Memory allocation node parameters |
3057 | */ |
3058 | typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { |
3059 | /** |
3060 | * in: location where the allocation should reside (specified in poolProps.location). |
3061 | * poolProps.handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. |
3062 | */ |
3063 | CUmemPoolProps poolProps; |
3064 | const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ |
3065 | size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */ |
3066 | size_t bytesize; /**< in: size in bytes of the requested allocation */ |
3067 | CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ |
3068 | } CUDA_MEM_ALLOC_NODE_PARAMS; |
3069 | |
3070 | typedef enum CUgraphMem_attribute_enum { |
3071 | /** |
3072 | * (value type = cuuint64_t) |
3073 | * Amount of memory, in bytes, currently associated with graphs |
3074 | */ |
3075 | CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, |
3076 | |
3077 | /** |
3078 | * (value type = cuuint64_t) |
3079 | * High watermark of memory, in bytes, associated with graphs since the |
3080 | * last time it was reset. High watermark can only be reset to zero. |
3081 | */ |
3082 | CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, |
3083 | |
3084 | /** |
3085 | * (value type = cuuint64_t) |
3086 | * Amount of memory, in bytes, currently allocated for use by |
3087 | * the CUDA graphs asynchronous allocator. |
3088 | */ |
3089 | CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, |
3090 | |
3091 | /** |
3092 | * (value type = cuuint64_t) |
3093 | * High watermark of memory, in bytes, currently allocated for use by |
3094 | * the CUDA graphs asynchronous allocator. |
3095 | */ |
3096 | CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH |
3097 | } CUgraphMem_attribute; /**< Attributes describing memory used and reserved for CUDA graphs; every value is a cuuint64_t */ |
3098 | |
3099 | /** |
3100 | * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only |
3101 | * waits for prior work in the stream corresponding to that GPU to complete before the |
3102 | * kernel begins execution. |
3103 | */ |
3104 | #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 |
3105 | |
3106 | /** |
3107 | * If set, any subsequent work pushed in a stream that participated in a call to |
3108 | * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on |
3109 | * the GPU corresponding to that stream to complete before it begins execution. |
3110 | */ |
3111 | #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 |
3112 | |
3113 | /** |
3114 | * If set, the CUDA array is a collection of layers, where each layer is either a 1D |
3115 | * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number |
3116 | * of layers, not the depth of a 3D array. |
3117 | */ |
3118 | #define CUDA_ARRAY3D_LAYERED 0x01 |
3119 | |
3120 | /** |
3121 | * Deprecated, use ::CUDA_ARRAY3D_LAYERED (which has the same value, 0x01) |
3122 | */ |
3123 | #define CUDA_ARRAY3D_2DARRAY 0x01 |
3124 | |
3125 | /** |
3126 | * This flag must be set in order to bind a surface reference |
3127 | * to the CUDA array |
3128 | */ |
3129 | #define CUDA_ARRAY3D_SURFACE_LDST 0x02 |
3130 | |
3131 | /** |
3132 | * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The |
3133 | * width of such a CUDA array must be equal to its height, and Depth must be six. |
3134 | * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps |
3135 | * and Depth must be a multiple of six. |
3136 | */ |
3137 | #define CUDA_ARRAY3D_CUBEMAP 0x04 |
3138 | |
3139 | /** |
3140 | * This flag must be set in order to perform texture gather operations |
3141 | * on a CUDA array. |
3142 | */ |
3143 | #define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 |
3144 | |
3145 | /** |
3146 | * This flag if set indicates that the CUDA |
3147 | * array is a DEPTH_TEXTURE. |
3148 | */ |
3149 | #define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 |
3150 | |
3151 | /** |
3152 | * This flag indicates that the CUDA array may be bound as a color target |
3153 | * in an external graphics API |
3154 | */ |
3155 | #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 |
3156 | |
3157 | /** |
3158 | * This flag if set indicates that the CUDA array or CUDA mipmapped array |
3159 | * is a sparse CUDA array or CUDA mipmapped array respectively |
3160 | */ |
3161 | #define CUDA_ARRAY3D_SPARSE 0x40 |
3162 | |
3163 | /** |
3164 | * Override the texref format with a format inferred from the array. |
3165 | * Flag for ::cuTexRefSetArray() |
3166 | */ |
3167 | #define CU_TRSA_OVERRIDE_FORMAT 0x01 |
3168 | |
3169 | /** |
3170 | * Read the texture as integers rather than promoting the values to floats |
3171 | * in the range [0,1]. |
3172 | * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
3173 | */ |
3174 | #define CU_TRSF_READ_AS_INTEGER 0x01 |
3175 | |
3176 | /** |
3177 | * Use normalized texture coordinates in the range [0,1) instead of [0,dim). |
3178 | * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
3179 | */ |
3180 | #define CU_TRSF_NORMALIZED_COORDINATES 0x02 |
3181 | |
3182 | /** |
3183 | * Perform sRGB->linear conversion during texture read. |
3184 | * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
3185 | */ |
3186 | #define CU_TRSF_SRGB 0x10 |
3187 | |
3188 | /** |
3189 | * Disable any trilinear filtering optimizations. |
3190 | * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
3191 | */ |
3192 | #define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 |
3193 | |
3194 | /** |
3195 | * End of array terminator for the \p extra parameter to |
3196 | * ::cuLaunchKernel |
3197 | */ |
3198 | #define CU_LAUNCH_PARAM_END ((void*)0x00) |
3199 | |
3200 | /** |
3201 | * Indicator that the next value in the \p extra parameter to |
3202 | * ::cuLaunchKernel will be a pointer to a buffer containing all kernel |
3203 | * parameters used for launching kernel \p f. This buffer needs to |
3204 | * honor all alignment/padding requirements of the individual parameters. |
3205 | * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the |
3206 | * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no |
3207 | * effect. |
3208 | */ |
3209 | #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) |
3210 | |
3211 | /** |
3212 | * Indicator that the next value in the \p extra parameter to |
3213 | * ::cuLaunchKernel will be a pointer to a size_t which contains the |
3214 | * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. |
3215 | * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified |
3216 | * in the \p extra array if the value associated with |
3217 | * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. |
3218 | */ |
3219 | #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) |
3220 | |
3221 | /** |
3222 | * For texture references loaded into the module, use default texunit from |
3223 | * texture reference. |
3224 | */ |
3225 | #define CU_PARAM_TR_DEFAULT -1 |
3226 | |
3227 | /** |
3228 | * Device that represents the CPU |
3229 | */ |
3230 | #define CU_DEVICE_CPU ((CUdevice)-1) |
3231 | |
3232 | /** |
3233 | * Device that represents an invalid device |
3234 | */ |
3235 | #define CU_DEVICE_INVALID ((CUdevice)-2) |
3236 | |
3237 | /** |
3238 | * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS (each enumerator is a single bit) |
3239 | */ |
3240 | typedef enum CUflushGPUDirectRDMAWritesOptions_enum { |
3241 | CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ |
3242 | CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ |
3243 | } CUflushGPUDirectRDMAWritesOptions; |
3244 | |
3245 | /** |
3246 | * Platform-native ordering for GPUDirect RDMA writes |
3247 | */ |
3248 | typedef enum CUGPUDirectRDMAWritesOrdering_enum { |
3249 | CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ |
3250 | CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ |
3251 | CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ |
3252 | } CUGPUDirectRDMAWritesOrdering; |
3253 | |
3254 | /** |
3255 | * The scopes for ::cuFlushGPUDirectRDMAWrites() |
3256 | */ |
3257 | typedef enum CUflushGPUDirectRDMAWritesScope_enum { |
3258 | CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ |
3259 | CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ |
3260 | } CUflushGPUDirectRDMAWritesScope; |
3261 | |
3262 | /** |
3263 | * The targets for ::cuFlushGPUDirectRDMAWrites() |
3264 | */ |
3265 | typedef enum CUflushGPUDirectRDMAWritesTarget_enum { |
3266 | CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ |
3267 | } CUflushGPUDirectRDMAWritesTarget; |
3268 | |
3269 | /** |
3270 | * The additional write options for ::cuGraphDebugDotPrint |
3271 | */ |
3272 | typedef enum CUgraphDebugDot_flags_enum { |
3273 | CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /**< Output all debug data as if every debug flag is enabled */ |
3274 | CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /**< Use CUDA Runtime structures for output */ |
3275 | CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /**< Adds CUDA_KERNEL_NODE_PARAMS values to output */ |
3276 | CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /**< Adds CUDA_MEMCPY3D values to output */ |
3277 | CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /**< Adds CUDA_MEMSET_NODE_PARAMS values to output */ |
3278 | CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /**< Adds CUDA_HOST_NODE_PARAMS values to output */ |
3279 | CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /**< Adds CUevent handle from record and wait nodes to output */ |
3280 | CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /**< Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ |
3281 | CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /**< Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ |
3282 | CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /**< Adds CUkernelNodeAttrValue values to output */ |
3283 | CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /**< Adds node handles and every kernel function handle to output */ |
3284 | CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /**< Adds memory alloc node parameters to output */ |
3285 | CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /**< Adds memory free node parameters to output */ |
3286 | } CUgraphDebugDot_flags; |
3287 | |
3288 | /** |
3289 | * Flags for user objects for graphs |
3290 | */ |
3291 | typedef enum CUuserObject_flags_enum { |
3292 | CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates that the destructor execution is not synchronized by any CUDA handle. */ |
3293 | } CUuserObject_flags; |
3294 | |
3295 | /** |
3296 | * Flags for retaining user object references for graphs |
3297 | */ |
3298 | typedef enum CUuserObjectRetain_flags_enum { |
3299 | CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ |
3300 | } CUuserObjectRetain_flags; |
3301 | |
3302 | /** |
3303 | * Flags for instantiating a graph |
3304 | */ |
3305 | typedef enum CUgraphInstantiate_flags_enum { |
3306 | CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ |
3307 | } CUgraphInstantiate_flags; |
3308 | |
3309 | /** @} */ /* END CUDA_TYPES */ |
3310 | |
3311 | #if defined(__GNUC__) |
3312 | #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) |
3313 | #pragma GCC visibility push(default) |
3314 | #endif /* __CUDA_API_PUSH_VISIBILITY_DEFAULT */ |
3315 | #endif /* __GNUC__ */ |
3316 | |
3317 | #ifdef _WIN32 |
3318 | #define CUDAAPI __stdcall |
3319 | #else |
3320 | #define CUDAAPI |
3321 | #endif /* _WIN32 */ |
3322 | |
3323 | /** |
3324 | * \defgroup CUDA_ERROR Error Handling |
3325 | * |
3326 | * ___MANBRIEF___ error handling functions of the low-level CUDA driver API |
3327 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
3328 | * |
3329 | * This section describes the error handling functions of the low-level CUDA |
3330 | * driver application programming interface. |
3331 | * |
3332 | * @{ |
3333 | */ |
3334 | |
3335 | /** |
3336 | * \brief Gets the string description of an error code |
3337 | * |
3338 | * Sets \p *pStr to the address of a NULL-terminated string description |
3339 | * of the error code \p error. |
3340 | * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE |
3341 | * will be returned and \p *pStr will be set to the NULL address. |
3342 | * |
3343 | * \param error - Error code to convert to string |
3344 | * \param pStr - Address of the string pointer; \p *pStr receives the description, or NULL if \p error is not recognized. |
3345 | * |
3346 | * \return |
3347 | * ::CUDA_SUCCESS, |
3348 | * ::CUDA_ERROR_INVALID_VALUE |
3349 | * |
3350 | * \sa |
3351 | * ::CUresult, |
3352 | * ::cudaGetErrorString |
3353 | */ |
3354 | CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); |
3355 | |
3356 | /** |
3357 | * \brief Gets the string representation of an error code enum name |
3358 | * |
3359 | * Sets \p *pStr to the address of a NULL-terminated string representation |
3360 | * of the name of the enum error code \p error. |
3361 | * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE |
3362 | * will be returned and \p *pStr will be set to the NULL address. |
3363 | * |
3364 | * \param error - Error code to convert to string |
3365 | * \param pStr - Address of the string pointer; \p *pStr receives the enum name, or NULL if \p error is not recognized. |
3366 | * |
3367 | * \return |
3368 | * ::CUDA_SUCCESS, |
3369 | * ::CUDA_ERROR_INVALID_VALUE |
3370 | * |
3371 | * \sa |
3372 | * ::CUresult, |
3373 | * ::cudaGetErrorName |
3374 | */ |
3375 | CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); |
3376 | |
3377 | /** @} */ /* END CUDA_ERROR */ |
3378 | |
3379 | /** |
3380 | * \defgroup CUDA_INITIALIZE Initialization |
3381 | * |
3382 | * ___MANBRIEF___ initialization functions of the low-level CUDA driver API |
3383 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
3384 | * |
3385 | * This section describes the initialization functions of the low-level CUDA |
3386 | * driver application programming interface. |
3387 | * |
3388 | * @{ |
3389 | */ |
3390 | |
3391 | /** |
3392 | * \brief Initialize the CUDA driver API |
3393 | * |
3394 | * Initializes the driver API and must be called before any other function from |
3395 | * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() |
3396 | * has not been called, any function from the driver API will return |
3397 | * ::CUDA_ERROR_NOT_INITIALIZED. |
3398 | * |
3399 | * \param Flags - Initialization flag for CUDA. |
3400 | * |
3401 | * \return |
3402 | * ::CUDA_SUCCESS, |
3403 | * ::CUDA_ERROR_INVALID_VALUE, |
3404 | * ::CUDA_ERROR_INVALID_DEVICE, |
3405 | * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, |
3406 | * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE |
3407 | * \notefnerr |
3408 | */ |
3409 | CUresult CUDAAPI cuInit(unsigned int Flags); |
3410 | |
3411 | /** @} */ /* END CUDA_INITIALIZE */ |
3412 | |
3413 | /** |
3414 | * \defgroup CUDA_VERSION Version Management |
3415 | * |
3416 | * ___MANBRIEF___ version management functions of the low-level CUDA driver |
3417 | * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
3418 | * |
3419 | * This section describes the version management functions of the low-level |
3420 | * CUDA driver application programming interface. |
3421 | * |
3422 | * @{ |
3423 | */ |
3424 | |
3425 | /** |
3426 | * \brief Returns the latest CUDA version supported by driver |
3427 | * |
3428 | * Returns in \p *driverVersion the version of CUDA supported by |
3429 | * the driver. The version is returned as |
3430 | * (1000 × major + 10 × minor). For example, CUDA 9.2 |
3431 | * would be represented by 9020. |
3432 | * |
3433 | * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if |
3434 | * \p driverVersion is NULL. |
3435 | * |
3436 | * \param driverVersion - Returns the CUDA driver version |
3437 | * |
3438 | * \return |
3439 | * ::CUDA_SUCCESS, |
3440 | * ::CUDA_ERROR_INVALID_VALUE |
3441 | * \notefnerr |
3442 | * |
3443 | * \sa |
3444 | * ::cudaDriverGetVersion, |
3445 | * ::cudaRuntimeGetVersion |
3446 | */ |
3447 | CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); |
3448 | |
3449 | /** @} */ /* END CUDA_VERSION */ |
3450 | |
3451 | /** |
3452 | * \defgroup CUDA_DEVICE Device Management |
3453 | * |
3454 | * ___MANBRIEF___ device management functions of the low-level CUDA driver API |
3455 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
3456 | * |
3457 | * This section describes the device management functions of the low-level |
3458 | * CUDA driver application programming interface. |
3459 | * |
3460 | * @{ |
3461 | */ |
3462 | |
3463 | /** |
3464 | * \brief Returns a handle to a compute device |
3465 | * |
3466 | * Returns in \p *device a device handle given an ordinal in the range <b>[0, |
3467 | * ::cuDeviceGetCount()-1]</b>. |
3468 | * |
3469 | * \param device - Returned device handle |
3470 | * \param ordinal - Device number to get handle for |
3471 | * |
3472 | * \return |
3473 | * ::CUDA_SUCCESS, |
3474 | * ::CUDA_ERROR_DEINITIALIZED, |
3475 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3476 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3477 | * ::CUDA_ERROR_INVALID_VALUE, |
3478 | * ::CUDA_ERROR_INVALID_DEVICE |
3479 | * \notefnerr |
3480 | * |
3481 | * \sa |
3482 | * ::cuDeviceGetAttribute, |
3483 | * ::cuDeviceGetCount, |
3484 | * ::cuDeviceGetName, |
3485 | * ::cuDeviceGetUuid, |
3486 | * ::cuDeviceGetLuid, |
3487 | * ::cuDeviceTotalMem, |
3488 | * ::cuDeviceGetExecAffinitySupport |
3489 | */ |
3490 | CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); |
3491 | |
3492 | /** |
3493 | * \brief Returns the number of compute-capable devices |
3494 | * |
3495 | * Returns in \p *count the number of devices with compute capability greater |
3496 | * than or equal to 2.0 that are available for execution. If there is no such |
3497 | * device, ::cuDeviceGetCount() returns 0. |
3498 | * |
3499 | * \param count - Returned number of compute-capable devices |
3500 | * |
3501 | * \return |
3502 | * ::CUDA_SUCCESS, |
3503 | * ::CUDA_ERROR_DEINITIALIZED, |
3504 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3505 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3506 | * ::CUDA_ERROR_INVALID_VALUE |
3507 | * \notefnerr |
3508 | * |
3509 | * \sa |
3510 | * ::cuDeviceGetAttribute, |
3511 | * ::cuDeviceGetName, |
3512 | * ::cuDeviceGetUuid, |
3513 | * ::cuDeviceGetLuid, |
3514 | * ::cuDeviceGet, |
3515 | * ::cuDeviceTotalMem, |
3516 | * ::cuDeviceGetExecAffinitySupport, |
3517 | * ::cudaGetDeviceCount |
3518 | */ |
3519 | CUresult CUDAAPI cuDeviceGetCount(int *count); |
3520 | |
3521 | /** |
3522 | * \brief Returns an identifier string for the device |
3523 | * |
3524 | * Returns an ASCII string identifying the device \p dev in the NULL-terminated |
3525 | * string pointed to by \p name. \p len specifies the maximum length of the |
3526 | * string that may be returned. |
3527 | * |
3528 | * \param name - Returned identifier string for the device |
3529 | * \param len - Maximum length of string to store in \p name |
3530 | * \param dev - Device to get identifier string for |
3531 | * |
3532 | * \return |
3533 | * ::CUDA_SUCCESS, |
3534 | * ::CUDA_ERROR_DEINITIALIZED, |
3535 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3536 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3537 | * ::CUDA_ERROR_INVALID_VALUE, |
3538 | * ::CUDA_ERROR_INVALID_DEVICE |
3539 | * \notefnerr |
3540 | * |
3541 | * \sa |
3542 | * ::cuDeviceGetAttribute, |
3543 | * ::cuDeviceGetUuid, |
3544 | * ::cuDeviceGetLuid, |
3545 | * ::cuDeviceGetCount, |
3546 | * ::cuDeviceGet, |
3547 | * ::cuDeviceTotalMem, |
3548 | * ::cuDeviceGetExecAffinitySupport, |
3549 | * ::cudaGetDeviceProperties |
3550 | */ |
3551 | CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); |
3552 | |
3553 | /** |
3554 | * \brief Return a UUID for the device |
3555 | * |
3556 | * Note there is a later version of this API, ::cuDeviceGetUuid_v2, which will |
3557 | * supplant this version in 12.0; this version is retained for minor version compatibility. |
3558 | * |
3559 | * Returns 16 octets identifying the device \p dev in the structure |
3560 | * pointed to by \p uuid. |
3561 | * |
3562 | * \param uuid - Returned UUID |
3563 | * \param dev - Device to get the UUID for |
3564 | * |
3565 | * \return |
3566 | * ::CUDA_SUCCESS, |
3567 | * ::CUDA_ERROR_DEINITIALIZED, |
3568 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3569 | * ::CUDA_ERROR_INVALID_VALUE, |
3570 | * ::CUDA_ERROR_INVALID_DEVICE |
3571 | * \notefnerr |
3572 | * |
3573 | * \sa |
3574 | * ::cuDeviceGetUuid_v2, |
3575 | * ::cuDeviceGetAttribute, |
3576 | * ::cuDeviceGetCount, |
3577 | * ::cuDeviceGetName, |
3578 | * ::cuDeviceGetLuid, |
3579 | * ::cuDeviceGet, |
3580 | * ::cuDeviceTotalMem, |
3581 | * ::cuDeviceGetExecAffinitySupport, |
3582 | * ::cudaGetDeviceProperties |
3583 | */ |
3584 | CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); |
3585 | |
3586 | /** |
3587 | * \brief Return a UUID for the device (11.4+) |
3588 | * |
3589 | * Returns 16 octets identifying the device \p dev in the structure |
3590 | * pointed to by \p uuid. If the device is in MIG mode, returns its |
3591 | * MIG UUID which uniquely identifies the subscribed MIG compute instance. |
3592 | * |
3593 | * \param uuid - Returned UUID |
3594 | * \param dev - Device to get the UUID for |
3595 | * |
3596 | * \return |
3597 | * ::CUDA_SUCCESS, |
3598 | * ::CUDA_ERROR_DEINITIALIZED, |
3599 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3600 | * ::CUDA_ERROR_INVALID_VALUE, |
3601 | * ::CUDA_ERROR_INVALID_DEVICE |
3602 | * \notefnerr |
3603 | * |
3604 | * \sa |
3605 | * ::cuDeviceGetAttribute, |
3606 | * ::cuDeviceGetCount, |
3607 | * ::cuDeviceGetName, |
3608 | * ::cuDeviceGetLuid, |
3609 | * ::cuDeviceGet, |
3610 | * ::cuDeviceTotalMem, |
3611 | * ::cudaGetDeviceProperties |
3612 | */ |
3613 | CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); |
3614 | |
3615 | /** |
3616 | * \brief Return an LUID and device node mask for the device |
3617 | * |
3618 | * Return identifying information (\p luid and \p deviceNodeMask) to allow |
3619 | * matching the device with graphics APIs. |
3620 | * |
3621 | * \param luid - Returned LUID |
3622 | * \param deviceNodeMask - Returned device node mask |
3623 | * \param dev - Device to get the LUID and device node mask for |
3624 | * |
3625 | * \return |
3626 | * ::CUDA_SUCCESS, |
3627 | * ::CUDA_ERROR_DEINITIALIZED, |
3628 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3629 | * ::CUDA_ERROR_INVALID_VALUE, |
3630 | * ::CUDA_ERROR_INVALID_DEVICE |
3631 | * \notefnerr |
3632 | * |
3633 | * \sa |
3634 | * ::cuDeviceGetAttribute, |
3635 | * ::cuDeviceGetCount, |
3636 | * ::cuDeviceGetName, |
3637 | * ::cuDeviceGet, |
3638 | * ::cuDeviceTotalMem, |
3639 | * ::cuDeviceGetExecAffinitySupport, |
3640 | * ::cudaGetDeviceProperties |
3641 | */ |
3642 | CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); |
3643 | |
3644 | /** |
3645 | * \brief Returns the total amount of memory on the device |
3646 | * |
3647 | * Returns in \p *bytes the total amount of memory available on the device |
3648 | * \p dev in bytes. |
3649 | * |
3650 | * \param bytes - Returned memory available on device in bytes |
3651 | * \param dev - Device handle |
3652 | * |
3653 | * \return |
3654 | * ::CUDA_SUCCESS, |
3655 | * ::CUDA_ERROR_DEINITIALIZED, |
3656 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3657 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3658 | * ::CUDA_ERROR_INVALID_VALUE, |
3659 | * ::CUDA_ERROR_INVALID_DEVICE |
3660 | * \notefnerr |
3661 | * |
3662 | * \sa |
3663 | * ::cuDeviceGetAttribute, |
3664 | * ::cuDeviceGetCount, |
3665 | * ::cuDeviceGetName, |
3666 | * ::cuDeviceGetUuid, |
3667 | * ::cuDeviceGet, |
3668 | * ::cuDeviceGetExecAffinitySupport, |
3669 | * ::cudaMemGetInfo |
3670 | */ |
3671 | CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); |
3672 | |
3673 | /** |
3674 | * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. |
3675 | * |
3676 | * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture |
3677 | * for given \p format and \p numChannels. |
3678 | * |
3679 | * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. |
3680 | * \param format - Texture format. |
3681 | * \param numChannels - Number of channels per texture element. |
3682 | * \param dev - Device handle. |
3683 | * |
3684 | * \return |
3685 | * ::CUDA_SUCCESS, |
3686 | * ::CUDA_ERROR_DEINITIALIZED, |
3687 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3688 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3689 | * ::CUDA_ERROR_INVALID_VALUE, |
3690 | * ::CUDA_ERROR_INVALID_DEVICE |
3691 | * \notefnerr |
3692 | * |
3693 | * \sa |
3694 | * ::cuDeviceGetAttribute, |
3695 | * ::cuDeviceGetCount, |
3696 | * ::cuDeviceGetName, |
3697 | * ::cuDeviceGetUuid, |
3698 | * ::cuDeviceGet, |
3699 | * ::cudaMemGetInfo, |
3700 | * ::cuDeviceTotalMem |
3701 | */ |
3702 | CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); |
3703 | |
3704 | /** |
3705 | * \brief Returns information about the device |
3706 | * |
3707 | * Returns in \p *pi the integer value of the attribute \p attrib on device |
3708 | * \p dev. The supported attributes are: |
3709 | * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per |
3710 | * block; |
3711 | * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; |
3712 | * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; |
3713 | * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; |
3714 | * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; |
3715 | * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; |
3716 | * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; |
3717 | * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of |
3718 | * shared memory available to a thread block in bytes; |
3719 | * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for |
3720 | * __constant__ variables in a CUDA C kernel in bytes; |
3721 | * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; |
3722 | * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the |
3723 | * memory copy functions that involve memory regions allocated through |
3724 | * ::cuMemAllocPitch(); |
3725 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D |
3726 | * texture width; |
3727 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width |
3728 | * for a 1D texture bound to linear memory; |
3729 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum |
3730 | * mipmapped 1D texture width; |
3731 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D |
3732 | * texture width; |
3733 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D |
3734 | * texture height; |
3735 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width |
3736 | * for a 2D texture bound to linear memory; |
3737 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height |
3738 | * for a 2D texture bound to linear memory; |
3739 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch |
3740 | * in bytes for a 2D texture bound to linear memory; |
3741 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum |
3742 | * mipmapped 2D texture width; |
3743 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum |
3744 | * mipmapped 2D texture height; |
3745 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D |
3746 | * texture width; |
3747 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D |
3748 | * texture height; |
3749 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D |
3750 | * texture depth; |
3751 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: |
3752 | * Alternate maximum 3D texture width, 0 if no alternate |
3753 | * maximum 3D texture size is supported; |
3754 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: |
3755 | * Alternate maximum 3D texture height, 0 if no alternate |
3756 | * maximum 3D texture size is supported; |
3757 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: |
3758 | * Alternate maximum 3D texture depth, 0 if no alternate |
3759 | * maximum 3D texture size is supported; |
3760 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: |
3761 | * Maximum cubemap texture width or height; |
3762 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: |
3763 | * Maximum 1D layered texture width; |
3764 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: |
3765 | * Maximum layers in a 1D layered texture; |
3766 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: |
3767 | * Maximum 2D layered texture width; |
3768 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: |
3769 | * Maximum 2D layered texture height; |
3770 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: |
3771 | * Maximum layers in a 2D layered texture; |
3772 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: |
3773 | * Maximum cubemap layered texture width or height; |
3774 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: |
3775 | * Maximum layers in a cubemap layered texture; |
3776 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: |
3777 | * Maximum 1D surface width; |
3778 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: |
3779 | * Maximum 2D surface width; |
3780 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: |
3781 | * Maximum 2D surface height; |
3782 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: |
3783 | * Maximum 3D surface width; |
3784 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: |
3785 | * Maximum 3D surface height; |
3786 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: |
3787 | * Maximum 3D surface depth; |
3788 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: |
3789 | * Maximum 1D layered surface width; |
3790 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: |
3791 | * Maximum layers in a 1D layered surface; |
3792 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: |
3793 | * Maximum 2D layered surface width; |
3794 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: |
3795 | * Maximum 2D layered surface height; |
3796 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: |
3797 | * Maximum layers in a 2D layered surface; |
3798 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: |
3799 | * Maximum cubemap surface width; |
3800 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: |
3801 | * Maximum cubemap layered surface width; |
3802 | * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: |
3803 | * Maximum layers in a cubemap layered surface; |
3804 | * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit |
3805 | * registers available to a thread block; |
3806 | * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; |
3807 | * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture |
3808 | * base addresses aligned to ::textureAlign bytes do not need an offset |
3809 | * applied to texture fetches; |
3810 | * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement |
3811 | * for 2D texture references bound to pitched memory; |
3812 | * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy |
3813 | * memory between host and device while executing a kernel, or 0 if not; |
3814 | * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on |
3815 | * the device; |
3816 | * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit |
3817 | * for kernels executed on the device, or 0 if not; |
3818 | * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the |
3819 | * memory subsystem, or 0 if not; |
3820 | * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host |
3821 | * memory into the CUDA address space, or 0 if not; |
3822 | * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently |
3823 | * in. Available modes are as follows: |
3824 | * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and |
3825 | * can have multiple CUDA contexts present at a single time. |
3826 | * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is |
3827 | * prohibited from creating new CUDA contexts. |
3828 | * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device |
3829 | * can have only one context used by a single process at a time. |
3830 | * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports |
3831 | * executing multiple kernels within the same context simultaneously, or 0 if |
3832 | * not. It is not guaranteed that multiple kernels will be resident |
3833 | * on the device concurrently so this feature should not be relied upon for |
3834 | * correctness; |
3835 | * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the |
3836 | * device, 0 if error correction is disabled or not supported by the device; |
3837 | * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; |
3838 | * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier |
3839 | * of the device; |
3840 | * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device; |
3841 | * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC |
3842 | * is only available on Tesla hardware running Windows Vista or later; |
3843 | * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; |
3844 | * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; |
3845 | * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; |
3846 | * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; |
3847 | * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with |
3848 | * the host, or 0 if not; |
3849 | * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; |
3850 | * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; |
3851 | * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals |
3852 | * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; |
3853 | * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals |
3854 | * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; |
3855 | * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of |
3856 | * shared memory available to a multiprocessor in bytes; this amount is shared |
3857 | * by all thread blocks simultaneously resident on a multiprocessor; |
3858 | * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit |
3859 | * registers available to a multiprocessor; this number is shared by all thread |
3860 | * blocks simultaneously resident on a multiprocessor; |
3861 | * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory |
3862 | * on this system, 0 if allocating managed memory is not supported by the device on this system. |
3863 | * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. |
3864 | * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices |
3865 | * associated with the same board. Devices on the same multi-GPU board will share the same identifier. |
3866 | * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host |
3867 | * supports native atomic operations. |
3868 | * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance |
3869 | * (in floating-point operations per second) to double precision performance. |
3870 | * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing |
3871 | * pageable memory without calling cudaHostRegister on it. |
3872 | * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory |
3873 | * concurrently with the CPU. |
3874 | * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. |
3875 | * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered |
3876 | * memory at the same virtual address as the CPU. |
3877 | * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size |
3878 | * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. |
3879 | * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES |
3880 | * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's |
3881 | * page tables. |
3882 | * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. |
3883 | * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs |
3884 | * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
3885 | * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
3886 | * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
3887 | * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes. |
3888 | * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes. |
3889 | * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor. |
3890 | * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate |
3891 | * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes. |
3892 | * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU |
3893 | * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs |
3894 | * |
3895 | * \param pi - Returned device attribute value |
3896 | * \param attrib - Device attribute to query |
3897 | * \param dev - Device handle |
3898 | * |
3899 | * \return |
3900 | * ::CUDA_SUCCESS, |
3901 | * ::CUDA_ERROR_DEINITIALIZED, |
3902 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3903 | * ::CUDA_ERROR_INVALID_CONTEXT, |
3904 | * ::CUDA_ERROR_INVALID_VALUE, |
3905 | * ::CUDA_ERROR_INVALID_DEVICE |
3906 | * \notefnerr |
3907 | * |
3908 | * \sa |
3909 | * ::cuDeviceGetCount, |
3910 | * ::cuDeviceGetName, |
3911 | * ::cuDeviceGetUuid, |
3912 | * ::cuDeviceGet, |
3913 | * ::cuDeviceTotalMem, |
3914 | * ::cuDeviceGetExecAffinitySupport, |
3915 | * ::cudaDeviceGetAttribute, |
3916 | * ::cudaGetDeviceProperties |
3917 | */ |
3918 | CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); |
3919 | |
3920 | /** |
3921 | * \brief Return NvSciSync attributes that this device can support. |
3922 | * |
3923 | * Returns in \p nvSciSyncAttrList the properties of NvSciSync that |
3924 | * this CUDA device, \p dev, can support. The returned \p nvSciSyncAttrList |
3925 | * can be used to create an NvSciSync object that matches this device's capabilities. |
3926 | * |
3927 | * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is |
3928 | * already set, this API will return ::CUDA_ERROR_INVALID_VALUE. |
3929 | * |
3930 | * Applications should set \p nvSciSyncAttrList to a valid |
3931 | * NvSciSyncAttrList, failing which this API will return |
3932 | * ::CUDA_ERROR_INVALID_HANDLE. |
3933 | * |
3934 | * The \p flags parameter controls how the application intends to use |
3935 | * the NvSciSync created from \p nvSciSyncAttrList. The valid flags are: |
3936 | * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to |
3937 | * signal an NvSciSync on this CUDA device. |
3938 | * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to |
3939 | * wait on an NvSciSync on this CUDA device. |
3940 | * |
3941 | * At least one of these flags must be set, failing which the API |
3942 | * returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal |
3943 | * to one another: a developer may set both flags, which allows both |
3944 | * wait- and signal-specific attributes to be set in the same \p nvSciSyncAttrList. |
3945 | * |
3946 | * \param nvSciSyncAttrList - Returned NvSciSync attributes supported. |
3947 | * \param dev - Valid CUDA device to get NvSciSync attributes for. |
3948 | * \param flags - Flags describing NvSciSync usage. |
3949 | * |
3950 | * \return |
3951 | * |
3952 | * ::CUDA_SUCCESS, |
3953 | * ::CUDA_ERROR_DEINITIALIZED, |
3954 | * ::CUDA_ERROR_NOT_INITIALIZED, |
3955 | * ::CUDA_ERROR_INVALID_VALUE, |
3956 | * ::CUDA_ERROR_INVALID_HANDLE, |
3957 | * ::CUDA_ERROR_INVALID_DEVICE, |
3958 | * ::CUDA_ERROR_NOT_SUPPORTED, |
3959 | * ::CUDA_ERROR_OUT_OF_MEMORY |
3960 | * |
3961 | * \sa |
3962 | * ::cuImportExternalSemaphore, |
3963 | * ::cuDestroyExternalSemaphore, |
3964 | * ::cuSignalExternalSemaphoresAsync, |
3965 | * ::cuWaitExternalSemaphoresAsync |
3966 | */ |
3967 | CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); |
3968 | |
3969 | /** |
3970 | * \brief Sets the current memory pool of a device |
3971 | * |
3972 | * The memory pool must be local to the specified device. |
3973 | * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. |
3974 | * By default, a device's current memory pool is its default memory pool. |
3975 | * |
3976 | * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different |
3977 | * than the one the stream runs on. |
3978 | * |
3979 | * \returns |
3980 | * ::CUDA_SUCCESS, |
3981 | * ::CUDA_ERROR_INVALID_VALUE |
3982 | * |
3983 | * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync |
3984 | */ |
3985 | CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); |
3986 | |
3987 | /** |
3988 | * \brief Gets the current mempool for a device |
3989 | * |
3990 | * Returns the last pool provided to ::cuDeviceSetMemPool for this device |
3991 | * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. |
3992 | * By default the current mempool is the default mempool for a device. |
3993 | * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. |
3994 | * |
3995 | * \returns |
3996 | * ::CUDA_SUCCESS, |
3997 | * ::CUDA_ERROR_INVALID_VALUE |
3998 | * |
3999 | * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool |
4000 | */ |
4001 | CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); |
4002 | |
4003 | /** |
4004 | * \brief Returns the default mempool of a device |
4005 | * |
4006 | * The default mempool of a device contains device memory from that device. |
4007 | * |
4008 | * \return |
4009 | * ::CUDA_SUCCESS, |
4010 | * ::CUDA_ERROR_DEINITIALIZED, |
4011 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4012 | * ::CUDA_ERROR_INVALID_VALUE, |
4013 | * ::CUDA_ERROR_INVALID_DEVICE, |
4014 | * ::CUDA_ERROR_NOT_SUPPORTED |
4015 | * \notefnerr |
4016 | * |
4017 | * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, ::cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate |
4018 | */ |
4019 | CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); |
4020 | |
4021 | /** |
4022 | * \brief Blocks until remote writes are visible to the specified scope |
4023 | * |
4024 | * Blocks until GPUDirect RDMA writes to the target context via mappings |
4025 | * created through APIs like nvidia_p2p_get_pages (see |
4026 | * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are |
4027 | * visible to the specified scope. |
4028 | * |
4029 | * If the scope equals or lies within the scope indicated by |
4030 | * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call |
4031 | * will be a no-op and can be safely omitted for performance. This can be |
4032 | * determined by comparing the numerical values between the two enums, with |
4033 | * smaller scopes having smaller values. |
4034 | * |
4035 | * Users may query support for this API via |
4036 | * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS. |
4037 | * |
4038 | * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget |
4039 | * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope |
4040 | * |
4041 | * \return |
4042 | * ::CUDA_SUCCESS, |
4043 | * ::CUDA_ERROR_DEINITIALIZED, |
4044 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4045 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4046 | * ::CUDA_ERROR_INVALID_VALUE, |
4047 | * \notefnerr |
4048 | * |
4049 | */ |
4050 | CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); |
4051 | |
4052 | /** @} */ /* END CUDA_DEVICE */ |
4053 | |
4054 | /** |
4055 | * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] |
4056 | * |
4057 | * ___MANBRIEF___ deprecated device management functions of the low-level CUDA |
4058 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
4059 | * |
4060 | * This section describes the device management functions of the low-level |
4061 | * CUDA driver application programming interface. |
4062 | * |
4063 | * @{ |
4064 | */ |
4065 | |
4066 | /** |
4067 | * \brief Returns properties for a selected device |
4068 | * |
4069 | * \deprecated |
4070 | * |
4071 | * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). |
4072 | * |
4073 | * Returns in \p *prop the properties of device \p dev. The ::CUdevprop |
4074 | * structure is defined as: |
4075 | * |
4076 | * \code |
4077 | typedef struct CUdevprop_st { |
4078 | int maxThreadsPerBlock; |
4079 | int maxThreadsDim[3]; |
4080 | int maxGridSize[3]; |
4081 | int sharedMemPerBlock; |
4082 | int totalConstantMemory; |
4083 | int SIMDWidth; |
4084 | int memPitch; |
4085 | int regsPerBlock; |
4086 | int clockRate; |
4087 | int textureAlign; |
4088 | } CUdevprop; |
4089 | * \endcode |
4090 | * where: |
4091 | * |
4092 | * - ::maxThreadsPerBlock is the maximum number of threads per block; |
4093 | * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; |
4094 | * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; |
4095 | * - ::sharedMemPerBlock is the total amount of shared memory available per |
4096 | * block in bytes; |
4097 | * - ::totalConstantMemory is the total amount of constant memory available on |
4098 | * the device in bytes; |
4099 | * - ::SIMDWidth is the warp size; |
4100 | * - ::memPitch is the maximum pitch allowed by the memory copy functions that |
4101 | * involve memory regions allocated through ::cuMemAllocPitch(); |
4102 | * - ::regsPerBlock is the total number of registers available per block; |
4103 | * - ::clockRate is the clock frequency in kilohertz; |
4104 | * - ::textureAlign is the alignment requirement; texture base addresses that |
4105 | * are aligned to ::textureAlign bytes do not need an offset applied to |
4106 | * texture fetches. |
4107 | * |
4108 | * \param prop - Returned properties of device |
4109 | * \param dev - Device to get properties for |
4110 | * |
4111 | * \return |
4112 | * ::CUDA_SUCCESS, |
4113 | * ::CUDA_ERROR_DEINITIALIZED, |
4114 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4115 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4116 | * ::CUDA_ERROR_INVALID_VALUE, |
4117 | * ::CUDA_ERROR_INVALID_DEVICE |
4118 | * \notefnerr |
4119 | * |
4120 | * \sa |
4121 | * ::cuDeviceGetAttribute, |
4122 | * ::cuDeviceGetCount, |
4123 | * ::cuDeviceGetName, |
4124 | * ::cuDeviceGetUuid, |
4125 | * ::cuDeviceGet, |
4126 | * ::cuDeviceTotalMem |
4127 | */ |
4128 | __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); |
4129 | |
4130 | /** |
4131 | * \brief Returns the compute capability of the device |
4132 | * |
4133 | * \deprecated |
4134 | * |
4135 | * This function was deprecated as of CUDA 5.0 and its functionality superseded |
4136 | * by ::cuDeviceGetAttribute(). |
4137 | * |
4138 | * Returns in \p *major and \p *minor the major and minor revision numbers that |
4139 | * define the compute capability of the device \p dev. |
4140 | * |
4141 | * \param major - Major revision number |
4142 | * \param minor - Minor revision number |
4143 | * \param dev - Device handle |
4144 | * |
4145 | * \return |
4146 | * ::CUDA_SUCCESS, |
4147 | * ::CUDA_ERROR_DEINITIALIZED, |
4148 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4149 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4150 | * ::CUDA_ERROR_INVALID_VALUE, |
4151 | * ::CUDA_ERROR_INVALID_DEVICE |
4152 | * \notefnerr |
4153 | * |
4154 | * \sa |
4155 | * ::cuDeviceGetAttribute, |
4156 | * ::cuDeviceGetCount, |
4157 | * ::cuDeviceGetName, |
4158 | * ::cuDeviceGetUuid, |
4159 | * ::cuDeviceGet, |
4160 | * ::cuDeviceTotalMem |
4161 | */ |
4162 | __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); |
4163 | |
4164 | /** @} */ /* END CUDA_DEVICE_DEPRECATED */ |
4165 | |
4166 | /** |
4167 | * \defgroup CUDA_PRIMARY_CTX Primary Context Management |
4168 | * |
4169 | * ___MANBRIEF___ primary context management functions of the low-level CUDA driver |
4170 | * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
4171 | * |
4172 | * This section describes the primary context management functions of the low-level |
4173 | * CUDA driver application programming interface. |
4174 | * |
4175 | * The primary context is unique per device and shared with the CUDA runtime API. |
4176 | * These functions allow integration with other libraries using CUDA. |
4177 | * |
4178 | * @{ |
4179 | */ |
4180 | |
4181 | /** |
4182 | * \brief Retain the primary context on the GPU |
4183 | * |
4184 | * Retains the primary context on the device. |
4185 | * Once the user successfully retains the primary context, the primary context |
4186 | * will be active and available to the user until the user releases it |
4187 | * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). |
4188 | * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. |
4189 | * |
4190 | * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN |
4191 | * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function |
4192 | * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to |
4193 | * determine the compute mode of the device. |
4194 | * The <i>nvidia-smi</i> tool can be used to set the compute mode for |
4195 | * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a |
4196 | * -h option to it. |
4197 | * |
4198 | * Please note that the primary context always supports pinned allocations. Other |
4199 | * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). |
4200 | * |
4201 | * \param pctx - Returned context handle of the new context |
4202 | * \param dev - Device for which primary context is requested |
4203 | * |
4204 | * \return |
4205 | * ::CUDA_SUCCESS, |
4206 | * ::CUDA_ERROR_DEINITIALIZED, |
4207 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4208 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4209 | * ::CUDA_ERROR_INVALID_DEVICE, |
4210 | * ::CUDA_ERROR_INVALID_VALUE, |
4211 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
4212 | * ::CUDA_ERROR_UNKNOWN |
4213 | * \notefnerr |
4214 | * |
4215 | * \sa ::cuDevicePrimaryCtxRelease, |
4216 | * ::cuDevicePrimaryCtxSetFlags, |
4217 | * ::cuCtxCreate, |
4218 | * ::cuCtxGetApiVersion, |
4219 | * ::cuCtxGetCacheConfig, |
4220 | * ::cuCtxGetDevice, |
4221 | * ::cuCtxGetFlags, |
4222 | * ::cuCtxGetLimit, |
4223 | * ::cuCtxPopCurrent, |
4224 | * ::cuCtxPushCurrent, |
4225 | * ::cuCtxSetCacheConfig, |
4226 | * ::cuCtxSetLimit, |
4227 | * ::cuCtxSynchronize |
4228 | */ |
4229 | CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); |
4230 | |
4231 | /** |
4232 | * \brief Release the primary context on the GPU |
4233 | * |
4234 | * Releases the primary context interop on the device. |
4235 | * A retained context should always be released once the user is done using |
4236 | * it. The context is automatically reset once the last reference to it is |
4237 | * released. This behavior is different when the primary context was retained |
4238 | * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary |
4239 | * context remains always active. |
4240 | * |
4241 | * Releasing a primary context that has not been previously retained will |
4242 | * fail with ::CUDA_ERROR_INVALID_CONTEXT. |
4243 | * |
4244 | * Please note that unlike ::cuCtxDestroy() this method does not pop the context |
4245 | * from stack in any circumstances. |
4246 | * |
4247 | * \param dev - Device whose primary context is released |
4248 | * |
4249 | * \return |
4250 | * ::CUDA_SUCCESS, |
4251 | * ::CUDA_ERROR_DEINITIALIZED, |
4252 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4253 | * ::CUDA_ERROR_INVALID_DEVICE, |
4254 | * ::CUDA_ERROR_INVALID_CONTEXT |
4255 | * \notefnerr |
4256 | * |
4257 | * \sa ::cuDevicePrimaryCtxRetain, |
4258 | * ::cuCtxDestroy, |
4259 | * ::cuCtxGetApiVersion, |
4260 | * ::cuCtxGetCacheConfig, |
4261 | * ::cuCtxGetDevice, |
4262 | * ::cuCtxGetFlags, |
4263 | * ::cuCtxGetLimit, |
4264 | * ::cuCtxPopCurrent, |
4265 | * ::cuCtxPushCurrent, |
4266 | * ::cuCtxSetCacheConfig, |
4267 | * ::cuCtxSetLimit, |
4268 | * ::cuCtxSynchronize |
4269 | */ |
4270 | CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); |
4271 | |
4272 | /** |
4273 | * \brief Set flags for the primary context |
4274 | * |
4275 | * Sets the flags for the primary context on the device overwriting previously |
4276 | * set ones. |
4277 | * |
4278 | * The three LSBs of the \p flags parameter can be used to control how the OS |
4279 | * thread, which owns the CUDA context at the time of an API call, interacts |
4280 | * with the OS scheduler when waiting for results from the GPU. Only one of |
4281 | * the scheduling flags can be set when creating a context. |
4282 | * |
4283 | * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for |
4284 | * results from the GPU. This can decrease latency when waiting for the GPU, |
4285 | * but may lower the performance of CPU threads if they are performing work in |
4286 | * parallel with the CUDA thread. |
4287 | * |
4288 | * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for |
4289 | * results from the GPU. This can increase latency when waiting for the GPU, |
4290 | * but can increase the performance of CPU threads performing work in parallel |
4291 | * with the GPU. |
4292 | * |
4293 | * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4294 | * synchronization primitive when waiting for the GPU to finish work. |
4295 | * |
4296 | * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4297 | * synchronization primitive when waiting for the GPU to finish work. <br> |
4298 | * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was |
4299 | * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. |
4300 | * |
4301 | * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, |
4302 | * uses a heuristic based on the number of active CUDA contexts in the |
4303 | * process \e C and the number of logical processors in the system \e P. If |
4304 | * \e C > \e P, then CUDA will yield to other OS threads when waiting for |
4305 | * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while |
4306 | * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). |
4307 | * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on |
4308 | * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC |
4309 | * for low-powered devices. |
4310 | * |
4311 | * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory |
4312 | * after resizing local memory for a kernel. This can prevent thrashing by |
4313 | * local memory allocations when launching many kernels with high local |
4314 | * memory usage at the cost of potentially increased memory usage. <br> |
4315 | * <b>Deprecated:</b> This flag is deprecated and the behavior enabled |
4316 | * by this flag is now the default and cannot be disabled. |
4317 | * |
4318 | * \param dev - Device for which the primary context flags are set |
4319 | * \param flags - New flags for the device |
4320 | * |
4321 | * \return |
4322 | * ::CUDA_SUCCESS, |
4323 | * ::CUDA_ERROR_DEINITIALIZED, |
4324 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4325 | * ::CUDA_ERROR_INVALID_DEVICE, |
4326 | * ::CUDA_ERROR_INVALID_VALUE |
4327 | * \notefnerr |
4328 | * |
4329 | * \sa ::cuDevicePrimaryCtxRetain, |
4330 | * ::cuDevicePrimaryCtxGetState, |
4331 | * ::cuCtxCreate, |
4332 | * ::cuCtxGetFlags, |
4333 | * ::cudaSetDeviceFlags |
4334 | */ |
4335 | CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); |
4336 | |
4337 | /** |
4338 | * \brief Get the state of the primary context |
4339 | * |
4340 | * Returns in \p *flags the flags for the primary context of \p dev, and in |
4341 | * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag |
4342 | * values. |
4343 | * |
4344 | * \param dev - Device to get primary context flags for |
4345 | * \param flags - Pointer to store flags |
4346 | * \param active - Pointer to store context state; 0 = inactive, 1 = active |
4347 | * |
4348 | * \return |
4349 | * ::CUDA_SUCCESS, |
4350 | * ::CUDA_ERROR_DEINITIALIZED, |
4351 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4352 | * ::CUDA_ERROR_INVALID_DEVICE, |
4353 | * ::CUDA_ERROR_INVALID_VALUE |
4354 | * \notefnerr |
4355 | * |
4356 | * \sa |
4357 | * ::cuDevicePrimaryCtxSetFlags, |
4358 | * ::cuCtxGetFlags, |
4359 | * ::cudaGetDeviceFlags |
4360 | */ |
4361 | CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); |
4362 | |
4363 | /** |
4364 | * \brief Destroy all allocations and reset all state on the primary context |
4365 | * |
4366 | * Explicitly destroys and cleans up all resources associated with the current |
4367 | * device in the current process. |
4368 | * |
4369 | * Note that it is the responsibility of the calling function to ensure that no |
4370 | * other module in the process is using the device any more. For that reason |
4371 | * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. |
4372 | * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() |
4373 | * even after resetting the device. |
4374 | * Resetting the primary context does not release it, an application that has |
4375 | * retained the primary context should explicitly release its usage. |
4376 | * |
4377 | * \param dev - Device for which primary context is destroyed |
4378 | * |
4379 | * \return |
4380 | * ::CUDA_SUCCESS, |
4381 | * ::CUDA_ERROR_DEINITIALIZED, |
4382 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4383 | * ::CUDA_ERROR_INVALID_DEVICE, |
4384 | * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE |
4385 | * \notefnerr |
4386 | * |
4387 | * \sa ::cuDevicePrimaryCtxRetain, |
4388 | * ::cuDevicePrimaryCtxRelease, |
4389 | * ::cuCtxGetApiVersion, |
4390 | * ::cuCtxGetCacheConfig, |
4391 | * ::cuCtxGetDevice, |
4392 | * ::cuCtxGetFlags, |
4393 | * ::cuCtxGetLimit, |
4394 | * ::cuCtxPopCurrent, |
4395 | * ::cuCtxPushCurrent, |
4396 | * ::cuCtxSetCacheConfig, |
4397 | * ::cuCtxSetLimit, |
4398 | * ::cuCtxSynchronize, |
4399 | * ::cudaDeviceReset |
4400 | */ |
4401 | CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); |
4402 | |
4403 | /** @} */ /* END CUDA_PRIMARY_CTX */ |
4404 | |
4405 | /** |
4406 | * \brief Returns information about the execution affinity support of the device. |
4407 | * |
4408 | * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. |
4409 | * The supported types are: |
4410 | * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, |
4411 | * or 0 if not; |
4412 | * |
4413 | * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not |
4414 | * \param type - Execution affinity type to query |
4415 | * \param dev - Device handle |
4416 | * |
4417 | * \return |
4418 | * ::CUDA_SUCCESS, |
4419 | * ::CUDA_ERROR_DEINITIALIZED, |
4420 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4421 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4422 | * ::CUDA_ERROR_INVALID_VALUE, |
4423 | * ::CUDA_ERROR_INVALID_DEVICE |
4424 | * \notefnerr |
4425 | * |
4426 | * \sa |
4427 | * ::cuDeviceGetAttribute, |
4428 | * ::cuDeviceGetCount, |
4429 | * ::cuDeviceGetName, |
4430 | * ::cuDeviceGetUuid, |
4431 | * ::cuDeviceGet, |
4432 | * ::cuDeviceTotalMem |
4433 | */ |
4434 | CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); |
4435 | |
4436 | /** |
4437 | * \defgroup CUDA_CTX Context Management |
4438 | * |
4439 | * ___MANBRIEF___ context management functions of the low-level CUDA driver |
4440 | * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
4441 | * |
4442 | * This section describes the context management functions of the low-level |
4443 | * CUDA driver application programming interface. |
4444 | * |
4445 | * Please note that some functions are described in |
4446 | * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. |
4447 | * |
4448 | * @{ |
4449 | */ |
4450 | |
4451 | /** |
4452 | * \brief Create a CUDA context |
4453 | * |
4454 | * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. |
4455 | * |
4456 | * Creates a new CUDA context and associates it with the calling thread. The |
4457 | * \p flags parameter is described below. The context is created with a usage |
4458 | * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() |
4459 | * when done using the context. If a context is already current to the thread, |
4460 | * it is supplanted by the newly created context and may be restored by a subsequent |
4461 | * call to ::cuCtxPopCurrent(). |
4462 | * |
4463 | * The three LSBs of the \p flags parameter can be used to control how the OS |
4464 | * thread, which owns the CUDA context at the time of an API call, interacts |
4465 | * with the OS scheduler when waiting for results from the GPU. Only one of |
4466 | * the scheduling flags can be set when creating a context. |
4467 | * |
4468 | * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for |
4469 | * results from the GPU. This can decrease latency when waiting for the GPU, |
4470 | * but may lower the performance of CPU threads if they are performing work in |
4471 | * parallel with the CUDA thread. |
4472 | * |
4473 | * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for |
4474 | * results from the GPU. This can increase latency when waiting for the GPU, |
4475 | * but can increase the performance of CPU threads performing work in parallel |
4476 | * with the GPU. |
4477 | * |
4478 | * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4479 | * synchronization primitive when waiting for the GPU to finish work. |
4480 | * |
4481 | * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4482 | * synchronization primitive when waiting for the GPU to finish work. <br> |
4483 | * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was |
4484 | * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. |
4485 | * |
4486 | * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, |
4487 | * uses a heuristic based on the number of active CUDA contexts in the |
4488 | * process \e C and the number of logical processors in the system \e P. If |
4489 | * \e C > \e P, then CUDA will yield to other OS threads when waiting for |
4490 | * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while |
4491 | * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). |
4492 | * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on |
4493 | * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC |
4494 | * for low-powered devices. |
4495 | * |
4496 | * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. |
4497 | * This flag must be set in order to allocate pinned host memory that is |
4498 | * accessible to the GPU. |
4499 | * |
4500 | * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory |
4501 | * after resizing local memory for a kernel. This can prevent thrashing by |
4502 | * local memory allocations when launching many kernels with high local |
4503 | * memory usage at the cost of potentially increased memory usage. <br> |
4504 | * <b>Deprecated:</b> This flag is deprecated and the behavior enabled |
4505 | * by this flag is now the default and cannot be disabled. |
4506 | * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). |
4507 | * |
4508 | * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of |
4509 | * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() |
4510 | * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the |
4511 | * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set |
4512 | * the compute mode for devices. |
4513 | * Documentation for <i>nvidia-smi</i> can be obtained by passing a |
4514 | * -h option to it. |
4515 | * |
4516 | * \param pctx - Returned context handle of the new context |
4517 | * \param flags - Context creation flags |
4518 | * \param dev - Device to create context on |
4519 | * |
4520 | * \return |
4521 | * ::CUDA_SUCCESS, |
4522 | * ::CUDA_ERROR_DEINITIALIZED, |
4523 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4524 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4525 | * ::CUDA_ERROR_INVALID_DEVICE, |
4526 | * ::CUDA_ERROR_INVALID_VALUE, |
4527 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
4528 | * ::CUDA_ERROR_UNKNOWN |
4529 | * \notefnerr |
4530 | * |
4531 | * \sa ::cuCtxDestroy, |
4532 | * ::cuCtxGetApiVersion, |
4533 | * ::cuCtxGetCacheConfig, |
4534 | * ::cuCtxGetDevice, |
4535 | * ::cuCtxGetFlags, |
4536 | * ::cuCtxGetLimit, |
4537 | * ::cuCtxPopCurrent, |
4538 | * ::cuCtxPushCurrent, |
4539 | * ::cuCtxSetCacheConfig, |
4540 | * ::cuCtxSetLimit, |
4541 | * ::cuCtxSynchronize |
4542 | */ |
4543 | CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); |
4544 | |
4545 | /** |
4546 | * \brief Create a CUDA context with execution affinity |
4547 | * |
4548 | * Creates a new CUDA context with execution affinity and associates it with |
4549 | * the calling thread. The \p paramsArray and \p flags parameters are described below. |
4550 | * The context is created with a usage count of 1 and the caller of ::cuCtxCreate_v3() must |
4551 | * call ::cuCtxDestroy() when done using the context. If a context is already |
4552 | * current to the thread, it is supplanted by the newly created context and may |
4553 | * be restored by a subsequent call to ::cuCtxPopCurrent(). |
4554 | * |
4555 | * The type and the amount of execution resource the context can use is limited by \p paramsArray |
4556 | * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams |
4557 | * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, |
4558 | * the latter execution affinity parameter overrides the former execution affinity parameter. |
4559 | * The supported execution affinity types are: |
4560 | * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion |
4561 | * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally |
4562 | * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution |
4563 | * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute |
4564 | * is only supported under Volta+ MPS. |
4565 | * |
4566 | * The three LSBs of the \p flags parameter can be used to control how the OS |
4567 | * thread, which owns the CUDA context at the time of an API call, interacts |
4568 | * with the OS scheduler when waiting for results from the GPU. Only one of |
4569 | * the scheduling flags can be set when creating a context. |
4570 | * |
4571 | * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for |
4572 | * results from the GPU. This can decrease latency when waiting for the GPU, |
4573 | * but may lower the performance of CPU threads if they are performing work in |
4574 | * parallel with the CUDA thread. |
4575 | * |
4576 | * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for |
4577 | * results from the GPU. This can increase latency when waiting for the GPU, |
4578 | * but can increase the performance of CPU threads performing work in parallel |
4579 | * with the GPU. |
4580 | * |
4581 | * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4582 | * synchronization primitive when waiting for the GPU to finish work. |
4583 | * |
4584 | * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
4585 | * synchronization primitive when waiting for the GPU to finish work. <br> |
4586 | * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was |
4587 | * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. |
4588 | * |
4589 | * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, |
4590 | * uses a heuristic based on the number of active CUDA contexts in the |
4591 | * process \e C and the number of logical processors in the system \e P. If |
4592 | * \e C > \e P, then CUDA will yield to other OS threads when waiting for |
4593 | * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while |
4594 | * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). |
4595 | * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on |
4596 | * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC |
4597 | * for low-powered devices. |
4598 | * |
4599 | * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. |
4600 | * This flag must be set in order to allocate pinned host memory that is |
4601 | * accessible to the GPU. |
4602 | * |
4603 | * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory |
4604 | * after resizing local memory for a kernel. This can prevent thrashing by |
4605 | * local memory allocations when launching many kernels with high local |
4606 | * memory usage at the cost of potentially increased memory usage. <br> |
4607 | * <b>Deprecated:</b> This flag is deprecated and the behavior enabled |
4608 | * by this flag is now the default and cannot be disabled. |
4609 | * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). |
4610 | * |
4611 | * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of |
4612 | * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() |
4613 | * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the |
4614 | * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set |
4615 | * the compute mode for devices. |
4616 | * Documentation for <i>nvidia-smi</i> can be obtained by passing a |
4617 | * -h option to it. |
4618 | * |
4619 | * \param pctx - Returned context handle of the new context |
4620 | * \param paramsArray - Execution affinity parameters |
4621 | * \param numParams - Number of execution affinity parameters |
4622 | * \param flags - Context creation flags |
4623 | * \param dev - Device to create context on |
4624 | * |
4625 | * \return |
4626 | * ::CUDA_SUCCESS, |
4627 | * ::CUDA_ERROR_DEINITIALIZED, |
4628 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4629 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4630 | * ::CUDA_ERROR_INVALID_DEVICE, |
4631 | * ::CUDA_ERROR_INVALID_VALUE, |
4632 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
4633 | * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, |
4634 | * ::CUDA_ERROR_UNKNOWN |
4635 | * \notefnerr |
4636 | * |
4637 | * \sa ::cuCtxDestroy, |
4638 | * ::cuCtxGetApiVersion, |
4639 | * ::cuCtxGetCacheConfig, |
4640 | * ::cuCtxGetDevice, |
4641 | * ::cuCtxGetFlags, |
4642 | * ::cuCtxGetLimit, |
4643 | * ::cuCtxPopCurrent, |
4644 | * ::cuCtxPushCurrent, |
4645 | * ::cuCtxSetCacheConfig, |
4646 | * ::cuCtxSetLimit, |
4647 | * ::cuCtxSynchronize, |
4648 | * ::CUexecAffinityParam |
4649 | */ |
4650 | CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); |
4651 | |
4652 | /** |
4653 | * \brief Destroy a CUDA context |
4654 | * |
4655 | * Destroys the CUDA context specified by \p ctx. The context \p ctx will be |
4656 | * destroyed regardless of how many threads it is current to. |
4657 | * It is the responsibility of the calling function to ensure that no API |
4658 | * call issues using \p ctx while ::cuCtxDestroy() is executing. |
4659 | * |
4660 | * If \p ctx is current to the calling thread then \p ctx will also be |
4661 | * popped from the current thread's context stack (as though ::cuCtxPopCurrent() |
4662 | * were called). If \p ctx is current to other threads, then \p ctx will |
4663 | * remain current to those threads, and attempting to access \p ctx from |
4664 | * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. |
4665 | * |
4666 | * \param ctx - Context to destroy |
4667 | * |
4668 | * \return |
4669 | * ::CUDA_SUCCESS, |
4670 | * ::CUDA_ERROR_DEINITIALIZED, |
4671 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4672 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4673 | * ::CUDA_ERROR_INVALID_VALUE |
4674 | * \notefnerr |
4675 | * |
4676 | * \sa ::cuCtxCreate, |
4677 | * ::cuCtxGetApiVersion, |
4678 | * ::cuCtxGetCacheConfig, |
4679 | * ::cuCtxGetDevice, |
4680 | * ::cuCtxGetFlags, |
4681 | * ::cuCtxGetLimit, |
4682 | * ::cuCtxPopCurrent, |
4683 | * ::cuCtxPushCurrent, |
4684 | * ::cuCtxSetCacheConfig, |
4685 | * ::cuCtxSetLimit, |
4686 | * ::cuCtxSynchronize |
4687 | */ |
4688 | CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); |
4689 | |
4690 | /** |
4691 | * \brief Pushes a context on the current CPU thread |
4692 | * |
4693 | * Pushes the given context \p ctx onto the CPU thread's stack of current |
4694 | * contexts. The specified context becomes the CPU thread's current context, so |
4695 | * all CUDA functions that operate on the current context are affected. |
4696 | * |
4697 | * The previous current context may be made current again by calling |
4698 | * ::cuCtxDestroy() or ::cuCtxPopCurrent(). |
4699 | * |
4700 | * \param ctx - Context to push |
4701 | * |
4702 | * \return |
4703 | * ::CUDA_SUCCESS, |
4704 | * ::CUDA_ERROR_DEINITIALIZED, |
4705 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4706 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4707 | * ::CUDA_ERROR_INVALID_VALUE |
4708 | * \notefnerr |
4709 | * |
4710 | * \sa ::cuCtxCreate, |
4711 | * ::cuCtxDestroy, |
4712 | * ::cuCtxGetApiVersion, |
4713 | * ::cuCtxGetCacheConfig, |
4714 | * ::cuCtxGetDevice, |
4715 | * ::cuCtxGetFlags, |
4716 | * ::cuCtxGetLimit, |
4717 | * ::cuCtxPopCurrent, |
4718 | * ::cuCtxSetCacheConfig, |
4719 | * ::cuCtxSetLimit, |
4720 | * ::cuCtxSynchronize |
4721 | */ |
4722 | CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); |
4723 | |
4724 | /** |
4725 | * \brief Pops the current CUDA context from the current CPU thread. |
4726 | * |
4727 | * Pops the current CUDA context from the CPU thread and passes back the |
4728 | * old context handle in \p *pctx. That context may then be made current |
4729 | * to a different CPU thread by calling ::cuCtxPushCurrent(). |
4730 | * |
4731 | * If a context was current to the CPU thread before ::cuCtxCreate() or |
4732 | * ::cuCtxPushCurrent() was called, this function makes that context current to |
4733 | * the CPU thread again. |
4734 | * |
4735 | * \param pctx - Returned popped context handle |
4736 | * |
4737 | * \return |
4738 | * ::CUDA_SUCCESS, |
4739 | * ::CUDA_ERROR_DEINITIALIZED, |
4740 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4741 | * ::CUDA_ERROR_INVALID_CONTEXT |
4742 | * \notefnerr |
4743 | * |
4744 | * \sa ::cuCtxCreate, |
4745 | * ::cuCtxDestroy, |
4746 | * ::cuCtxGetApiVersion, |
4747 | * ::cuCtxGetCacheConfig, |
4748 | * ::cuCtxGetDevice, |
4749 | * ::cuCtxGetFlags, |
4750 | * ::cuCtxGetLimit, |
4751 | * ::cuCtxPushCurrent, |
4752 | * ::cuCtxSetCacheConfig, |
4753 | * ::cuCtxSetLimit, |
4754 | * ::cuCtxSynchronize |
4755 | */ |
4756 | CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); |
4757 | |
4758 | /** |
4759 | * \brief Binds the specified CUDA context to the calling CPU thread |
4760 | * |
4761 | * Binds the specified CUDA context to the calling CPU thread. |
4762 | * If \p ctx is NULL then the CUDA context previously bound to the |
4763 | * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. |
4764 | * |
4765 | * If there exists a CUDA context stack on the calling CPU thread, this |
4766 | * will replace the top of that stack with \p ctx. |
4767 | * If \p ctx is NULL then this will be equivalent to popping the top |
4768 | * of the calling CPU thread's CUDA context stack (or a no-op if the |
4769 | * calling CPU thread's CUDA context stack is empty). |
4770 | * |
4771 | * \param ctx - Context to bind to the calling CPU thread |
4772 | * |
4773 | * \return |
4774 | * ::CUDA_SUCCESS, |
4775 | * ::CUDA_ERROR_DEINITIALIZED, |
4776 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4777 | * ::CUDA_ERROR_INVALID_CONTEXT |
4778 | * \notefnerr |
4779 | * |
4780 | * \sa |
4781 | * ::cuCtxGetCurrent, |
4782 | * ::cuCtxCreate, |
4783 | * ::cuCtxDestroy, |
4784 | * ::cudaSetDevice |
4785 | */ |
4786 | CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); |
4787 | |
4788 | /** |
4789 | * \brief Returns the CUDA context bound to the calling CPU thread. |
4790 | * |
4791 | * Returns in \p *pctx the CUDA context bound to the calling CPU thread. |
4792 | * If no context is bound to the calling CPU thread then \p *pctx is |
4793 | * set to NULL and ::CUDA_SUCCESS is returned. |
4794 | * |
4795 | * \param pctx - Returned context handle |
4796 | * |
4797 | * \return |
4798 | * ::CUDA_SUCCESS, |
4799 | * ::CUDA_ERROR_DEINITIALIZED, |
4800 | * ::CUDA_ERROR_NOT_INITIALIZED |
4801 | * \notefnerr |
4802 | * |
4803 | * \sa |
4804 | * ::cuCtxSetCurrent, |
4805 | * ::cuCtxCreate, |
4806 | * ::cuCtxDestroy, |
4807 | * ::cudaGetDevice |
4808 | */ |
4809 | CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); |
4810 | |
4811 | /** |
4812 | * \brief Returns the device ID for the current context |
4813 | * |
4814 | * Returns in \p *device the ordinal of the current context's device. |
4815 | * |
4816 | * \param device - Returned device ID for the current context |
4817 | * |
4818 | * \return |
4819 | * ::CUDA_SUCCESS, |
4820 | * ::CUDA_ERROR_DEINITIALIZED, |
4821 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4822 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4823 | * ::CUDA_ERROR_INVALID_VALUE |
4824 | * \notefnerr |
4825 | * |
4826 | * \sa ::cuCtxCreate, |
4827 | * ::cuCtxDestroy, |
4828 | * ::cuCtxGetApiVersion, |
4829 | * ::cuCtxGetCacheConfig, |
4830 | * ::cuCtxGetFlags, |
4831 | * ::cuCtxGetLimit, |
4832 | * ::cuCtxPopCurrent, |
4833 | * ::cuCtxPushCurrent, |
4834 | * ::cuCtxSetCacheConfig, |
4835 | * ::cuCtxSetLimit, |
4836 | * ::cuCtxSynchronize, |
4837 | * ::cudaGetDevice |
4838 | */ |
4839 | CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); |
4840 | |
4841 | /** |
4842 | * \brief Returns the flags for the current context |
4843 | * |
4844 | * Returns in \p *flags the flags of the current context. See ::cuCtxCreate |
4845 | * for flag values. |
4846 | * |
4847 | * \param flags - Pointer to store flags of current context |
4848 | * |
4849 | * \return |
4850 | * ::CUDA_SUCCESS, |
4851 | * ::CUDA_ERROR_DEINITIALIZED, |
4852 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4853 | * ::CUDA_ERROR_INVALID_CONTEXT, |
4854 | * ::CUDA_ERROR_INVALID_VALUE |
4855 | * \notefnerr |
4856 | * |
4857 | * \sa ::cuCtxCreate, |
4858 | * ::cuCtxGetApiVersion, |
4859 | * ::cuCtxGetCacheConfig, |
4860 | * ::cuCtxGetCurrent, |
4861 | * ::cuCtxGetDevice, |
4862 | * ::cuCtxGetLimit, |
4863 | * ::cuCtxGetSharedMemConfig, |
4864 | * ::cuCtxGetStreamPriorityRange, |
4865 | * ::cudaGetDeviceFlags |
4866 | */ |
4867 | CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); |
4868 | |
4869 | /** |
4870 | * \brief Block for a context's tasks to complete |
4871 | * |
4872 | * Blocks until the device has completed all preceding requested tasks. |
4873 | * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. |
4874 | * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the |
4875 | * CPU thread will block until the GPU context has finished its work. |
4876 | * |
4877 | * \return |
4878 | * ::CUDA_SUCCESS, |
4879 | * ::CUDA_ERROR_DEINITIALIZED, |
4880 | * ::CUDA_ERROR_NOT_INITIALIZED, |
4881 | * ::CUDA_ERROR_INVALID_CONTEXT |
4882 | * \notefnerr |
4883 | * |
4884 | * \sa ::cuCtxCreate, |
4885 | * ::cuCtxDestroy, |
4886 | * ::cuCtxGetApiVersion, |
4887 | * ::cuCtxGetCacheConfig, |
4888 | * ::cuCtxGetDevice, |
4889 | * ::cuCtxGetFlags, |
4890 | * ::cuCtxGetLimit, |
4891 | * ::cuCtxPopCurrent, |
4892 | * ::cuCtxPushCurrent, |
4893 | * ::cuCtxSetCacheConfig, |
4894 | * ::cuCtxSetLimit, |
4895 | * ::cudaDeviceSynchronize |
4896 | */ |
4897 | CUresult CUDAAPI cuCtxSynchronize(void); |
4898 | |
4899 | /** |
4900 | * \brief Set resource limits |
4901 | * |
4902 | * Setting \p limit to \p value is a request by the application to update |
4903 | * the current limit maintained by the context. The driver is free to |
4904 | * modify the requested value to meet h/w requirements (this could be |
4905 | * clamping to minimum or maximum values, rounding up to nearest element |
4906 | * size, etc). The application can use ::cuCtxGetLimit() to find out exactly |
4907 | * what the limit has been set to. |
4908 | * |
4909 | * Setting each ::CUlimit has its own specific restrictions, so each is |
4910 | * discussed here. |
4911 | * |
4912 | * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. |
4913 | * The driver automatically increases the per-thread stack size |
4914 | * for each kernel launch as needed. This size isn't reset back to the |
4915 | * original value after each launch. Setting this value will take effect |
4916 | * immediately, and if necessary, the device will block until all preceding |
4917 | * requested tasks are complete. |
4918 | * |
4919 | * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used |
4920 | * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE |
4921 | * must be performed before launching any kernel that uses the ::printf() |
4922 | * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. |
4923 | * |
4924 | * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used |
4925 | * by the ::malloc() and ::free() device system calls. Setting |
4926 | * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel |
4927 | * that uses the ::malloc() or ::free() device system calls, otherwise |
4928 | * ::CUDA_ERROR_INVALID_VALUE will be returned. |
4929 | * |
4930 | * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of |
4931 | * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting |
4932 | * this limit must be performed before any launch of a kernel that uses the |
4933 | * device runtime and calls ::cudaDeviceSynchronize() above the default sync |
4934 | * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail |
4935 | * with error code ::cudaErrorSyncDepthExceeded if the limitation is |
4936 | * violated. This limit can be set smaller than the default or up to the maximum |
4937 | * launch depth of 24. When setting this limit, keep in mind that additional |
4938 | * levels of sync depth require the driver to reserve large amounts of device |
4939 | * memory which can no longer be used for user allocations. If these |
4940 | * reservations of device memory fail, ::cuCtxSetLimit() will return |
4941 | * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. |
4942 | * This limit is only applicable to devices of compute capability 3.5 and |
4943 | * higher. Attempting to set this limit on devices of compute capability less |
4944 | * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being |
4945 | * returned. |
4946 | * |
4947 | * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of |
4948 | * outstanding device runtime launches that can be made from the current |
4949 | * context. A grid is outstanding from the point of launch up until the grid |
4950 | * is known to have been completed. Device runtime launches which violate |
4951 | * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when |
4952 | * ::cudaGetLastError() is called after launch. If more pending launches than |
4953 | * the default (2048 launches) are needed for a module using the device |
4954 | * runtime, this limit can be increased. Keep in mind that being able to |
4955 | * sustain additional pending launches will require the driver to reserve |
4956 | * larger amounts of device memory upfront which can no longer be used for |
4957 | * allocations. If these reservations fail, ::cuCtxSetLimit() will return |
4958 | * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. |
4959 | * This limit is only applicable to devices of compute capability 3.5 and |
4960 | * higher. Attempting to set this limit on devices of compute capability less |
4961 | * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being |
4962 | * returned. |
4963 | * |
4964 | * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. |
4965 | * Values can range from 0B to 128B. This is purely a performance hint and |
4966 | * it can be ignored or clamped depending on the platform. |
4967 | * |
4968 | * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls the size in bytes available for |
4969 | * persisting L2 cache. This is purely a performance hint and it can be |
4970 | * ignored or clamped depending on the platform. |
4971 | * |
4972 | * \param limit - Limit to set |
4973 | * \param value - Size of limit |
4974 | * |
4975 | * \return |
4976 | * ::CUDA_SUCCESS, |
4977 | * ::CUDA_ERROR_INVALID_VALUE, |
4978 | * ::CUDA_ERROR_UNSUPPORTED_LIMIT, |
4979 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
4980 | * ::CUDA_ERROR_INVALID_CONTEXT |
4981 | * \notefnerr |
4982 | * |
4983 | * \sa ::cuCtxCreate, |
4984 | * ::cuCtxDestroy, |
4985 | * ::cuCtxGetApiVersion, |
4986 | * ::cuCtxGetCacheConfig, |
4987 | * ::cuCtxGetDevice, |
4988 | * ::cuCtxGetFlags, |
4989 | * ::cuCtxGetLimit, |
4990 | * ::cuCtxPopCurrent, |
4991 | * ::cuCtxPushCurrent, |
4992 | * ::cuCtxSetCacheConfig, |
4993 | * ::cuCtxSynchronize, |
4994 | * ::cudaDeviceSetLimit |
4995 | */ |
4996 | CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); |
4997 | |
4998 | /** |
4999 | * \brief Returns resource limits |
5000 | * |
5001 | * Returns in \p *pvalue the current size of \p limit. The supported |
5002 | * ::CUlimit values are: |
5003 | * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. |
5004 | * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the |
5005 | * ::printf() device system call. |
5006 | * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the |
5007 | * ::malloc() and ::free() device system calls. |
5008 | * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread |
5009 | * can issue the device runtime call ::cudaDeviceSynchronize() to wait on |
5010 | * child grid launches to complete. |
5011 | * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding |
5012 | * device runtime launches that can be made from this context. |
5013 | * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. |
5014 | * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: persisting L2 cache size in bytes. |
5015 | * |
5016 | * \param limit - Limit to query |
5017 | * \param pvalue - Returned size of limit |
5018 | * |
5019 | * \return |
5020 | * ::CUDA_SUCCESS, |
5021 | * ::CUDA_ERROR_INVALID_VALUE, |
5022 | * ::CUDA_ERROR_UNSUPPORTED_LIMIT |
5023 | * \notefnerr |
5024 | * |
5025 | * \sa ::cuCtxCreate, |
5026 | * ::cuCtxDestroy, |
5027 | * ::cuCtxGetApiVersion, |
5028 | * ::cuCtxGetCacheConfig, |
5029 | * ::cuCtxGetDevice, |
5030 | * ::cuCtxGetFlags, |
5031 | * ::cuCtxPopCurrent, |
5032 | * ::cuCtxPushCurrent, |
5033 | * ::cuCtxSetCacheConfig, |
5034 | * ::cuCtxSetLimit, |
5035 | * ::cuCtxSynchronize, |
5036 | * ::cudaDeviceGetLimit |
5037 | */ |
5038 | CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); |
5039 | |
5040 | /** |
5041 | * \brief Returns the preferred cache configuration for the current context. |
5042 | * |
5043 | * On devices where the L1 cache and shared memory use the same hardware |
5044 | * resources, this function returns through \p pconfig the preferred cache configuration |
5045 | * for the current context. This is only a preference. The driver will use |
5046 | * the requested configuration if possible, but it is free to choose a different |
5047 | * configuration if required to execute functions. |
5048 | * |
5049 | * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices |
5050 | * where the size of the L1 cache and shared memory are fixed. |
5051 | * |
5052 | * The supported cache configurations are: |
5053 | * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
5054 | * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
5055 | * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
5056 | * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory |
5057 | * |
5058 | * \param pconfig - Returned cache configuration |
5059 | * |
5060 | * \return |
5061 | * ::CUDA_SUCCESS, |
5062 | * ::CUDA_ERROR_DEINITIALIZED, |
5063 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5064 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5065 | * ::CUDA_ERROR_INVALID_VALUE |
5066 | * \notefnerr |
5067 | * |
5068 | * \sa ::cuCtxCreate, |
5069 | * ::cuCtxDestroy, |
5070 | * ::cuCtxGetApiVersion, |
5071 | * ::cuCtxGetDevice, |
5072 | * ::cuCtxGetFlags, |
5073 | * ::cuCtxGetLimit, |
5074 | * ::cuCtxPopCurrent, |
5075 | * ::cuCtxPushCurrent, |
5076 | * ::cuCtxSetCacheConfig, |
5077 | * ::cuCtxSetLimit, |
5078 | * ::cuCtxSynchronize, |
5079 | * ::cuFuncSetCacheConfig, |
5080 | * ::cudaDeviceGetCacheConfig |
5081 | */ |
5082 | CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); |
5083 | |
5084 | /** |
5085 | * \brief Sets the preferred cache configuration for the current context. |
5086 | * |
5087 | * On devices where the L1 cache and shared memory use the same hardware |
5088 | * resources, this sets through \p config the preferred cache configuration for |
5089 | * the current context. This is only a preference. The driver will use |
5090 | * the requested configuration if possible, but it is free to choose a different |
5091 | * configuration if required to execute the function. Any function preference |
5092 | * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide |
5093 | * setting. Setting the context-wide cache configuration to |
5094 | * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer |
5095 | * to not change the cache configuration unless required to launch the kernel. |
5096 | * |
5097 | * This setting does nothing on devices where the size of the L1 cache and |
5098 | * shared memory are fixed. |
5099 | * |
5100 | * Launching a kernel with a different preference than the most recent |
5101 | * preference setting may insert a device-side synchronization point. |
5102 | * |
5103 | * The supported cache configurations are: |
5104 | * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
5105 | * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
5106 | * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
5107 | * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory |
5108 | * |
5109 | * \param config - Requested cache configuration |
5110 | * |
5111 | * \return |
5112 | * ::CUDA_SUCCESS, |
5113 | * ::CUDA_ERROR_DEINITIALIZED, |
5114 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5115 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5116 | * ::CUDA_ERROR_INVALID_VALUE |
5117 | * \notefnerr |
5118 | * |
5119 | * \sa ::cuCtxCreate, |
5120 | * ::cuCtxDestroy, |
5121 | * ::cuCtxGetApiVersion, |
5122 | * ::cuCtxGetCacheConfig, |
5123 | * ::cuCtxGetDevice, |
5124 | * ::cuCtxGetFlags, |
5125 | * ::cuCtxGetLimit, |
5126 | * ::cuCtxPopCurrent, |
5127 | * ::cuCtxPushCurrent, |
5128 | * ::cuCtxSetLimit, |
5129 | * ::cuCtxSynchronize, |
5130 | * ::cuFuncSetCacheConfig, |
5131 | * ::cudaDeviceSetCacheConfig |
5132 | */ |
5133 | CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); |
5134 | |
5135 | /** |
5136 | * \brief Returns the current shared memory configuration for the current context. |
5137 | * |
5138 | * This function will return in \p pConfig the current size of shared memory banks |
5139 | * in the current context. On devices with configurable shared memory banks, |
5140 | * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all |
5141 | * subsequent kernel launches will by default use the new bank size. When |
5142 | * ::cuCtxGetSharedMemConfig is called on devices without configurable shared |
5143 | * memory, it will return the fixed bank size of the hardware. |
5144 | * |
5145 | * The returned bank configurations can be either: |
5146 | * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is |
5147 | * four bytes. |
5148 | * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is |
5149 | * eight bytes. |
5150 | * |
5151 | * \param pConfig - returned shared memory configuration |
5152 | * \return |
5153 | * ::CUDA_SUCCESS, |
5154 | * ::CUDA_ERROR_DEINITIALIZED, |
5155 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5156 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5157 | * ::CUDA_ERROR_INVALID_VALUE |
5158 | * \notefnerr |
5159 | * |
5160 | * \sa ::cuCtxCreate, |
5161 | * ::cuCtxDestroy, |
5162 | * ::cuCtxGetApiVersion, |
5163 | * ::cuCtxGetCacheConfig, |
5164 | * ::cuCtxGetDevice, |
5165 | * ::cuCtxGetFlags, |
5166 | * ::cuCtxGetLimit, |
5167 | * ::cuCtxPopCurrent, |
5168 | * ::cuCtxPushCurrent, |
5169 | * ::cuCtxSetLimit, |
5170 | * ::cuCtxSynchronize, |
5171 | * ::cuCtxSetSharedMemConfig, |
5172 | * ::cuFuncSetCacheConfig, |
5173 | * ::cudaDeviceGetSharedMemConfig |
5174 | */ |
5175 | CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); |
5176 | |
5177 | /** |
5178 | * \brief Sets the shared memory configuration for the current context. |
5179 | * |
5180 | * On devices with configurable shared memory banks, this function will set |
5181 | * the context's shared memory bank size which is used for subsequent kernel |
5182 | * launches. |
5183 | * |
5184 | * Changing the shared memory configuration between launches may insert a device |
5185 | * side synchronization point between those launches. |
5186 | * |
5187 | * Changing the shared memory bank size will not increase shared memory usage |
5188 | * or affect occupancy of kernels, but may have major effects on performance. |
5189 | * Larger bank sizes will allow for greater potential bandwidth to shared memory, |
5190 | * but will change what kinds of accesses to shared memory will result in bank |
5191 | * conflicts. |
5192 | * |
5193 | * This function will do nothing on devices with fixed shared memory bank size. |
5194 | * |
5195 | * The supported bank configurations are: |
5196 | * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial |
5197 | * setting (currently, four bytes). |
5198 | * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to |
5199 | * be natively four bytes. |
5200 | * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to |
5201 | * be natively eight bytes. |
5202 | * |
5203 | * \param config - requested shared memory configuration |
5204 | * |
5205 | * \return |
5206 | * ::CUDA_SUCCESS, |
5207 | * ::CUDA_ERROR_DEINITIALIZED, |
5208 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5209 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5210 | * ::CUDA_ERROR_INVALID_VALUE |
5211 | * \notefnerr |
5212 | * |
5213 | * \sa ::cuCtxCreate, |
5214 | * ::cuCtxDestroy, |
5215 | * ::cuCtxGetApiVersion, |
5216 | * ::cuCtxGetCacheConfig, |
5217 | * ::cuCtxGetDevice, |
5218 | * ::cuCtxGetFlags, |
5219 | * ::cuCtxGetLimit, |
5220 | * ::cuCtxPopCurrent, |
5221 | * ::cuCtxPushCurrent, |
5222 | * ::cuCtxSetLimit, |
5223 | * ::cuCtxSynchronize, |
5224 | * ::cuCtxGetSharedMemConfig, |
5225 | * ::cuFuncSetCacheConfig, |
5226 | * ::cudaDeviceSetSharedMemConfig |
5227 | */ |
5228 | CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); |
5229 | |
5230 | /** |
5231 | * \brief Gets the context's API version. |
5232 | * |
5233 | * Returns a version number in \p version corresponding to the capabilities of |
5234 | * the context (e.g. 3010 or 3020), which library developers can use to direct |
5235 | * callers to a specific API version. If \p ctx is NULL, returns the API version |
5236 | * used to create the currently bound context. |
5237 | * |
5238 | * Note that new API versions are only introduced when context capabilities are |
5239 | * changed that break binary compatibility, so the API version and driver version |
5240 | * may be different. For example, it is valid for the API version to be 3020 while |
5241 | * the driver version is 4020. |
5242 | * |
5243 | * \param ctx - Context to check |
5244 | * \param version - Pointer to version |
5245 | * |
5246 | * \return |
5247 | * ::CUDA_SUCCESS, |
5248 | * ::CUDA_ERROR_DEINITIALIZED, |
5249 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5250 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5251 | * ::CUDA_ERROR_INVALID_VALUE, |
5252 | * ::CUDA_ERROR_UNKNOWN |
5253 | * \notefnerr |
5254 | * |
5255 | * \sa ::cuCtxCreate, |
5256 | * ::cuCtxDestroy, |
5257 | * ::cuCtxGetDevice, |
5258 | * ::cuCtxGetFlags, |
5259 | * ::cuCtxGetLimit, |
5260 | * ::cuCtxPopCurrent, |
5261 | * ::cuCtxPushCurrent, |
5262 | * ::cuCtxSetCacheConfig, |
5263 | * ::cuCtxSetLimit, |
5264 | * ::cuCtxSynchronize |
5265 | */ |
5266 | CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); |
5267 | |
5268 | /** |
5269 | * \brief Returns numerical values that correspond to the least and |
5270 | * greatest stream priorities. |
5271 | * |
5272 | * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond |
5273 | * to the least and greatest stream priorities respectively. Stream priorities |
5274 | * follow a convention where lower numbers imply greater priorities. The range of |
5275 | * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. |
5276 | * If the user attempts to create a stream with a priority value that is |
5277 | * outside the meaningful range as specified by this API, the priority is |
5278 | * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority |
5279 | * respectively. See ::cuStreamCreateWithPriority for details on creating a |
5280 | * priority stream. |
5281 | * A NULL may be passed in for \p leastPriority or \p greatestPriority if the value |
5282 | * is not desired. |
5283 | * |
5284 | * This function will return '0' in both \p *leastPriority and \p *greatestPriority if |
5285 | * the current context's device does not support stream priorities |
5286 | * (see ::cuDeviceGetAttribute). |
5287 | * |
5288 | * \param leastPriority - Pointer to an int in which the numerical value for least |
5289 | * stream priority is returned |
5290 | * \param greatestPriority - Pointer to an int in which the numerical value for greatest |
5291 | * stream priority is returned |
5292 | * |
5293 | * \return |
5294 | * ::CUDA_SUCCESS, |
5295 | * ::CUDA_ERROR_INVALID_VALUE |
5296 | * \notefnerr |
5297 | * |
5298 | * \sa ::cuStreamCreateWithPriority, |
5299 | * ::cuStreamGetPriority, |
5300 | * ::cuCtxGetDevice, |
5301 | * ::cuCtxGetFlags, |
5302 | * ::cuCtxSetLimit, |
5303 | * ::cuCtxSynchronize, |
5304 | * ::cudaDeviceGetStreamPriorityRange |
5305 | */ |
5306 | CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); |
5307 | |
5308 | /** |
5309 | * \brief Resets all persisting lines in cache to normal status. |
5310 | * |
5311 | * ::cuCtxResetPersistingL2Cache() resets all persisting lines in cache to normal |
5312 | * status. Takes effect on function return. |
5313 | * |
5314 | * \return |
5315 | * ::CUDA_SUCCESS, |
5316 | * ::CUDA_ERROR_NOT_SUPPORTED |
5317 | * \notefnerr |
5318 | * |
5319 | * \sa |
5320 | * ::CUaccessPolicyWindow |
5321 | */ |
5322 | CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); |
5323 | |
5324 | /** |
5325 | * \brief Returns the execution affinity setting for the current context. |
5326 | * |
5327 | * Returns in \p *pExecAffinity the current value of \p type. The supported |
5328 | * ::CUexecAffinityType values are: |
5329 | * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. |
5330 | * |
5331 | * \param type - Execution affinity type to query |
5332 | * \param pExecAffinity - Returned execution affinity |
5333 | * |
5334 | * \return |
5335 | * ::CUDA_SUCCESS, |
5336 | * ::CUDA_ERROR_DEINITIALIZED, |
5337 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5338 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5339 | * ::CUDA_ERROR_INVALID_VALUE, |
5340 | * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY |
5341 | * \notefnerr |
5342 | * |
5343 | * \sa |
5344 | * ::CUexecAffinityParam |
5345 | */ |
5346 | CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); |
5347 | |
5348 | |
5349 | /** @} */ /* END CUDA_CTX */ |
5350 | |
5351 | /** |
5352 | * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] |
5353 | * |
5354 | * ___MANBRIEF___ deprecated context management functions of the low-level CUDA |
5355 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
5356 | * |
5357 | * This section describes the deprecated context management functions of the low-level |
5358 | * CUDA driver application programming interface. |
5359 | * |
5360 | * @{ |
5361 | */ |
5362 | |
5363 | /** |
5364 | * \brief Increment a context's usage-count |
5365 | * |
5366 | * \deprecated |
5367 | * |
5368 | * Note that this function is deprecated and should not be used. |
5369 | * |
5370 | * Increments the usage count of the context and passes back a context handle |
5371 | * in \p *pctx that must be passed to ::cuCtxDetach() when the application is |
5372 | * done with the context. ::cuCtxAttach() fails if there is no context current |
5373 | * to the thread. |
5374 | * |
5375 | * Currently, the \p flags parameter must be 0. |
5376 | * |
5377 | * \param pctx - Returned context handle of the current context |
5378 | * \param flags - Context attach flags (must be 0) |
5379 | * |
5380 | * \return |
5381 | * ::CUDA_SUCCESS, |
5382 | * ::CUDA_ERROR_DEINITIALIZED, |
5383 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5384 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5385 | * ::CUDA_ERROR_INVALID_VALUE |
5386 | * \notefnerr |
5387 | * |
5388 | * \sa ::cuCtxCreate, |
5389 | * ::cuCtxDestroy, |
5390 | * ::cuCtxDetach, |
5391 | * ::cuCtxGetApiVersion, |
5392 | * ::cuCtxGetCacheConfig, |
5393 | * ::cuCtxGetDevice, |
5394 | * ::cuCtxGetFlags, |
5395 | * ::cuCtxGetLimit, |
5396 | * ::cuCtxPopCurrent, |
5397 | * ::cuCtxPushCurrent, |
5398 | * ::cuCtxSetCacheConfig, |
5399 | * ::cuCtxSetLimit, |
5400 | * ::cuCtxSynchronize |
5401 | */ |
5402 | __CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); |
5403 | |
5404 | /** |
5405 | * \brief Decrement a context's usage-count |
5406 | * |
5407 | * \deprecated |
5408 | * |
5409 | * Note that this function is deprecated and should not be used. |
5410 | * |
5411 | * Decrements the usage count of the context \p ctx, and destroys the context |
5412 | * if the usage count goes to 0. The context must be a handle that was passed |
5413 | * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the |
5414 | * calling thread. |
5415 | * |
5416 | * \param ctx - Context to destroy |
5417 | * |
5418 | * \return |
5419 | * ::CUDA_SUCCESS, |
5420 | * ::CUDA_ERROR_DEINITIALIZED, |
5421 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5422 | * ::CUDA_ERROR_INVALID_CONTEXT |
5423 | * \notefnerr |
5424 | * |
5425 | * \sa ::cuCtxCreate, |
5426 | * ::cuCtxDestroy, |
5427 | * ::cuCtxGetApiVersion, |
5428 | * ::cuCtxGetCacheConfig, |
5429 | * ::cuCtxGetDevice, |
5430 | * ::cuCtxGetFlags, |
5431 | * ::cuCtxGetLimit, |
5432 | * ::cuCtxPopCurrent, |
5433 | * ::cuCtxPushCurrent, |
5434 | * ::cuCtxSetCacheConfig, |
5435 | * ::cuCtxSetLimit, |
5436 | * ::cuCtxSynchronize |
5437 | */ |
5438 | __CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); |
5439 | |
5440 | /** @} */ /* END CUDA_CTX_DEPRECATED */ |
5441 | |
5442 | |
5443 | /** |
5444 | * \defgroup CUDA_MODULE Module Management |
5445 | * |
5446 | * ___MANBRIEF___ module management functions of the low-level CUDA driver API |
5447 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
5448 | * |
5449 | * This section describes the module management functions of the low-level CUDA |
5450 | * driver application programming interface. |
5451 | * |
5452 | * @{ |
5453 | */ |
5454 | |
5455 | /** |
5456 | * \brief Loads a compute module |
5457 | * |
5458 | * Takes a filename \p fname and loads the corresponding module \p module into |
5459 | * the current context. The CUDA driver API does not attempt to lazily |
5460 | * allocate the resources needed by a module; if the memory for functions and |
5461 | * data (constant and global) needed by the module cannot be allocated, |
5462 | * ::cuModuleLoad() fails. The file should be a \e cubin file as output by |
5463 | * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or |
5464 | * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. |
5465 | * |
5466 | * \param module - Returned module |
5467 | * \param fname - Filename of module to load |
5468 | * |
5469 | * \return |
5470 | * ::CUDA_SUCCESS, |
5471 | * ::CUDA_ERROR_DEINITIALIZED, |
5472 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5473 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5474 | * ::CUDA_ERROR_INVALID_VALUE, |
5475 | * ::CUDA_ERROR_INVALID_PTX, |
5476 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5477 | * ::CUDA_ERROR_NOT_FOUND, |
5478 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5479 | * ::CUDA_ERROR_FILE_NOT_FOUND, |
5480 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
5481 | * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
5482 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
5483 | * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
5484 | * \notefnerr |
5485 | * |
5486 | * \sa ::cuModuleGetFunction, |
5487 | * ::cuModuleGetGlobal, |
5488 | * ::cuModuleGetTexRef, |
5489 | * ::cuModuleLoadData, |
5490 | * ::cuModuleLoadDataEx, |
5491 | * ::cuModuleLoadFatBinary, |
5492 | * ::cuModuleUnload |
5493 | */ |
5494 | CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); |
5495 | |
5496 | /** |
5497 | * \brief Load a module's data |
5498 | * |
5499 | * Takes a pointer \p image and loads the corresponding module \p module into |
5500 | * the current context. The pointer may be obtained by mapping a \e cubin or |
5501 | * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file |
5502 | * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin |
5503 | * object into the executable resources and using operating system calls such |
5504 | * as Windows \c FindResource() to obtain the pointer. |
5505 | * |
5506 | * \param module - Returned module |
5507 | * \param image - Module data to load |
5508 | * |
5509 | * \return |
5510 | * ::CUDA_SUCCESS, |
5511 | * ::CUDA_ERROR_DEINITIALIZED, |
5512 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5513 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5514 | * ::CUDA_ERROR_INVALID_VALUE, |
5515 | * ::CUDA_ERROR_INVALID_PTX, |
5516 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5517 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5518 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
5519 | * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
5520 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
5521 | * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
5522 | * \notefnerr |
5523 | * |
5524 | * \sa ::cuModuleGetFunction, |
5525 | * ::cuModuleGetGlobal, |
5526 | * ::cuModuleGetTexRef, |
5527 | * ::cuModuleLoad, |
5528 | * ::cuModuleLoadDataEx, |
5529 | * ::cuModuleLoadFatBinary, |
5530 | * ::cuModuleUnload |
5531 | */ |
5532 | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); |
5533 | |
5534 | /** |
5535 | * \brief Load a module's data with options |
5536 | * |
5537 | * Takes a pointer \p image and loads the corresponding module \p module into |
5538 | * the current context. The pointer may be obtained by mapping a \e cubin or |
5539 | * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file |
5540 | * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin |
5541 | * object into the executable resources and using operating system calls such |
5542 | * as Windows \c FindResource() to obtain the pointer. Options are passed as |
5543 | * an array via \p options and any corresponding parameters are passed in |
5544 | * \p optionValues. The total number of options is supplied via \p numOptions. |
5545 | * Any outputs will be returned via \p optionValues. |
5546 | * |
5547 | * \param module - Returned module |
5548 | * \param image - Module data to load |
5549 | * \param numOptions - Number of options |
5550 | * \param options - Options for JIT |
5551 | * \param optionValues - Option values for JIT |
5552 | * |
5553 | * \return |
5554 | * ::CUDA_SUCCESS, |
5555 | * ::CUDA_ERROR_DEINITIALIZED, |
5556 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5557 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5558 | * ::CUDA_ERROR_INVALID_VALUE, |
5559 | * ::CUDA_ERROR_INVALID_PTX, |
5560 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5561 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5562 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
5563 | * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
5564 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
5565 | * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
5566 | * \notefnerr |
5567 | * |
5568 | * \sa ::cuModuleGetFunction, |
5569 | * ::cuModuleGetGlobal, |
5570 | * ::cuModuleGetTexRef, |
5571 | * ::cuModuleLoad, |
5572 | * ::cuModuleLoadData, |
5573 | * ::cuModuleLoadFatBinary, |
5574 | * ::cuModuleUnload |
5575 | */ |
5576 | CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); |
5577 | |
5578 | /** |
5579 | * \brief Load a module's data from a fat binary |
5580 | * |
5581 | * Takes a pointer \p fatCubin and loads the corresponding module \p module |
5582 | * into the current context. The pointer represents a <i>fat binary</i> object, |
5583 | * which is a collection of different \e cubin and/or \e PTX files, all |
5584 | * representing the same device code, but compiled and optimized for different |
5585 | * architectures. |
5586 | * |
5587 | * Prior to CUDA 4.0, there was no documented API for constructing and using |
5588 | * fat binary objects by programmers. Starting with CUDA 4.0, fat binary |
5589 | * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc. |
5590 | * More information can be found in the \b nvcc documentation. |
5591 | * |
5592 | * \param module - Returned module |
5593 | * \param fatCubin - Fat binary to load |
5594 | * |
5595 | * \return |
5596 | * ::CUDA_SUCCESS, |
5597 | * ::CUDA_ERROR_DEINITIALIZED, |
5598 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5599 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5600 | * ::CUDA_ERROR_INVALID_VALUE, |
5601 | * ::CUDA_ERROR_INVALID_PTX, |
5602 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5603 | * ::CUDA_ERROR_NOT_FOUND, |
5604 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5605 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
5606 | * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
5607 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
5608 | * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
5609 | * \notefnerr |
5610 | * |
5611 | * \sa ::cuModuleGetFunction, |
5612 | * ::cuModuleGetGlobal, |
5613 | * ::cuModuleGetTexRef, |
5614 | * ::cuModuleLoad, |
5615 | * ::cuModuleLoadData, |
5616 | * ::cuModuleLoadDataEx, |
5617 | * ::cuModuleUnload |
5618 | */ |
5619 | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); |
5620 | |
5621 | /** |
5622 | * \brief Unloads a module |
5623 | * |
5624 | * Unloads a module \p hmod from the current context. |
5625 | * |
5626 | * \param hmod - Module to unload |
5627 | * |
5628 | * \return |
5629 | * ::CUDA_SUCCESS, |
5630 | * ::CUDA_ERROR_DEINITIALIZED, |
5631 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5632 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5633 | * ::CUDA_ERROR_INVALID_VALUE |
5634 | * \notefnerr |
5635 | * |
5636 | * \sa ::cuModuleGetFunction, |
5637 | * ::cuModuleGetGlobal, |
5638 | * ::cuModuleGetTexRef, |
5639 | * ::cuModuleLoad, |
5640 | * ::cuModuleLoadData, |
5641 | * ::cuModuleLoadDataEx, |
5642 | * ::cuModuleLoadFatBinary |
5643 | */ |
5644 | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); |
5645 | |
5646 | /** |
5647 | * \brief Returns a function handle |
5648 | * |
5649 | * Returns in \p *hfunc the handle of the function of name \p name located in |
5650 | * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() |
5651 | * returns ::CUDA_ERROR_NOT_FOUND. |
5652 | * |
5653 | * \param hfunc - Returned function handle |
5654 | * \param hmod - Module to retrieve function from |
5655 | * \param name - Name of function to retrieve |
5656 | * |
5657 | * \return |
5658 | * ::CUDA_SUCCESS, |
5659 | * ::CUDA_ERROR_DEINITIALIZED, |
5660 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5661 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5662 | * ::CUDA_ERROR_INVALID_VALUE, |
5663 | * ::CUDA_ERROR_NOT_FOUND |
5664 | * \notefnerr |
5665 | * |
5666 | * \sa ::cuModuleGetGlobal, |
5667 | * ::cuModuleGetTexRef, |
5668 | * ::cuModuleLoad, |
5669 | * ::cuModuleLoadData, |
5670 | * ::cuModuleLoadDataEx, |
5671 | * ::cuModuleLoadFatBinary, |
5672 | * ::cuModuleUnload |
5673 | */ |
5674 | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); |
5675 | |
5676 | /** |
5677 | * \brief Returns a global pointer from a module |
5678 | * |
5679 | * Returns in \p *dptr and \p *bytes the base pointer and size of the |
5680 | * global of name \p name located in module \p hmod. If no variable of that name |
5681 | * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both |
5682 | * parameters \p dptr and \p bytes are optional. If one of them is |
5683 | * NULL, it is ignored. |
5684 | * |
5685 | * \param dptr - Returned global device pointer |
5686 | * \param bytes - Returned global size in bytes |
5687 | * \param hmod - Module to retrieve global from |
5688 | * \param name - Name of global to retrieve |
5689 | * |
5690 | * \return |
5691 | * ::CUDA_SUCCESS, |
5692 | * ::CUDA_ERROR_DEINITIALIZED, |
5693 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5694 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5695 | * ::CUDA_ERROR_INVALID_VALUE, |
5696 | * ::CUDA_ERROR_NOT_FOUND |
5697 | * \notefnerr |
5698 | * |
5699 | * \sa ::cuModuleGetFunction, |
5700 | * ::cuModuleGetTexRef, |
5701 | * ::cuModuleLoad, |
5702 | * ::cuModuleLoadData, |
5703 | * ::cuModuleLoadDataEx, |
5704 | * ::cuModuleLoadFatBinary, |
5705 | * ::cuModuleUnload, |
5706 | * ::cudaGetSymbolAddress, |
5707 | * ::cudaGetSymbolSize |
5708 | */ |
5709 | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); |
5710 | |
5711 | /** |
5712 | * \brief Returns a handle to a texture reference |
5713 | * |
5714 | * Returns in \p *pTexRef the handle of the texture reference of name \p name |
5715 | * in the module \p hmod. If no texture reference of that name exists, |
5716 | * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference |
5717 | * handle should not be destroyed, since it will be destroyed when the module |
5718 | * is unloaded. |
5719 | * |
5720 | * \param pTexRef - Returned texture reference |
5721 | * \param hmod - Module to retrieve texture reference from |
5722 | * \param name - Name of texture reference to retrieve |
5723 | * |
5724 | * \return |
5725 | * ::CUDA_SUCCESS, |
5726 | * ::CUDA_ERROR_DEINITIALIZED, |
5727 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5728 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5729 | * ::CUDA_ERROR_INVALID_VALUE, |
5730 | * ::CUDA_ERROR_NOT_FOUND |
5731 | * \notefnerr |
5732 | * |
5733 | * \sa ::cuModuleGetFunction, |
5734 | * ::cuModuleGetGlobal, |
5735 | * ::cuModuleGetSurfRef, |
5736 | * ::cuModuleLoad, |
5737 | * ::cuModuleLoadData, |
5738 | * ::cuModuleLoadDataEx, |
5739 | * ::cuModuleLoadFatBinary, |
5740 | * ::cuModuleUnload, |
5741 | * ::cudaGetTextureReference |
5742 | */ |
5743 | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); |
5744 | |
5745 | /** |
5746 | * \brief Returns a handle to a surface reference |
5747 | * |
5748 | * Returns in \p *pSurfRef the handle of the surface reference of name \p name |
5749 | * in the module \p hmod. If no surface reference of that name exists, |
5750 | * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. |
5751 | * |
5752 | * \param pSurfRef - Returned surface reference |
5753 | * \param hmod - Module to retrieve surface reference from |
5754 | * \param name - Name of surface reference to retrieve |
5755 | * |
5756 | * \return |
5757 | * ::CUDA_SUCCESS, |
5758 | * ::CUDA_ERROR_DEINITIALIZED, |
5759 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5760 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5761 | * ::CUDA_ERROR_INVALID_VALUE, |
5762 | * ::CUDA_ERROR_NOT_FOUND |
5763 | * \notefnerr |
5764 | * |
5765 | * \sa ::cuModuleGetFunction, |
5766 | * ::cuModuleGetGlobal, |
5767 | * ::cuModuleGetTexRef, |
5768 | * ::cuModuleLoad, |
5769 | * ::cuModuleLoadData, |
5770 | * ::cuModuleLoadDataEx, |
5771 | * ::cuModuleLoadFatBinary, |
5772 | * ::cuModuleUnload, |
5773 | * ::cudaGetSurfaceReference |
5774 | */ |
5775 | CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); |
5776 | |
5777 | /** |
5778 | * \brief Creates a pending JIT linker invocation. |
5779 | * |
5780 | * If the call is successful, the caller owns the returned CUlinkState, which |
5781 | * should eventually be destroyed with ::cuLinkDestroy. The |
5782 | * device code machine size (32 or 64 bit) will match the calling application. |
5783 | * |
5784 | * Both linker and compiler options may be specified. Compiler options will |
5785 | * be applied to inputs to this linker action which must be compiled from PTX. |
5786 | * The options ::CU_JIT_WALL_TIME, |
5787 | * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES |
5788 | * will accumulate data until the CUlinkState is destroyed. |
5789 | * |
5790 | * \p optionValues must remain valid for the life of the CUlinkState if output |
5791 | * options are used. No other references to inputs are maintained after this |
5792 | * call returns. |
5793 | * |
5794 | * \param numOptions Size of options arrays |
5795 | * \param options Array of linker and compiler options |
5796 | * \param optionValues Array of option values, each cast to void * |
5797 | * \param stateOut On success, this will contain a CUlinkState to specify |
5798 | * and complete this action |
5799 | * |
5800 | * \return |
5801 | * ::CUDA_SUCCESS, |
5802 | * ::CUDA_ERROR_DEINITIALIZED, |
5803 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5804 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5805 | * ::CUDA_ERROR_INVALID_VALUE, |
5806 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5807 | * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
5808 | * \notefnerr |
5809 | * |
5810 | * \sa ::cuLinkAddData, |
5811 | * ::cuLinkAddFile, |
5812 | * ::cuLinkComplete, |
5813 | * ::cuLinkDestroy |
5814 | */ |
5815 | CUresult CUDAAPI |
5816 | cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); |
5817 | |
5818 | /** |
5819 | * \brief Add an input to a pending linker invocation |
5820 | * |
5821 | * Ownership of \p data is retained by the caller. No reference is retained to any |
5822 | * inputs after this call returns. |
5823 | * |
5824 | * This method accepts only compiler options, which are used if the data must |
5825 | * be compiled from PTX, and does not accept any of |
5826 | * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, |
5827 | * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. |
5828 | * |
5829 | * \param state A pending linker action. |
5830 | * \param type The type of the input data. |
5831 | * \param data The input data. PTX must be NULL-terminated. |
5832 | * \param size The length of the input data. |
5833 | * \param name An optional name for this input in log messages. |
5834 | * \param numOptions Size of options. |
5835 | * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). |
5836 | * \param optionValues Array of option values, each cast to void *. |
5837 | * |
5838 | * \return |
5839 | * ::CUDA_SUCCESS, |
5840 | * ::CUDA_ERROR_INVALID_HANDLE, |
5841 | * ::CUDA_ERROR_INVALID_VALUE, |
5842 | * ::CUDA_ERROR_INVALID_IMAGE, |
5843 | * ::CUDA_ERROR_INVALID_PTX, |
5844 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5845 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5846 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU |
5847 | * |
5848 | * \sa ::cuLinkCreate, |
5849 | * ::cuLinkAddFile, |
5850 | * ::cuLinkComplete, |
5851 | * ::cuLinkDestroy |
5852 | */ |
5853 | CUresult CUDAAPI |
5854 | cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, |
5855 | unsigned int numOptions, CUjit_option *options, void **optionValues); |
5856 | |
5857 | /** |
5858 | * \brief Add a file input to a pending linker invocation |
5859 | * |
5860 | * No reference is retained to any inputs after this call returns. |
5861 | * |
5862 | * This method accepts only compiler options, which are used if the input |
5863 | * must be compiled from PTX, and does not accept any of |
5864 | * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, |
5865 | * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. |
5866 | * |
5867 | * This method is equivalent to invoking ::cuLinkAddData on the contents |
5868 | * of the file. |
5869 | * |
5870 | * \param state A pending linker action |
5871 | * \param type The type of the input data |
5872 | * \param path Path to the input file |
5873 | * \param numOptions Size of options |
5874 | * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) |
5875 | * \param optionValues Array of option values, each cast to void * |
5876 | * |
5877 | * \return |
5878 | * ::CUDA_SUCCESS, |
5879 | * ::CUDA_ERROR_FILE_NOT_FOUND, |
5880 | * ::CUDA_ERROR_INVALID_HANDLE, |
5881 | * ::CUDA_ERROR_INVALID_VALUE, |
5882 | * ::CUDA_ERROR_INVALID_IMAGE, |
5883 | * ::CUDA_ERROR_INVALID_PTX, |
5884 | * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, |
5885 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
5886 | * ::CUDA_ERROR_NO_BINARY_FOR_GPU |
5887 | * |
5888 | * \sa ::cuLinkCreate, |
5889 | * ::cuLinkAddData, |
5890 | * ::cuLinkComplete, |
5891 | * ::cuLinkDestroy |
5892 | */ |
5893 | CUresult CUDAAPI |
5894 | cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, |
5895 | unsigned int numOptions, CUjit_option *options, void **optionValues); |
5896 | |
5897 | /** |
5898 | * \brief Complete a pending linker invocation |
5899 | * |
5900 | * Completes the pending linker action and returns the cubin image for the linked |
5901 | * device code, which can be used with ::cuModuleLoadData. The cubin is owned by |
5902 | * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. |
5903 | * This call does not destroy \p state. |
5904 | * |
5905 | * \param state A pending linker invocation |
5906 | * \param cubinOut On success, this will point to the output image |
5907 | * \param sizeOut Optional parameter to receive the size of the generated image |
5908 | * |
5909 | * \return |
5910 | * ::CUDA_SUCCESS, |
5911 | * ::CUDA_ERROR_INVALID_HANDLE, |
5912 | * ::CUDA_ERROR_OUT_OF_MEMORY |
5913 | * |
5914 | * \sa ::cuLinkCreate, |
5915 | * ::cuLinkAddData, |
5916 | * ::cuLinkAddFile, |
5917 | * ::cuLinkDestroy, |
5918 | * ::cuModuleLoadData |
5919 | */ |
5920 | CUresult CUDAAPI |
5921 | cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); |
5922 | |
5923 | /** |
5924 | * \brief Destroys state for a JIT linker invocation. |
5925 | * |
5926 | * \param state State object for the linker invocation |
5927 | * |
5928 | * \return |
5929 | * ::CUDA_SUCCESS, |
5930 | * ::CUDA_ERROR_INVALID_HANDLE |
5931 | * |
5932 | * \sa ::cuLinkCreate |
5933 | */ |
5934 | CUresult CUDAAPI |
5935 | cuLinkDestroy(CUlinkState state); |
5936 | |
5937 | /** @} */ /* END CUDA_MODULE */ |
5938 | |
5939 | |
5940 | /** |
5941 | * \defgroup CUDA_MEM Memory Management |
5942 | * |
5943 | * ___MANBRIEF___ memory management functions of the low-level CUDA driver API |
5944 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
5945 | * |
5946 | * This section describes the memory management functions of the low-level CUDA |
5947 | * driver application programming interface. |
5948 | * |
5949 | * @{ |
5950 | */ |
5951 | |
5952 | /** |
5953 | * \brief Gets free and total memory |
5954 | * |
5955 | * Returns in \p *free and \p *total respectively, the free and total amount of |
5956 | * memory available for allocation by the CUDA context, in bytes. |
5957 | * |
5958 | * \param free - Returned free memory in bytes |
5959 | * \param total - Returned total memory in bytes |
5960 | * |
5961 | * \return |
5962 | * ::CUDA_SUCCESS, |
5963 | * ::CUDA_ERROR_DEINITIALIZED, |
5964 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5965 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5966 | * ::CUDA_ERROR_INVALID_VALUE |
5967 | * \notefnerr |
5968 | * |
5969 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
5970 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
5971 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
5972 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
5973 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
5974 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
5975 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
5976 | * ::cuMemGetAddressRange, ::cuMemHostAlloc, |
5977 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
5978 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
5979 | * ::cudaMemGetInfo |
5980 | */ |
5981 | CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); |
5982 | |
5983 | /** |
5984 | * \brief Allocates device memory |
5985 | * |
5986 | * Allocates \p bytesize bytes of linear memory on the device and returns in |
5987 | * \p *dptr a pointer to the allocated memory. The allocated memory is suitably |
5988 | * aligned for any kind of variable. The memory is not cleared. If \p bytesize |
5989 | * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. |
5990 | * |
5991 | * \param dptr - Returned device pointer |
5992 | * \param bytesize - Requested allocation size in bytes |
5993 | * |
5994 | * \return |
5995 | * ::CUDA_SUCCESS, |
5996 | * ::CUDA_ERROR_DEINITIALIZED, |
5997 | * ::CUDA_ERROR_NOT_INITIALIZED, |
5998 | * ::CUDA_ERROR_INVALID_CONTEXT, |
5999 | * ::CUDA_ERROR_INVALID_VALUE, |
6000 | * ::CUDA_ERROR_OUT_OF_MEMORY |
6001 | * \notefnerr |
6002 | * |
6003 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6004 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, |
6005 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6006 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6007 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6008 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6009 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6010 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6011 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6012 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6013 | * ::cudaMalloc |
6014 | */ |
6015 | CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); |
6016 | |
6017 | /** |
6018 | * \brief Allocates pitched device memory |
6019 | * |
6020 | * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on |
6021 | * the device and returns in \p *dptr a pointer to the allocated memory. The |
6022 | * function may pad the allocation to ensure that corresponding pointers in |
6023 | * any given row will continue to meet the alignment requirements for |
6024 | * coalescing as the address is updated from row to row. \p ElementSizeBytes |
6025 | * specifies the size of the largest reads and writes that will be performed |
6026 | * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced |
6027 | * memory transactions are not possible on other data sizes). If |
6028 | * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, |
6029 | * the kernel will run correctly, but possibly at reduced speed. The pitch |
6030 | * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the |
6031 | * allocation. The intended usage of pitch is as a separate parameter of the |
6032 | * allocation, used to compute addresses within the 2D array. Given the row |
6033 | * and column of an array element of type \b T, the address is computed as: |
6034 | * \code |
6035 | T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; |
6036 | * \endcode |
6037 | * |
6038 | * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with |
6039 | * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is |
6040 | * recommended that programmers consider performing pitch allocations using |
6041 | * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is |
6042 | * especially true if the application will be performing 2D memory copies |
6043 | * between different regions of device memory (whether linear memory or CUDA |
6044 | * arrays). |
6045 | * |
6046 | * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed |
6047 | * to match or exceed the alignment requirement for texture binding with |
6048 | * ::cuTexRefSetAddress2D(). |
6049 | * |
6050 | * \param dptr - Returned device pointer |
6051 | * \param pPitch - Returned pitch of allocation in bytes |
6052 | * \param WidthInBytes - Requested allocation width in bytes |
6053 | * \param Height - Requested allocation height in rows |
6054 | * \param ElementSizeBytes - Size of largest reads/writes for range |
6055 | * |
6056 | * \return |
6057 | * ::CUDA_SUCCESS, |
6058 | * ::CUDA_ERROR_DEINITIALIZED, |
6059 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6060 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6061 | * ::CUDA_ERROR_INVALID_VALUE, |
6062 | * ::CUDA_ERROR_OUT_OF_MEMORY |
6063 | * \notefnerr |
6064 | * |
6065 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6066 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6067 | * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6068 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6069 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6070 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6071 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6072 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6073 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6074 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6075 | * ::cudaMallocPitch |
6076 | */ |
6077 | CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); |
6078 | |
6079 | /** |
6080 | * \brief Frees device memory |
6081 | * |
6082 | * Frees the memory space pointed to by \p dptr, which must have been returned |
6083 | * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). |
6084 | * |
6085 | * \param dptr - Pointer to memory to free |
6086 | * |
6087 | * \return |
6088 | * ::CUDA_SUCCESS, |
6089 | * ::CUDA_ERROR_DEINITIALIZED, |
6090 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6091 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6092 | * ::CUDA_ERROR_INVALID_VALUE |
6093 | * \notefnerr |
6094 | * |
6095 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6096 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6097 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6098 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6099 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6100 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6101 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, |
6102 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6103 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6104 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6105 | * ::cudaFree |
6106 | */ |
6107 | CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); |
6108 | |
6109 | /** |
6110 | * \brief Get information on memory allocations |
6111 | * |
6112 | * Returns the base address in \p *pbase and size in \p *psize of the |
6113 | * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input |
6114 | * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one |
6115 | * of them is NULL, it is ignored. |
6116 | * |
6117 | * \param pbase - Returned base address |
6118 | * \param psize - Returned size of device memory allocation |
6119 | * \param dptr - Device pointer to query |
6120 | * |
6121 | * \return |
6122 | * ::CUDA_SUCCESS, |
6123 | * ::CUDA_ERROR_DEINITIALIZED, |
6124 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6125 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6126 | * ::CUDA_ERROR_NOT_FOUND, |
6127 | * ::CUDA_ERROR_INVALID_VALUE |
6128 | * \notefnerr |
6129 | * |
6130 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6131 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6132 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6133 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6134 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6135 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6136 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6137 | * ::cuMemGetInfo, ::cuMemHostAlloc, |
6138 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6139 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 |
6140 | */ |
6141 | CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); |
6142 | |
6143 | /** |
6144 | * \brief Allocates page-locked host memory |
6145 | * |
6146 | * Allocates \p bytesize bytes of host memory that is page-locked and |
6147 | * accessible to the device. The driver tracks the virtual memory ranges |
6148 | * allocated with this function and automatically accelerates calls to |
6149 | * functions such as ::cuMemcpy(). Since the memory can be accessed directly by |
6150 | * the device, it can be read or written with much higher bandwidth than |
6151 | * pageable memory obtained with functions such as ::malloc(). Allocating |
6152 | * excessive amounts of memory with ::cuMemAllocHost() may degrade system |
6153 | * performance, since it reduces the amount of memory available to the system |
6154 | * for paging. As a result, this function is best used sparingly to allocate |
6155 | * staging areas for data exchange between host and device. |
6156 | * |
6157 | * Note all host memory allocated using ::cuMemAllocHost() will automatically |
6158 | * be immediately accessible to all contexts on all devices which support unified |
6159 | * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). |
6160 | * The device pointer that may be used to access this host memory from those |
6161 | * contexts is always equal to the returned host pointer \p *pp. |
6162 | * See \ref CUDA_UNIFIED for additional details. |
6163 | * |
6164 | * \param pp - Returned host pointer to page-locked memory |
6165 | * \param bytesize - Requested allocation size in bytes |
6166 | * |
6167 | * \return |
6168 | * ::CUDA_SUCCESS, |
6169 | * ::CUDA_ERROR_DEINITIALIZED, |
6170 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6171 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6172 | * ::CUDA_ERROR_INVALID_VALUE, |
6173 | * ::CUDA_ERROR_OUT_OF_MEMORY |
6174 | * \notefnerr |
6175 | * |
6176 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6177 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, |
6178 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6179 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6180 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6181 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6182 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6183 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6184 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6185 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6186 | * ::cudaMallocHost |
6187 | */ |
6188 | CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); |
6189 | |
6190 | /** |
6191 | * \brief Frees page-locked host memory |
6192 | * |
6193 | * Frees the memory space pointed to by \p p, which must have been returned by |
6194 | * a previous call to ::cuMemAllocHost(). |
6195 | * |
6196 | * \param p - Pointer to memory to free |
6197 | * |
6198 | * \return |
6199 | * ::CUDA_SUCCESS, |
6200 | * ::CUDA_ERROR_DEINITIALIZED, |
6201 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6202 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6203 | * ::CUDA_ERROR_INVALID_VALUE |
6204 | * \notefnerr |
6205 | * |
6206 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6207 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6208 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6209 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6210 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6211 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6212 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, |
6213 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6214 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6215 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6216 | * ::cudaFreeHost |
6217 | */ |
6218 | CUresult CUDAAPI cuMemFreeHost(void *p); |
6219 | |
6220 | /** |
6221 | * \brief Allocates page-locked host memory |
6222 | * |
6223 | * Allocates \p bytesize bytes of host memory that is page-locked and accessible |
6224 | * to the device. The driver tracks the virtual memory ranges allocated with |
6225 | * this function and automatically accelerates calls to functions such as |
6226 | * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, |
6227 | * it can be read or written with much higher bandwidth than pageable memory |
6228 | * obtained with functions such as ::malloc(). Allocating excessive amounts of |
6229 | * pinned memory may degrade system performance, since it reduces the amount |
6230 | * of memory available to the system for paging. As a result, this function is |
6231 | * best used sparingly to allocate staging areas for data exchange between |
6232 | * host and device. |
6233 | * |
6234 | * The \p Flags parameter enables different options to be specified that |
6235 | * affect the allocation, as follows. |
6236 | * |
6237 | * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be |
6238 | * considered as pinned memory by all CUDA contexts, not just the one that |
6239 | * performed the allocation. |
6240 | * |
6241 | * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address |
6242 | * space. The device pointer to the memory may be obtained by calling |
6243 | * ::cuMemHostGetDevicePointer(). |
6244 | * |
6245 | * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined |
6246 | * (WC). WC memory can be transferred across the PCI Express bus more |
6247 | * quickly on some system configurations, but cannot be read efficiently by |
6248 | * most CPUs. WC memory is a good option for buffers that will be written by |
6249 | * the CPU and read by the GPU via mapped pinned memory or host->device |
6250 | * transfers. |
6251 | * |
6252 | * All of these flags are orthogonal to one another: a developer may allocate |
6253 | * memory that is portable, mapped and/or write-combined with no restrictions. |
6254 | * |
6255 | * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for |
6256 | * devices that do not support mapped pinned memory. The failure is deferred |
6257 | * to ::cuMemHostGetDevicePointer() because the memory may be mapped into |
6258 | * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. |
6259 | * |
6260 | * The memory allocated by this function must be freed with ::cuMemFreeHost(). |
6261 | * |
6262 | * Note all host memory allocated using ::cuMemHostAlloc() will automatically |
6263 | * be immediately accessible to all contexts on all devices which support unified |
6264 | * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). |
6265 | * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer |
6266 | * that may be used to access this host memory from those contexts is always equal |
6267 | * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED |
6268 | * is specified, then the function ::cuMemHostGetDevicePointer() must be used |
6269 | * to query the device pointer, even if the context supports unified addressing. |
6270 | * See \ref CUDA_UNIFIED for additional details. |
6271 | * |
6272 | * \param pp - Returned host pointer to page-locked memory |
6273 | * \param bytesize - Requested allocation size in bytes |
6274 | * \param Flags - Flags for allocation request |
6275 | * |
6276 | * \return |
6277 | * ::CUDA_SUCCESS, |
6278 | * ::CUDA_ERROR_DEINITIALIZED, |
6279 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6280 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6281 | * ::CUDA_ERROR_INVALID_VALUE, |
6282 | * ::CUDA_ERROR_OUT_OF_MEMORY |
6283 | * \notefnerr |
6284 | * |
6285 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6286 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6287 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6288 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6289 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6290 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6291 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6292 | * ::cuMemGetAddressRange, ::cuMemGetInfo, |
6293 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6294 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6295 | * ::cudaHostAlloc |
6296 | */ |
6297 | CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); |
6298 | |
6299 | /** |
6300 | * \brief Passes back device pointer of mapped pinned memory |
6301 | * |
6302 | * Passes back the device pointer \p pdptr corresponding to the mapped, pinned |
6303 | * host buffer \p p allocated by ::cuMemHostAlloc. |
6304 | * |
6305 | * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP |
6306 | * flag was not specified at the time the memory was allocated, or if the |
6307 | * function is called on a GPU that does not support mapped pinned memory. |
6308 | * |
6309 | * For devices that have a non-zero value for the device attribute |
6310 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory |
6311 | * can also be accessed from the device using the host pointer \p p. |
6312 | * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not |
6313 | * match the original host pointer \p p and depends on the devices visible to the |
6314 | * application. If all devices visible to the application have a non-zero value for the |
6315 | * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() |
6316 | * will match the original pointer \p p. If any device visible to the application |
6317 | * has a zero value for the device attribute, the device pointer returned by |
6318 | * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, |
6319 | * but it will be suitable for use on all devices provided Unified Virtual Addressing |
6320 | * is enabled. In such systems, it is valid to access the memory using either pointer |
6321 | * on devices that have a non-zero value for the device attribute. Note however that |
6322 | * such devices should access the memory using only one of the two pointers and not both. |
6323 | * |
6324 | * \p Flags is reserved for future releases. For now, it must be set to 0. |
6325 | * |
6326 | * \param pdptr - Returned device pointer |
6327 | * \param p - Host pointer |
6328 | * \param Flags - Options (must be 0) |
6329 | * |
6330 | * \return |
6331 | * ::CUDA_SUCCESS, |
6332 | * ::CUDA_ERROR_DEINITIALIZED, |
6333 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6334 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6335 | * ::CUDA_ERROR_INVALID_VALUE |
6336 | * \notefnerr |
6337 | * |
6338 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6339 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6340 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6341 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6342 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6343 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6344 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6345 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6346 | * ::cuMemsetD2D8, ::cuMemsetD2D16, |
6347 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6348 | * ::cudaHostGetDevicePointer |
6349 | */ |
6350 | CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); |
6351 | |
6352 | /** |
6353 | * \brief Passes back flags that were used for a pinned allocation |
6354 | * |
6355 | * Passes back the flags \p pFlags that were specified when allocating |
6356 | * the pinned host buffer \p p allocated by ::cuMemHostAlloc. |
6357 | * |
6358 | * ::cuMemHostGetFlags() will fail if the pointer does not reside in |
6359 | * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). |
6360 | * |
6361 | * \param pFlags - Returned flags word |
6362 | * \param p - Host pointer |
6363 | * |
6364 | * \return |
6365 | * ::CUDA_SUCCESS, |
6366 | * ::CUDA_ERROR_DEINITIALIZED, |
6367 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6368 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6369 | * ::CUDA_ERROR_INVALID_VALUE |
6370 | * \notefnerr |
6371 | * |
6372 | * \sa |
6373 | * ::cuMemAllocHost, |
6374 | * ::cuMemHostAlloc, |
6375 | * ::cudaHostGetFlags |
6376 | */ |
6377 | CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); |
6378 | |
6379 | /** |
6380 | * \brief Allocates memory that will be automatically managed by the Unified Memory system |
6381 | * |
6382 | * Allocates \p bytesize bytes of managed memory on the device and returns in |
6383 | * \p *dptr a pointer to the allocated memory. If the device doesn't support |
6384 | * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support |
6385 | * for managed memory can be queried using the device attribute |
6386 | * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably |
6387 | * aligned for any kind of variable. The memory is not cleared. If \p bytesize |
6388 | * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer |
6389 | * is valid on the CPU and on all GPUs in the system that support managed memory. |
6390 | * All accesses to this pointer must obey the Unified Memory programming model. |
6391 | * |
6392 | * \p flags specifies the default stream association for this allocation. |
6393 | * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If |
6394 | * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from |
6395 | * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the |
6396 | * allocation should not be accessed from devices that have a zero value for the |
6397 | * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to |
6398 | * ::cuStreamAttachMemAsync will be required to enable access on such devices. |
6399 | * |
6400 | * If the association is later changed via ::cuStreamAttachMemAsync to |
6401 | * a single stream, the default association as specified during ::cuMemAllocManaged |
6402 | * is restored when that stream is destroyed. For __managed__ variables, the |
6403 | * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a |
6404 | * stream is an asynchronous operation, and as a result, the change to default |
6405 | * association won't happen until all work in the stream has completed. |
6406 | * |
6407 | * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. |
6408 | * |
6409 | * Device memory oversubscription is possible for GPUs that have a non-zero value for the |
6410 | * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on |
6411 | * such GPUs may be evicted from device memory to host memory at any time by the Unified |
6412 | * Memory driver in order to make room for other allocations. |
6413 | * |
6414 | * In a multi-GPU system where all GPUs have a non-zero value for the device attribute |
6415 | * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this |
6416 | * API returns and instead may be populated on access. In such systems, managed memory can |
6417 | * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to |
6418 | * maintain data locality and prevent excessive page faults to the extent possible. The application |
6419 | * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application |
6420 | * can also explicitly migrate memory to a desired processor's memory via |
6421 | * ::cuMemPrefetchAsync. |
6422 | * |
6423 | * In a multi-GPU system where all of the GPUs have a zero value for the device attribute |
6424 | * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support |
6425 | * with each other, the physical storage for managed memory is created on the GPU which is active |
6426 | * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced |
6427 | * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate |
6428 | * memory among such GPUs. |
6429 | * |
6430 | * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and |
6431 | * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS |
6432 | * is zero for at least one of those GPUs, the location chosen for physical storage of managed |
6433 | * memory is system-dependent. |
6434 | * - On Linux, the location chosen will be device memory as long as the current set of active |
6435 | * contexts are on devices that either have peer-to-peer support with each other or have a |
6436 | * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
6437 | * If there is an active context on a GPU that does not have a non-zero value for that device |
6438 | * attribute and it does not have peer-to-peer support with the other devices that have active |
6439 | * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. |
6440 | * Note that this means that managed memory that is located in device memory is migrated to |
6441 | * host memory if a new context is created on a GPU that doesn't have a non-zero value for |
6442 | * the device attribute and does not support peer-to-peer with at least one of the other devices |
6443 | * that has an active context. This in turn implies that context creation may fail if there is |
6444 | * insufficient host memory to migrate all managed allocations. |
6445 | * - On Windows, the physical storage is always created in 'zero-copy' or host memory. |
6446 | * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these |
6447 | * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to |
6448 | * restrict CUDA to only use those GPUs that have peer-to-peer support. |
6449 | * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a |
6450 | * non-zero value to force the driver to always use device memory for physical storage. |
6451 | * When this environment variable is set to a non-zero value, all contexts created in |
6452 | * that process on devices that support managed memory have to be peer-to-peer compatible |
6453 | * with each other. Context creation will fail if a context is created on a device that |
6454 | * supports managed memory and is not peer-to-peer compatible with any of the other |
6455 | * managed memory supporting devices on which contexts were previously created, even if |
6456 | * those contexts have been destroyed. These environment variables are described |
6457 | * in the CUDA programming guide under the "CUDA environment variables" section. |
6458 | * - On ARM, managed memory is not available on discrete GPUs with Drive PX-2. |
6459 | * |
6460 | * \param dptr - Returned device pointer |
6461 | * \param bytesize - Requested allocation size in bytes |
6462 | * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST |
6463 | * |
6464 | * \return |
6465 | * ::CUDA_SUCCESS, |
6466 | * ::CUDA_ERROR_DEINITIALIZED, |
6467 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6468 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6469 | * ::CUDA_ERROR_NOT_SUPPORTED, |
6470 | * ::CUDA_ERROR_INVALID_VALUE, |
6471 | * ::CUDA_ERROR_OUT_OF_MEMORY |
6472 | * \notefnerr |
6473 | * |
6474 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6475 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, |
6476 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6477 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6478 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6479 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6480 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6481 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6482 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6483 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6484 | * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, |
6485 | * ::cudaMallocManaged |
6486 | */ |
6487 | CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); |
6488 | |
6489 | /** |
6490 | * \brief Returns a handle to a compute device |
6491 | * |
6492 | * Returns in \p *dev a device handle given a PCI bus ID string. |
6493 | * |
6494 | * \param dev - Returned device handle |
6495 | * |
6496 | * \param pciBusId - String in one of the following forms: |
6497 | * [domain]:[bus]:[device].[function] |
6498 | * [domain]:[bus]:[device] |
6499 | * [bus]:[device].[function] |
6500 | * where \p domain, \p bus, \p device, and \p function are all hexadecimal values |
6501 | * |
6502 | * \return |
6503 | * ::CUDA_SUCCESS, |
6504 | * ::CUDA_ERROR_DEINITIALIZED, |
6505 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6506 | * ::CUDA_ERROR_INVALID_VALUE, |
6507 | * ::CUDA_ERROR_INVALID_DEVICE |
6508 | * \notefnerr |
6509 | * |
6510 | * \sa |
6511 | * ::cuDeviceGet, |
6512 | * ::cuDeviceGetAttribute, |
6513 | * ::cuDeviceGetPCIBusId, |
6514 | * ::cudaDeviceGetByPCIBusId |
6515 | */ |
6516 | CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); |
6517 | |
6518 | /** |
6519 | * \brief Returns a PCI Bus Id string for the device |
6520 | * |
6521 | * Returns an ASCII string identifying the device \p dev in the NULL-terminated |
6522 | * string pointed to by \p pciBusId. \p len specifies the maximum length of the |
6523 | * string that may be returned. |
6524 | * |
6525 | * \param pciBusId - Returned identifier string for the device in the following format |
6526 | * [domain]:[bus]:[device].[function] |
6527 | * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. |
6528 | * pciBusId should be large enough to store 13 characters including the NULL-terminator. |
6529 | * |
6530 | * \param len - Maximum length of string to store in \p pciBusId |
6531 | * |
6532 | * \param dev - Device to get identifier string for |
6533 | * |
6534 | * \return |
6535 | * ::CUDA_SUCCESS, |
6536 | * ::CUDA_ERROR_DEINITIALIZED, |
6537 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6538 | * ::CUDA_ERROR_INVALID_VALUE, |
6539 | * ::CUDA_ERROR_INVALID_DEVICE |
6540 | * \notefnerr |
6541 | * |
6542 | * \sa |
6543 | * ::cuDeviceGet, |
6544 | * ::cuDeviceGetAttribute, |
6545 | * ::cuDeviceGetByPCIBusId, |
6546 | * ::cudaDeviceGetPCIBusId |
6547 | */ |
6548 | CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); |
6549 | |
6550 | /** |
6551 | * \brief Gets an interprocess handle for a previously allocated event |
6552 | * |
6553 | * Takes as input a previously allocated event. This event must have been |
6554 | * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING |
6555 | * flags set. This opaque handle may be copied into other processes and |
6556 | * opened with ::cuIpcOpenEventHandle to allow efficient hardware |
6557 | * synchronization between GPU work in different processes. |
6558 | * |
6559 | * After the event has been opened in the importing process, |
6560 | * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and |
6561 | * ::cuEventQuery may be used in either process. Performing operations |
6562 | * on the imported event after the exported event has been freed |
6563 | * with ::cuEventDestroy will result in undefined behavior. |
6564 | * |
6565 | * IPC functionality is restricted to devices with support for unified |
6566 | * addressing on Linux and Windows operating systems. |
6567 | * IPC functionality on Windows is restricted to GPUs in TCC mode. |
6568 | * |
6569 | * \param pHandle - Pointer to a user allocated CUipcEventHandle |
6570 | * in which to return the opaque event handle |
6571 | * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and |
6572 | * ::CU_EVENT_DISABLE_TIMING flags. |
6573 | * |
6574 | * \return |
6575 | * ::CUDA_SUCCESS, |
6576 | * ::CUDA_ERROR_INVALID_HANDLE, |
6577 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
6578 | * ::CUDA_ERROR_MAP_FAILED, |
6579 | * ::CUDA_ERROR_INVALID_VALUE |
6580 | * |
6581 | * \sa |
6582 | * ::cuEventCreate, |
6583 | * ::cuEventDestroy, |
6584 | * ::cuEventSynchronize, |
6585 | * ::cuEventQuery, |
6586 | * ::cuStreamWaitEvent, |
6587 | * ::cuIpcOpenEventHandle, |
6588 | * ::cuIpcGetMemHandle, |
6589 | * ::cuIpcOpenMemHandle, |
6590 | * ::cuIpcCloseMemHandle, |
6591 | * ::cudaIpcGetEventHandle |
6592 | */ |
6593 | CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); |
6594 | |
6595 | /** |
6596 | * \brief Opens an interprocess event handle for use in the current process |
6597 | * |
6598 | * Opens an interprocess event handle exported from another process with |
6599 | * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like |
6600 | * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. |
6601 | * This event must be freed with ::cuEventDestroy. |
6602 | * |
6603 | * Performing operations on the imported event after the exported event has |
6604 | * been freed with ::cuEventDestroy will result in undefined behavior. |
6605 | * |
6606 | * IPC functionality is restricted to devices with support for unified |
6607 | * addressing on Linux and Windows operating systems. |
6608 | * IPC functionality on Windows is restricted to GPUs in TCC mode. |
6609 | * |
6610 | * \param phEvent - Returns the imported event |
6611 | * \param handle - Interprocess handle to open |
6612 | * |
6613 | * \returns |
6614 | * ::CUDA_SUCCESS, |
6615 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6616 | * ::CUDA_ERROR_MAP_FAILED, |
6617 | * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, |
6618 | * ::CUDA_ERROR_INVALID_HANDLE, |
6619 | * ::CUDA_ERROR_INVALID_VALUE |
6620 | * |
6621 | * \sa |
6622 | * ::cuEventCreate, |
6623 | * ::cuEventDestroy, |
6624 | * ::cuEventSynchronize, |
6625 | * ::cuEventQuery, |
6626 | * ::cuStreamWaitEvent, |
6627 | * ::cuIpcGetEventHandle, |
6628 | * ::cuIpcGetMemHandle, |
6629 | * ::cuIpcOpenMemHandle, |
6630 | * ::cuIpcCloseMemHandle, |
6631 | * ::cudaIpcOpenEventHandle |
6632 | */ |
6633 | CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); |
6634 | |
6635 | /** |
6636 | * \brief Gets an interprocess memory handle for an existing device memory |
6637 | * allocation |
6638 | * |
6639 | * Takes a pointer to the base of an existing device memory allocation created |
6640 | * with ::cuMemAlloc and exports it for use in another process. This is a |
6641 | * lightweight operation and may be called multiple times on an allocation |
6642 | * without adverse effects. |
6643 | * |
6644 | * If a region of memory is freed with ::cuMemFree and a subsequent call |
6645 | * to ::cuMemAlloc returns memory with the same device address, |
6646 | * ::cuIpcGetMemHandle will return a unique handle for the |
6647 | * new memory. |
6648 | * |
6649 | * IPC functionality is restricted to devices with support for unified |
6650 | * addressing on Linux and Windows operating systems. |
6651 | * IPC functionality on Windows is restricted to GPUs in TCC mode. |
6652 | * |
6653 | * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return |
6654 | * the handle in. |
6655 | * \param dptr - Base pointer to previously allocated device memory |
6656 | * |
6657 | * \returns |
6658 | * ::CUDA_SUCCESS, |
6659 | * ::CUDA_ERROR_INVALID_HANDLE, |
6660 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6661 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
6662 | * ::CUDA_ERROR_MAP_FAILED, |
6663 | * ::CUDA_ERROR_INVALID_VALUE |
6664 | * |
6665 | * \sa |
6666 | * ::cuMemAlloc, |
6667 | * ::cuMemFree, |
6668 | * ::cuIpcGetEventHandle, |
6669 | * ::cuIpcOpenEventHandle, |
6670 | * ::cuIpcOpenMemHandle, |
6671 | * ::cuIpcCloseMemHandle, |
6672 | * ::cudaIpcGetMemHandle |
6673 | */ |
6674 | CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); |
6675 | |
6676 | /** |
6677 | * \brief Opens an interprocess memory handle exported from another process |
6678 | * and returns a device pointer usable in the local process. |
6679 | * |
6680 | * Maps memory exported from another process with ::cuIpcGetMemHandle into |
6681 | * the current device address space. For contexts on different devices |
6682 | * ::cuIpcOpenMemHandle can attempt to enable peer access between the |
6683 | * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is |
6684 | * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. |
6685 | * ::cuDeviceCanAccessPeer can determine if a mapping is possible. |
6686 | * |
6687 | * Contexts that may open ::CUipcMemHandles are restricted in the following way. |
6688 | * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened |
6689 | * by one ::CUcontext per ::CUdevice per other process. |
6690 | * |
6691 | * If the memory handle has already been opened by the current context, the |
6692 | * reference count on the handle is incremented by 1 and the existing device pointer |
6693 | * is returned. |
6694 | * |
6695 | * Memory returned from ::cuIpcOpenMemHandle must be freed with |
6696 | * ::cuIpcCloseMemHandle. |
6697 | * |
6698 | * Calling ::cuMemFree on an exported memory region before calling |
6699 | * ::cuIpcCloseMemHandle in the importing context will result in undefined |
6700 | * behavior. |
6701 | * |
6702 | * IPC functionality is restricted to devices with support for unified |
6703 | * addressing on Linux and Windows operating systems. |
6704 | * IPC functionality on Windows is restricted to GPUs in TCC mode. |
6705 | * |
6706 | * \param pdptr - Returned device pointer |
6707 | * \param handle - ::CUipcMemHandle to open |
6708 | * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS |
6709 | * |
6710 | * \returns |
6711 | * ::CUDA_SUCCESS, |
6712 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6713 | * ::CUDA_ERROR_MAP_FAILED, |
6714 | * ::CUDA_ERROR_INVALID_HANDLE, |
6715 | * ::CUDA_ERROR_TOO_MANY_PEERS, |
6716 | * ::CUDA_ERROR_INVALID_VALUE |
6717 | * |
6718 | * \note No guarantees are made about the address returned in \p *pdptr. |
6719 | * In particular, multiple processes may not receive the same address for the same \p handle. |
6720 | * |
6721 | * \sa |
6722 | * ::cuMemAlloc, |
6723 | * ::cuMemFree, |
6724 | * ::cuIpcGetEventHandle, |
6725 | * ::cuIpcOpenEventHandle, |
6726 | * ::cuIpcGetMemHandle, |
6727 | * ::cuIpcCloseMemHandle, |
6728 | * ::cuCtxEnablePeerAccess, |
6729 | * ::cuDeviceCanAccessPeer, |
6730 | * ::cudaIpcOpenMemHandle |
6731 | */ |
6732 | CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); |
6733 | |
6734 | /** |
6735 | * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle |
6736 | * |
6737 | * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. |
6738 | * When the reference count reaches 0, this API unmaps the memory. The original allocation |
6739 | * in the exporting process as well as imported mappings in other processes |
6740 | * will be unaffected. |
6741 | * |
6742 | * Any resources used to enable peer access will be freed if this is the |
6743 | * last mapping using them. |
6744 | * |
6745 | * IPC functionality is restricted to devices with support for unified |
6746 | * addressing on Linux and Windows operating systems. |
6747 | * IPC functionality on Windows is restricted to GPUs in TCC mode. |
6748 | * |
6749 | * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle |
6750 | * |
6751 | * \returns |
6752 | * ::CUDA_SUCCESS, |
6753 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6754 | * ::CUDA_ERROR_MAP_FAILED, |
6755 | * ::CUDA_ERROR_INVALID_HANDLE, |
6756 | * ::CUDA_ERROR_INVALID_VALUE |
6757 | * \sa |
6758 | * ::cuMemAlloc, |
6759 | * ::cuMemFree, |
6760 | * ::cuIpcGetEventHandle, |
6761 | * ::cuIpcOpenEventHandle, |
6762 | * ::cuIpcGetMemHandle, |
6763 | * ::cuIpcOpenMemHandle, |
6764 | * ::cudaIpcCloseMemHandle |
6765 | */ |
6766 | CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); |
6767 | |
6768 | /** |
6769 | * \brief Registers an existing host memory range for use by CUDA |
6770 | * |
6771 | * Page-locks the memory range specified by \p p and \p bytesize and maps it |
6772 | * for the device(s) as specified by \p Flags. This memory range also is added |
6773 | * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate |
6774 | * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed |
6775 | * directly by the device, it can be read or written with much higher bandwidth |
6776 | * than pageable memory that has not been registered. Page-locking excessive |
6777 | * amounts of memory may degrade system performance, since it reduces the amount |
6778 | * of memory available to the system for paging. As a result, this function is |
6779 | * best used sparingly to register staging areas for data exchange between |
6780 | * host and device. |
6781 | * |
6782 | * This function has limited support on Mac OS X. OS 10.7 or higher is required. |
6783 | * |
6784 | * The \p Flags parameter enables different options to be specified that |
6785 | * affect the allocation, as follows. |
6786 | * |
6787 | * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be |
6788 | * considered as pinned memory by all CUDA contexts, not just the one that |
6789 | * performed the allocation. |
6790 | * |
6791 | * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address |
6792 | * space. The device pointer to the memory may be obtained by calling |
6793 | * ::cuMemHostGetDevicePointer(). |
6794 | * |
6795 | * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some |
6796 | * I/O memory space, e.g. the PCI Express resource of a 3rd party device. |
6797 | * |
6798 | * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory |
6799 | * that is considered read-only by the device. On platforms without |
6800 | * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is |
6801 | * required in order to register memory mapped to the CPU as read-only. Support |
6802 | * for the use of this flag can be queried from the device attribute |
6803 | * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with |
6804 | * a current context associated with a device that does not have this attribute |
6805 | * set will cause ::cuMemHostRegister to fail with ::CUDA_ERROR_NOT_SUPPORTED. |
6806 | * |
6807 | * All of these flags are orthogonal to one another: a developer may page-lock |
6808 | * memory that is portable or mapped with no restrictions. |
6809 | * |
6810 | * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for |
6811 | * devices that do not support mapped pinned memory. The failure is deferred |
6812 | * to ::cuMemHostGetDevicePointer() because the memory may be mapped into |
6813 | * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. |
6814 | * |
6815 | * For devices that have a non-zero value for the device attribute |
6816 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory |
6817 | * can also be accessed from the device using the host pointer \p p. |
6818 | * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not |
6819 | * match the original host pointer \p p and depends on the devices visible to the |
6820 | * application. If all devices visible to the application have a non-zero value for the |
6821 | * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() |
6822 | * will match the original pointer \p p. If any device visible to the application |
6823 | * has a zero value for the device attribute, the device pointer returned by |
6824 | * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, |
6825 | * but it will be suitable for use on all devices provided Unified Virtual Addressing |
6826 | * is enabled. In such systems, it is valid to access the memory using either pointer |
6827 | * on devices that have a non-zero value for the device attribute. Note however that |
6828 | * such devices should access the memory using only one of the two pointers and not both. |
6829 | * |
6830 | * The memory page-locked by this function must be unregistered with |
6831 | * ::cuMemHostUnregister(). |
6832 | * |
6833 | * \param p - Host pointer to memory to page-lock |
6834 | * \param bytesize - Size in bytes of the address range to page-lock |
6835 | * \param Flags - Flags for allocation request |
6836 | * |
6837 | * \return |
6838 | * ::CUDA_SUCCESS, |
6839 | * ::CUDA_ERROR_DEINITIALIZED, |
6840 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6841 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6842 | * ::CUDA_ERROR_INVALID_VALUE, |
6843 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
6844 | * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, |
6845 | * ::CUDA_ERROR_NOT_PERMITTED, |
6846 | * ::CUDA_ERROR_NOT_SUPPORTED |
6847 | * \notefnerr |
6848 | * |
6849 | * \sa |
6850 | * ::cuMemHostUnregister, |
6851 | * ::cuMemHostGetFlags, |
6852 | * ::cuMemHostGetDevicePointer, |
6853 | * ::cudaHostRegister |
6854 | */ |
6855 | CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); |
6856 | |
6857 | /** |
6858 | * \brief Unregisters a memory range that was registered with cuMemHostRegister. |
6859 | * |
6860 | * Unmaps the memory range whose base address is specified by \p p, and makes |
6861 | * it pageable again. |
6862 | * |
6863 | * The base address must be the same one specified to ::cuMemHostRegister(). |
6864 | * |
6865 | * \param p - Host pointer to memory to unregister |
6866 | * |
6867 | * \return |
6868 | * ::CUDA_SUCCESS, |
6869 | * ::CUDA_ERROR_DEINITIALIZED, |
6870 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6871 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6872 | * ::CUDA_ERROR_INVALID_VALUE, |
6873 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
6874 | * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED |
6875 | * \notefnerr |
6876 | * |
6877 | * \sa |
6878 | * ::cuMemHostRegister, |
6879 | * ::cudaHostUnregister |
6880 | */ |
6881 | CUresult CUDAAPI cuMemHostUnregister(void *p); |
6882 | |
6883 | /** |
6884 | * \brief Copies memory |
6885 | * |
6886 | * Copies data between two pointers. |
6887 | * \p dst and \p src are base pointers of the destination and source, respectively. |
6888 | * \p ByteCount specifies the number of bytes to copy. |
6889 | * Note that this function infers the type of the transfer (host to host, host to |
6890 | * device, device to device, or device to host) from the pointer values. This |
6891 | * function is only allowed in contexts which support unified addressing. |
6892 | * |
6893 | * \param dst - Destination unified virtual address space pointer |
6894 | * \param src - Source unified virtual address space pointer |
6895 | * \param ByteCount - Size of memory copy in bytes |
6896 | * |
6897 | * \return |
6898 | * ::CUDA_SUCCESS, |
6899 | * ::CUDA_ERROR_DEINITIALIZED, |
6900 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6901 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6902 | * ::CUDA_ERROR_INVALID_VALUE |
6903 | * \notefnerr |
6904 | * \note_sync |
6905 | * \note_memcpy |
6906 | * |
6907 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6908 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6909 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6910 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6911 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, |
6912 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6913 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6914 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6915 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6916 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6917 | * ::cudaMemcpy, |
6918 | * ::cudaMemcpyToSymbol, |
6919 | * ::cudaMemcpyFromSymbol |
6920 | */ |
6921 | CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); |
6922 | |
6923 | /** |
6924 | * \brief Copies device memory between two contexts |
6925 | * |
6926 | * Copies from device memory in one context to device memory in another |
6927 | * context. \p dstDevice is the base device pointer of the destination memory |
6928 | * and \p dstContext is the destination context. \p srcDevice is the base |
6929 | * device pointer of the source memory and \p srcContext is the source context. |
6930 | * \p ByteCount specifies the number of bytes to copy. |
6931 | * |
6932 | * \param dstDevice - Destination device pointer |
6933 | * \param dstContext - Destination context |
6934 | * \param srcDevice - Source device pointer |
6935 | * \param srcContext - Source context |
6936 | * \param ByteCount - Size of memory copy in bytes |
6937 | * |
6938 | * \return |
6939 | * ::CUDA_SUCCESS, |
6940 | * ::CUDA_ERROR_DEINITIALIZED, |
6941 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6942 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6943 | * ::CUDA_ERROR_INVALID_VALUE |
6944 | * \notefnerr |
6945 | * \note_sync |
6946 | * |
6947 | * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
6948 | * ::cuMemcpy3DPeerAsync, |
6949 | * ::cudaMemcpyPeer |
6950 | */ |
6951 | CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); |
6952 | |
6953 | /** |
6954 | * \brief Copies memory from Host to Device |
6955 | * |
6956 | * Copies from host memory to device memory. \p dstDevice and \p srcHost are |
6957 | * the base addresses of the destination and source, respectively. \p ByteCount |
6958 | * specifies the number of bytes to copy. |
6959 | * |
6960 | * \param dstDevice - Destination device pointer |
6961 | * \param srcHost - Source host pointer |
6962 | * \param ByteCount - Size of memory copy in bytes |
6963 | * |
6964 | * \return |
6965 | * ::CUDA_SUCCESS, |
6966 | * ::CUDA_ERROR_DEINITIALIZED, |
6967 | * ::CUDA_ERROR_NOT_INITIALIZED, |
6968 | * ::CUDA_ERROR_INVALID_CONTEXT, |
6969 | * ::CUDA_ERROR_INVALID_VALUE |
6970 | * \notefnerr |
6971 | * \note_sync |
6972 | * \note_memcpy |
6973 | * |
6974 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
6975 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
6976 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
6977 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
6978 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
6979 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
6980 | * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
6981 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
6982 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
6983 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
6984 | * ::cudaMemcpy, |
6985 | * ::cudaMemcpyToSymbol |
6986 | */ |
6987 | CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); |
6988 | |
6989 | /** |
6990 | * \brief Copies memory from Device to Host |
6991 | * |
6992 | * Copies from device to host memory. \p dstHost and \p srcDevice specify the |
6993 | * base pointers of the destination and source, respectively. \p ByteCount |
6994 | * specifies the number of bytes to copy. |
6995 | * |
6996 | * \param dstHost - Destination host pointer |
6997 | * \param srcDevice - Source device pointer |
6998 | * \param ByteCount - Size of memory copy in bytes |
6999 | * |
7000 | * \return |
7001 | * ::CUDA_SUCCESS, |
7002 | * ::CUDA_ERROR_DEINITIALIZED, |
7003 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7004 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7005 | * ::CUDA_ERROR_INVALID_VALUE |
7006 | * \notefnerr |
7007 | * \note_sync |
7008 | * \note_memcpy |
7009 | * |
7010 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7011 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7012 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7013 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7014 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7015 | * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7016 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7017 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7018 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7019 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7020 | * ::cudaMemcpy, |
7021 | * ::cudaMemcpyFromSymbol |
7022 | */ |
7023 | CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); |
7024 | |
7025 | /** |
7026 | * \brief Copies memory from Device to Device |
7027 | * |
7028 | * Copies from device memory to device memory. \p dstDevice and \p srcDevice |
7029 | * are the base pointers of the destination and source, respectively. |
7030 | * \p ByteCount specifies the number of bytes to copy. |
7031 | * |
7032 | * \param dstDevice - Destination device pointer |
7033 | * \param srcDevice - Source device pointer |
7034 | * \param ByteCount - Size of memory copy in bytes |
7035 | * |
7036 | * \return |
7037 | * ::CUDA_SUCCESS, |
7038 | * ::CUDA_ERROR_DEINITIALIZED, |
7039 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7040 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7041 | * ::CUDA_ERROR_INVALID_VALUE |
7042 | * \notefnerr |
7043 | * \note_sync |
7044 | * |
7045 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7046 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7047 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7048 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7049 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, |
7050 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7051 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7052 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7053 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7054 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7055 | * ::cudaMemcpy, |
7056 | * ::cudaMemcpyToSymbol, |
7057 | * ::cudaMemcpyFromSymbol |
7058 | */ |
7059 | CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); |
7060 | |
7061 | /** |
7062 | * \brief Copies memory from Device to Array |
7063 | * |
7064 | * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset |
7065 | * specify the CUDA array handle and starting offset in bytes of the destination data. |
7066 | * \p srcDevice specifies the base pointer of the source. \p ByteCount |
7067 | * specifies the number of bytes to copy. |
7068 | * |
7069 | * \param dstArray - Destination array |
7070 | * \param dstOffset - Offset in bytes of destination array |
7071 | * \param srcDevice - Source device pointer |
7072 | * \param ByteCount - Size of memory copy in bytes |
7073 | * |
7074 | * \return |
7075 | * ::CUDA_SUCCESS, |
7076 | * ::CUDA_ERROR_DEINITIALIZED, |
7077 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7078 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7079 | * ::CUDA_ERROR_INVALID_VALUE |
7080 | * \notefnerr |
7081 | * \note_sync |
7082 | * |
7083 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7084 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7085 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7086 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7087 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7088 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7089 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7090 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7091 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7092 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7093 | * ::cudaMemcpyToArray |
7094 | */ |
7095 | CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); |
7096 | |
7097 | /** |
7098 | * \brief Copies memory from Array to Device |
7099 | * |
7100 | * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the |
7101 | * base pointer of the destination and must be naturally aligned with the CUDA |
7102 | * array elements. \p srcArray and \p srcOffset specify the CUDA array handle |
7103 | * and the offset in bytes into the array where the copy is to begin. |
7104 | * \p ByteCount specifies the number of bytes to copy and must be evenly |
7105 | * divisible by the array element size. |
7106 | * |
7107 | * \param dstDevice - Destination device pointer |
7108 | * \param srcArray - Source array |
7109 | * \param srcOffset - Offset in bytes of source array |
7110 | * \param ByteCount - Size of memory copy in bytes |
7111 | * |
7112 | * \return |
7113 | * ::CUDA_SUCCESS, |
7114 | * ::CUDA_ERROR_DEINITIALIZED, |
7115 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7116 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7117 | * ::CUDA_ERROR_INVALID_VALUE |
7118 | * \notefnerr |
7119 | * \note_sync |
7120 | * |
7121 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7122 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7123 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7124 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, |
7125 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7126 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7127 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7128 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7129 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7130 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7131 | * ::cudaMemcpyFromArray |
7132 | */ |
7133 | CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
7134 | |
7135 | /** |
7136 | * \brief Copies memory from Host to Array |
7137 | * |
7138 | * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset |
7139 | * specify the CUDA array handle and starting offset in bytes of the destination |
7140 | * data. \p srcHost specifies the base address of the source. \p ByteCount specifies |
7141 | * the number of bytes to copy. |
7142 | * |
7143 | * \param dstArray - Destination array |
7144 | * \param dstOffset - Offset in bytes of destination array |
7145 | * \param srcHost - Source host pointer |
7146 | * \param ByteCount - Size of memory copy in bytes |
7147 | * |
7148 | * \return |
7149 | * ::CUDA_SUCCESS, |
7150 | * ::CUDA_ERROR_DEINITIALIZED, |
7151 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7152 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7153 | * ::CUDA_ERROR_INVALID_VALUE |
7154 | * \notefnerr |
7155 | * \note_sync |
7156 | * \note_memcpy |
7157 | * |
7158 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7159 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7160 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7161 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7162 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7163 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, |
7164 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7165 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7166 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7167 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7168 | * ::cudaMemcpyToArray |
7169 | */ |
7170 | CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); |
7171 | |
7172 | /** |
7173 | * \brief Copies memory from Array to Host |
7174 | * |
7175 | * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base |
7176 | * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA |
7177 | * array handle and starting offset in bytes of the source data. |
7178 | * \p ByteCount specifies the number of bytes to copy. |
7179 | * |
7180 | * \param dstHost - Destination host pointer |
7181 | * \param srcArray - Source array |
7182 | * \param srcOffset - Offset in bytes of source array |
7183 | * \param ByteCount - Size of memory copy in bytes |
7184 | * |
7185 | * \return |
7186 | * ::CUDA_SUCCESS, |
7187 | * ::CUDA_ERROR_DEINITIALIZED, |
7188 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7189 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7190 | * ::CUDA_ERROR_INVALID_VALUE |
7191 | * \notefnerr |
7192 | * \note_sync |
7193 | * \note_memcpy |
7194 | * |
7195 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7196 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7197 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7198 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7199 | * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7200 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7201 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7202 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7203 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7204 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7205 | * ::cudaMemcpyFromArray |
7206 | */ |
7207 | CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
7208 | |
7209 | /** |
7210 | * \brief Copies memory from Array to Array |
7211 | * |
7212 | * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray |
7213 | * specify the handles of the destination and source CUDA arrays for the copy, |
7214 | * respectively. \p dstOffset and \p srcOffset specify the destination and |
7215 | * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of |
7216 | * bytes to be copied. The elements in the CUDA arrays need not be |
7217 | * the same format, but the elements must be the same size, and \p ByteCount must be |
7218 | * evenly divisible by that size. |
7219 | * |
7220 | * \param dstArray - Destination array |
7221 | * \param dstOffset - Offset in bytes of destination array |
7222 | * \param srcArray - Source array |
7223 | * \param srcOffset - Offset in bytes of source array |
7224 | * \param ByteCount - Size of memory copy in bytes |
7225 | * |
7226 | * \return |
7227 | * ::CUDA_SUCCESS, |
7228 | * ::CUDA_ERROR_DEINITIALIZED, |
7229 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7230 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7231 | * ::CUDA_ERROR_INVALID_VALUE |
7232 | * \notefnerr |
7233 | * \note_sync |
7234 | * |
7235 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7236 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7237 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7238 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, |
7239 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7240 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7241 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7242 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7243 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7244 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7245 | * ::cudaMemcpyArrayToArray |
7246 | */ |
7247 | CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
7248 | |
7249 | /** |
7250 | * \brief Copies memory for 2D arrays |
7251 | * |
7252 | * Perform a 2D memory copy according to the parameters specified in \p pCopy. |
7253 | * The ::CUDA_MEMCPY2D structure is defined as: |
7254 | * |
7255 | * \code |
7256 | typedef struct CUDA_MEMCPY2D_st { |
7257 | unsigned int srcXInBytes, srcY; |
7258 | CUmemorytype srcMemoryType; |
7259 | const void *srcHost; |
7260 | CUdeviceptr srcDevice; |
7261 | CUarray srcArray; |
7262 | unsigned int srcPitch; |
7263 | |
7264 | unsigned int dstXInBytes, dstY; |
7265 | CUmemorytype dstMemoryType; |
7266 | void *dstHost; |
7267 | CUdeviceptr dstDevice; |
7268 | CUarray dstArray; |
7269 | unsigned int dstPitch; |
7270 | |
7271 | unsigned int WidthInBytes; |
7272 | unsigned int Height; |
7273 | } CUDA_MEMCPY2D; |
7274 | * \endcode |
7275 | * where: |
7276 | * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
7277 | * source and destination, respectively; ::CUmemorytype_enum is defined as: |
7278 | * |
7279 | * \code |
7280 | typedef enum CUmemorytype_enum { |
7281 | CU_MEMORYTYPE_HOST = 0x01, |
7282 | CU_MEMORYTYPE_DEVICE = 0x02, |
7283 | CU_MEMORYTYPE_ARRAY = 0x03, |
7284 | CU_MEMORYTYPE_UNIFIED = 0x04 |
7285 | } CUmemorytype; |
7286 | * \endcode |
7287 | * |
7288 | * \par |
7289 | * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
7290 | * specify the (unified virtual address space) base address of the source data |
7291 | * and the bytes per row to apply. ::srcArray is ignored. |
7292 | * This value may be used only if unified addressing is supported in the calling |
7293 | * context. |
7294 | * |
7295 | * \par |
7296 | * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
7297 | * specify the (host) base address of the source data and the bytes per row to |
7298 | * apply. ::srcArray is ignored. |
7299 | * |
7300 | * \par |
7301 | * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
7302 | * specify the (device) base address of the source data and the bytes per row |
7303 | * to apply. ::srcArray is ignored. |
7304 | * |
7305 | * \par |
7306 | * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
7307 | * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
7308 | * ignored. |
7309 | * |
7310 | * \par |
7311 | * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
7312 | * specify the (host) base address of the destination data and the bytes per |
7313 | * row to apply. ::dstArray is ignored. |
7314 | * |
7315 | * \par |
7316 | * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
7317 | * specify the (unified virtual address space) base address of the destination data |
7318 | * and the bytes per row to apply. ::dstArray is ignored. |
7319 | * This value may be used only if unified addressing is supported in the calling |
7320 | * context. |
7321 | * |
7322 | * \par |
7323 | * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
7324 | * specify the (device) base address of the destination data and the bytes per |
7325 | * row to apply. ::dstArray is ignored. |
7326 | * |
7327 | * \par |
7328 | * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
7329 | * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
7330 | * ignored. |
7331 | * |
7332 | * - ::srcXInBytes and ::srcY specify the base address of the source data for |
7333 | * the copy. |
7334 | * |
7335 | * \par |
7336 | * For host pointers, the starting address is |
7337 | * \code |
7338 | void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
7339 | * \endcode |
7340 | * |
7341 | * \par |
7342 | * For device pointers, the starting address is |
7343 | * \code |
7344 | CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
7345 | * \endcode |
7346 | * |
7347 | * \par |
7348 | * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
7349 | * element size. |
7350 | * |
7351 | * - ::dstXInBytes and ::dstY specify the base address of the destination data |
7352 | * for the copy. |
7353 | * |
7354 | * \par |
7355 | * For host pointers, the base address is |
7356 | * \code |
7357 | void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
7358 | * \endcode |
7359 | * |
7360 | * \par |
7361 | * For device pointers, the starting address is |
7362 | * \code |
7363 | CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
7364 | * \endcode |
7365 | * |
7366 | * \par |
7367 | * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
7368 | * element size. |
7369 | * |
7370 | * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
7371 | * the 2D copy being performed. |
7372 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
7373 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
7374 | * ::WidthInBytes + ::dstXInBytes. |
7375 | * |
7376 | * \par |
7377 | * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum |
7378 | * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
7379 | * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
7380 | * (device to device, CUDA array to device, CUDA array to CUDA array), |
7381 | * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). |
7382 | * ::cuMemcpy2DUnaligned() does not have this restriction, but may run |
7383 | * significantly slower in the cases where ::cuMemcpy2D() would have returned |
7384 | * an error code. |
7385 | * |
7386 | * \param pCopy - Parameters for the memory copy |
7387 | * |
7388 | * \return |
7389 | * ::CUDA_SUCCESS, |
7390 | * ::CUDA_ERROR_DEINITIALIZED, |
7391 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7392 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7393 | * ::CUDA_ERROR_INVALID_VALUE |
7394 | * \notefnerr |
7395 | * \note_sync |
7396 | * |
7397 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7398 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7399 | * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7400 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7401 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7402 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7403 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7404 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7405 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7406 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7407 | * ::cudaMemcpy2D, |
7408 | * ::cudaMemcpy2DToArray, |
7409 | * ::cudaMemcpy2DFromArray |
7410 | */ |
7411 | CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); |
7412 | |
7413 | /** |
7414 | * \brief Copies memory for 2D arrays |
7415 | * |
7416 | * Perform a 2D memory copy according to the parameters specified in \p pCopy. |
7417 | * The ::CUDA_MEMCPY2D structure is defined as: |
7418 | * |
7419 | * \code |
7420 | typedef struct CUDA_MEMCPY2D_st { |
7421 | unsigned int srcXInBytes, srcY; |
7422 | CUmemorytype srcMemoryType; |
7423 | const void *srcHost; |
7424 | CUdeviceptr srcDevice; |
7425 | CUarray srcArray; |
7426 | unsigned int srcPitch; |
7427 | unsigned int dstXInBytes, dstY; |
7428 | CUmemorytype dstMemoryType; |
7429 | void *dstHost; |
7430 | CUdeviceptr dstDevice; |
7431 | CUarray dstArray; |
7432 | unsigned int dstPitch; |
7433 | unsigned int WidthInBytes; |
7434 | unsigned int Height; |
7435 | } CUDA_MEMCPY2D; |
7436 | * \endcode |
7437 | * where: |
7438 | * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
7439 | * source and destination, respectively; ::CUmemorytype_enum is defined as: |
7440 | * |
7441 | * \code |
7442 | typedef enum CUmemorytype_enum { |
7443 | CU_MEMORYTYPE_HOST = 0x01, |
7444 | CU_MEMORYTYPE_DEVICE = 0x02, |
7445 | CU_MEMORYTYPE_ARRAY = 0x03, |
7446 | CU_MEMORYTYPE_UNIFIED = 0x04 |
7447 | } CUmemorytype; |
7448 | * \endcode |
7449 | * |
7450 | * \par |
7451 | * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
7452 | * specify the (unified virtual address space) base address of the source data |
7453 | * and the bytes per row to apply. ::srcArray is ignored. |
7454 | * This value may be used only if unified addressing is supported in the calling |
7455 | * context. |
7456 | * |
7457 | * \par |
7458 | * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
7459 | * specify the (host) base address of the source data and the bytes per row to |
7460 | * apply. ::srcArray is ignored. |
7461 | * |
7462 | * \par |
7463 | * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
7464 | * specify the (device) base address of the source data and the bytes per row |
7465 | * to apply. ::srcArray is ignored. |
7466 | * |
7467 | * \par |
7468 | * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
7469 | * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
7470 | * ignored. |
7471 | * |
7472 | * \par |
7473 | * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
7474 | * specify the (unified virtual address space) base address of the destination data |
7475 | * and the bytes per row to apply. ::dstArray is ignored. |
7476 | * This value may be used only if unified addressing is supported in the calling |
7477 | * context. |
7478 | * |
7479 | * \par |
7480 | * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
7481 | * specify the (host) base address of the destination data and the bytes per |
7482 | * row to apply. ::dstArray is ignored. |
7483 | * |
7484 | * \par |
7485 | * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
7486 | * specify the (device) base address of the destination data and the bytes per |
7487 | * row to apply. ::dstArray is ignored. |
7488 | * |
7489 | * \par |
7490 | * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
7491 | * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
7492 | * ignored. |
7493 | * |
7494 | * - ::srcXInBytes and ::srcY specify the base address of the source data for |
7495 | * the copy. |
7496 | * |
7497 | * \par |
7498 | * For host pointers, the starting address is |
7499 | * \code |
7500 | void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
7501 | * \endcode |
7502 | * |
7503 | * \par |
7504 | * For device pointers, the starting address is |
7505 | * \code |
7506 | CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
7507 | * \endcode |
7508 | * |
7509 | * \par |
7510 | * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
7511 | * element size. |
7512 | * |
7513 | * - ::dstXInBytes and ::dstY specify the base address of the destination data |
7514 | * for the copy. |
7515 | * |
7516 | * \par |
7517 | * For host pointers, the base address is |
7518 | * \code |
7519 | void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
7520 | * \endcode |
7521 | * |
7522 | * \par |
7523 | * For device pointers, the starting address is |
7524 | * \code |
7525 | CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
7526 | * \endcode |
7527 | * |
7528 | * \par |
7529 | * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
7530 | * element size. |
7531 | * |
7532 | * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
7533 | * the 2D copy being performed. |
7534 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
7535 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
7536 | * ::WidthInBytes + ::dstXInBytes. |
7537 | * |
7538 | * \par |
7539 | * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum |
7540 | * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
7541 | * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
7542 | * (device to device, CUDA array to device, CUDA array to CUDA array), |
7543 | * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). |
7544 | * ::cuMemcpy2DUnaligned() does not have this restriction, but may run |
7545 | * significantly slower in the cases where ::cuMemcpy2D() would have returned |
7546 | * an error code. |
7547 | * |
7548 | * \param pCopy - Parameters for the memory copy |
7549 | * |
7550 | * \return |
7551 | * ::CUDA_SUCCESS, |
7552 | * ::CUDA_ERROR_DEINITIALIZED, |
7553 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7554 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7555 | * ::CUDA_ERROR_INVALID_VALUE |
7556 | * \notefnerr |
7557 | * \note_sync |
7558 | * |
7559 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7560 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7561 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, |
7562 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7563 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7564 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7565 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7566 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7567 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7568 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7569 | * ::cudaMemcpy2D, |
7570 | * ::cudaMemcpy2DToArray, |
7571 | * ::cudaMemcpy2DFromArray |
7572 | */ |
7573 | CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); |
7574 | |
7575 | /** |
7576 | * \brief Copies memory for 3D arrays |
7577 | * |
7578 | * Perform a 3D memory copy according to the parameters specified in |
7579 | * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: |
7580 | * |
7581 | * \code |
7582 | typedef struct CUDA_MEMCPY3D_st { |
7583 | |
7584 | unsigned int srcXInBytes, srcY, srcZ; |
7585 | unsigned int srcLOD; |
7586 | CUmemorytype srcMemoryType; |
7587 | const void *srcHost; |
7588 | CUdeviceptr srcDevice; |
7589 | CUarray srcArray; |
7590 | unsigned int srcPitch; // ignored when src is array |
7591 | unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 |
7592 | |
7593 | unsigned int dstXInBytes, dstY, dstZ; |
7594 | unsigned int dstLOD; |
7595 | CUmemorytype dstMemoryType; |
7596 | void *dstHost; |
7597 | CUdeviceptr dstDevice; |
7598 | CUarray dstArray; |
7599 | unsigned int dstPitch; // ignored when dst is array |
7600 | unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 |
7601 | |
7602 | unsigned int WidthInBytes; |
7603 | unsigned int Height; |
7604 | unsigned int Depth; |
7605 | } CUDA_MEMCPY3D; |
7606 | * \endcode |
7607 | * where: |
7608 | * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
7609 | * source and destination, respectively; ::CUmemorytype_enum is defined as: |
7610 | * |
7611 | * \code |
7612 | typedef enum CUmemorytype_enum { |
7613 | CU_MEMORYTYPE_HOST = 0x01, |
7614 | CU_MEMORYTYPE_DEVICE = 0x02, |
7615 | CU_MEMORYTYPE_ARRAY = 0x03, |
7616 | CU_MEMORYTYPE_UNIFIED = 0x04 |
7617 | } CUmemorytype; |
7618 | * \endcode |
7619 | * |
7620 | * \par |
7621 | * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
7622 | * specify the (unified virtual address space) base address of the source data |
7623 | * and the bytes per row to apply. ::srcArray is ignored. |
7624 | * This value may be used only if unified addressing is supported in the calling |
7625 | * context. |
7626 | * |
7627 | * \par |
7628 | * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and |
7629 | * ::srcHeight specify the (host) base address of the source data, the bytes |
7630 | * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
7631 | * ignored. |
7632 | * |
7633 | * \par |
7634 | * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and |
7635 | * ::srcHeight specify the (device) base address of the source data, the bytes |
7636 | * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
7637 | * ignored. |
7638 | * |
7639 | * \par |
7640 | * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
7641 | * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and |
7642 | * ::srcHeight are ignored. |
7643 | * |
7644 | * \par |
7645 | * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
7646 | * specify the (unified virtual address space) base address of the destination |
7647 | * data and the bytes per row to apply. ::dstArray is ignored. |
7648 | * This value may be used only if unified addressing is supported in the calling |
7649 | * context. |
7650 | * |
7651 | * \par |
7652 | * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and ::dstHeight |
7653 | * specify the (host) base address of the destination data, the bytes per row, |
7654 | * and the height of each 2D slice of the 3D array. ::dstArray is ignored. |
7655 | * |
7656 | * \par |
7657 | * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and ::dstHeight |
7658 | * specify the (device) base address of the destination data, the bytes per |
7659 | * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. |
7660 | * |
7661 | * \par |
7662 | * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
7663 | * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and |
7664 | * ::dstHeight are ignored. |
7665 | * |
7666 | * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source |
7667 | * data for the copy. |
7668 | * |
7669 | * \par |
7670 | * For host pointers, the starting address is |
7671 | * \code |
7672 | void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); |
7673 | * \endcode |
7674 | * |
7675 | * \par |
7676 | * For device pointers, the starting address is |
7677 | * \code |
7678 | CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; |
7679 | * \endcode |
7680 | * |
7681 | * \par |
7682 | * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
7683 | * element size. |
7684 | * |
7685 | * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the |
7686 | * destination data for the copy. |
7687 | * |
7688 | * \par |
7689 | * For host pointers, the base address is |
7690 | * \code |
7691 | void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); |
7692 | * \endcode |
7693 | * |
7694 | * \par |
7695 | * For device pointers, the starting address is |
7696 | * \code |
7697 | CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; |
7698 | * \endcode |
7699 | * |
7700 | * \par |
7701 | * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
7702 | * element size. |
7703 | * |
7704 | * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height |
7705 | * and depth of the 3D copy being performed. |
7706 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
7707 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
7708 | * ::WidthInBytes + ::dstXInBytes. |
7709 | * - If specified, ::srcHeight must be greater than or equal to ::Height + |
7710 | * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. |
7711 | * |
7712 | * \par |
7713 | * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum |
7714 | * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). |
7715 | * |
7716 | * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be |
7717 | * set to 0. |
7718 | * |
7719 | * \param pCopy - Parameters for the memory copy |
7720 | * |
7721 | * \return |
7722 | * ::CUDA_SUCCESS, |
7723 | * ::CUDA_ERROR_DEINITIALIZED, |
7724 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7725 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7726 | * ::CUDA_ERROR_INVALID_VALUE |
7727 | * \notefnerr |
7728 | * \note_sync |
7729 | * |
7730 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7731 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7732 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7733 | * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7734 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7735 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7736 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7737 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7738 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
7739 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
7740 | * ::cudaMemcpy3D |
7741 | */ |
7742 | CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); |
7743 | |
7744 | /** |
7745 | * \brief Copies memory between contexts |
7746 | * |
7747 | * Perform a 3D memory copy according to the parameters specified in |
7748 | * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure |
7749 | * for documentation of its parameters. |
7750 | * |
7751 | * \param pCopy - Parameters for the memory copy |
7752 | * |
7753 | * \return |
7754 | * ::CUDA_SUCCESS, |
7755 | * ::CUDA_ERROR_DEINITIALIZED, |
7756 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7757 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7758 | * ::CUDA_ERROR_INVALID_VALUE |
7759 | * \notefnerr |
7760 | * \note_sync |
7761 | * |
7762 | * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
7763 | * ::cuMemcpy3DPeerAsync, |
7764 | * ::cudaMemcpy3DPeer |
7765 | */ |
7766 | CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); |
7767 | |
7768 | /** |
7769 | * \brief Copies memory asynchronously |
7770 | * |
7771 | * Copies data between two pointers. |
7772 | * \p dst and \p src are base pointers of the destination and source, respectively. |
7773 | * \p ByteCount specifies the number of bytes to copy. |
7774 | * Note that this function infers the type of the transfer (host to host, host to |
7775 | * device, device to device, or device to host) from the pointer values. This |
7776 | * function is only allowed in contexts which support unified addressing. |
7777 | * |
7778 | * \param dst - Destination unified virtual address space pointer |
7779 | * \param src - Source unified virtual address space pointer |
7780 | * \param ByteCount - Size of memory copy in bytes |
7781 | * \param hStream - Stream identifier |
7782 | * |
7783 | * \return |
7784 | * ::CUDA_SUCCESS, |
7785 | * ::CUDA_ERROR_DEINITIALIZED, |
7786 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7787 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7788 | * ::CUDA_ERROR_INVALID_VALUE, |
7789 | * ::CUDA_ERROR_INVALID_HANDLE |
7790 | * \notefnerr |
7791 | * \note_async |
7792 | * \note_null_stream |
7793 | * \note_memcpy |
7794 | * |
7795 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7796 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7797 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7798 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7799 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, |
7800 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7801 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7802 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7803 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
7804 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
7805 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
7806 | * ::cuMemsetD32, ::cuMemsetD32Async, |
7807 | * ::cudaMemcpyAsync, |
7808 | * ::cudaMemcpyToSymbolAsync, |
7809 | * ::cudaMemcpyFromSymbolAsync |
7810 | */ |
7811 | CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); |
7812 | |
7813 | /** |
7814 | * \brief Copies device memory between two contexts asynchronously. |
7815 | * |
7816 | * Copies from device memory in one context to device memory in another |
7817 | * context. \p dstDevice is the base device pointer of the destination memory |
7818 | * and \p dstContext is the destination context. \p srcDevice is the base |
7819 | * device pointer of the source memory and \p srcContext is the source context. |
7820 | * \p ByteCount specifies the number of bytes to copy. |
7821 | * |
7822 | * \param dstDevice - Destination device pointer |
7823 | * \param dstContext - Destination context |
7824 | * \param srcDevice - Source device pointer |
7825 | * \param srcContext - Source context |
7826 | * \param ByteCount - Size of memory copy in bytes |
7827 | * \param hStream - Stream identifier |
7828 | * |
7829 | * \return |
7830 | * ::CUDA_SUCCESS, |
7831 | * ::CUDA_ERROR_DEINITIALIZED, |
7832 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7833 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7834 | * ::CUDA_ERROR_INVALID_VALUE, |
7835 | * ::CUDA_ERROR_INVALID_HANDLE |
7836 | * \notefnerr |
7837 | * \note_async |
7838 | * \note_null_stream |
7839 | * |
7840 | * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, |
7841 | * ::cuMemcpy3DPeerAsync, |
7842 | * ::cudaMemcpyPeerAsync |
7843 | */ |
7844 | CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); |
7845 | |
7846 | /** |
7847 | * \brief Copies memory from Host to Device |
7848 | * |
7849 | * Copies from host memory to device memory. \p dstDevice and \p srcHost are |
7850 | * the base addresses of the destination and source, respectively. \p ByteCount |
7851 | * specifies the number of bytes to copy. |
7852 | * |
7853 | * \param dstDevice - Destination device pointer |
7854 | * \param srcHost - Source host pointer |
7855 | * \param ByteCount - Size of memory copy in bytes |
7856 | * \param hStream - Stream identifier |
7857 | * |
7858 | * \return |
7859 | * ::CUDA_SUCCESS, |
7860 | * ::CUDA_ERROR_DEINITIALIZED, |
7861 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7862 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7863 | * ::CUDA_ERROR_INVALID_VALUE, |
7864 | * ::CUDA_ERROR_INVALID_HANDLE |
7865 | * \notefnerr |
7866 | * \note_async |
7867 | * \note_null_stream |
7868 | * \note_memcpy |
7869 | * |
7870 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7871 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7872 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7873 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7874 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7875 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7876 | * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, |
7877 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7878 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
7879 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
7880 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
7881 | * ::cuMemsetD32, ::cuMemsetD32Async, |
7882 | * ::cudaMemcpyAsync, |
7883 | * ::cudaMemcpyToSymbolAsync |
7884 | */ |
7885 | CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); |
7886 | |
7887 | /** |
7888 | * \brief Copies memory from Device to Host |
7889 | * |
7890 | * Copies from device to host memory. \p dstHost and \p srcDevice specify the |
7891 | * base pointers of the destination and source, respectively. \p ByteCount |
7892 | * specifies the number of bytes to copy. |
7893 | * |
7894 | * \param dstHost - Destination host pointer |
7895 | * \param srcDevice - Source device pointer |
7896 | * \param ByteCount - Size of memory copy in bytes |
7897 | * \param hStream - Stream identifier |
7898 | * |
7899 | * \return |
7900 | * ::CUDA_SUCCESS, |
7901 | * ::CUDA_ERROR_DEINITIALIZED, |
7902 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7903 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7904 | * ::CUDA_ERROR_INVALID_VALUE, |
7905 | * ::CUDA_ERROR_INVALID_HANDLE |
7906 | * \notefnerr |
7907 | * \note_async |
7908 | * \note_null_stream |
7909 | * \note_memcpy |
7910 | * |
7911 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7912 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7913 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7914 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7915 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
7916 | * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7917 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7918 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7919 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
7920 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
7921 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
7922 | * ::cuMemsetD32, ::cuMemsetD32Async, |
7923 | * ::cudaMemcpyAsync, |
7924 | * ::cudaMemcpyFromSymbolAsync |
7925 | */ |
7926 | CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
7927 | |
7928 | /** |
7929 | * \brief Copies memory from Device to Device |
7930 | * |
7931 | * Copies from device memory to device memory. \p dstDevice and \p srcDevice |
7932 | * are the base pointers of the destination and source, respectively. |
7933 | * \p ByteCount specifies the number of bytes to copy. |
7934 | * |
7935 | * \param dstDevice - Destination device pointer |
7936 | * \param srcDevice - Source device pointer |
7937 | * \param ByteCount - Size of memory copy in bytes |
7938 | * \param hStream - Stream identifier |
7939 | * |
7940 | * \return |
7941 | * ::CUDA_SUCCESS, |
7942 | * ::CUDA_ERROR_DEINITIALIZED, |
7943 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7944 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7945 | * ::CUDA_ERROR_INVALID_VALUE, |
7946 | * ::CUDA_ERROR_INVALID_HANDLE |
7947 | * \notefnerr |
7948 | * \note_async |
7949 | * \note_null_stream |
7950 | * |
7951 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7952 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7953 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7954 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7955 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, |
7956 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
7957 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
7958 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
7959 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
7960 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
7961 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
7962 | * ::cuMemsetD32, ::cuMemsetD32Async, |
7963 | * ::cudaMemcpyAsync, |
7964 | * ::cudaMemcpyToSymbolAsync, |
7965 | * ::cudaMemcpyFromSymbolAsync |
7966 | */ |
7967 | CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
7968 | |
7969 | /** |
7970 | * \brief Copies memory from Host to Array |
7971 | * |
7972 | * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset |
7973 | * specify the CUDA array handle and starting offset in bytes of the |
7974 | * destination data. \p srcHost specifies the base address of the source. |
7975 | * \p ByteCount specifies the number of bytes to copy. |
7976 | * |
7977 | * \param dstArray - Destination array |
7978 | * \param dstOffset - Offset in bytes of destination array |
7979 | * \param srcHost - Source host pointer |
7980 | * \param ByteCount - Size of memory copy in bytes |
7981 | * \param hStream - Stream identifier |
7982 | * |
7983 | * \return |
7984 | * ::CUDA_SUCCESS, |
7985 | * ::CUDA_ERROR_DEINITIALIZED, |
7986 | * ::CUDA_ERROR_NOT_INITIALIZED, |
7987 | * ::CUDA_ERROR_INVALID_CONTEXT, |
7988 | * ::CUDA_ERROR_INVALID_VALUE, |
7989 | * ::CUDA_ERROR_INVALID_HANDLE |
7990 | * \notefnerr |
7991 | * \note_async |
7992 | * \note_null_stream |
7993 | * \note_memcpy |
7994 | * |
7995 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
7996 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
7997 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
7998 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
7999 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8000 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, |
8001 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8002 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8003 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8004 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8005 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8006 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8007 | * ::cudaMemcpyToArrayAsync |
8008 | */ |
8009 | CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); |
8010 | |
8011 | /** |
8012 | * \brief Copies memory from Array to Host |
8013 | * |
8014 | * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base |
8015 | * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA |
8016 | * array handle and starting offset in bytes of the source data. |
8017 | * \p ByteCount specifies the number of bytes to copy. |
8018 | * |
8019 | * \param dstHost - Destination host pointer |
8020 | * \param srcArray - Source array |
8021 | * \param srcOffset - Offset in bytes of source array |
8022 | * \param ByteCount - Size of memory copy in bytes |
8023 | * \param hStream - Stream identifier |
8024 | * |
8025 | * \return |
8026 | * ::CUDA_SUCCESS, |
8027 | * ::CUDA_ERROR_DEINITIALIZED, |
8028 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8029 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8030 | * ::CUDA_ERROR_INVALID_VALUE, |
8031 | * ::CUDA_ERROR_INVALID_HANDLE |
8032 | * \notefnerr |
8033 | * \note_async |
8034 | * \note_null_stream |
8035 | * \note_memcpy |
8036 | * |
8037 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8038 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8039 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8040 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8041 | * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8042 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8043 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8044 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8045 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8046 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8047 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8048 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8049 | * ::cudaMemcpyFromArrayAsync |
8050 | */ |
8051 | CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); |
8052 | |
8053 | /** |
8054 | * \brief Copies memory for 2D arrays |
8055 | * |
8056 | * Perform a 2D memory copy according to the parameters specified in \p pCopy. |
8057 | * The ::CUDA_MEMCPY2D structure is defined as: |
8058 | * |
8059 | * \code |
8060 | typedef struct CUDA_MEMCPY2D_st { |
8061 | unsigned int srcXInBytes, srcY; |
8062 | CUmemorytype srcMemoryType; |
8063 | const void *srcHost; |
8064 | CUdeviceptr srcDevice; |
8065 | CUarray srcArray; |
8066 | unsigned int srcPitch; |
8067 | unsigned int dstXInBytes, dstY; |
8068 | CUmemorytype dstMemoryType; |
8069 | void *dstHost; |
8070 | CUdeviceptr dstDevice; |
8071 | CUarray dstArray; |
8072 | unsigned int dstPitch; |
8073 | unsigned int WidthInBytes; |
8074 | unsigned int Height; |
8075 | } CUDA_MEMCPY2D; |
8076 | * \endcode |
8077 | * where: |
8078 | * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
8079 | * source and destination, respectively; ::CUmemorytype_enum is defined as: |
8080 | * |
8081 | * \code |
8082 | typedef enum CUmemorytype_enum { |
8083 | CU_MEMORYTYPE_HOST = 0x01, |
8084 | CU_MEMORYTYPE_DEVICE = 0x02, |
8085 | CU_MEMORYTYPE_ARRAY = 0x03, |
8086 | CU_MEMORYTYPE_UNIFIED = 0x04 |
8087 | } CUmemorytype; |
8088 | * \endcode |
8089 | * |
8090 | * \par |
8091 | * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
8092 | * specify the (host) base address of the source data and the bytes per row to |
8093 | * apply. ::srcArray is ignored. |
8094 | * |
8095 | * \par |
8096 | * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
8097 | * specify the (unified virtual address space) base address of the source data |
8098 | * and the bytes per row to apply. ::srcArray is ignored. |
8099 | * This value may be used only if unified addressing is supported in the calling |
8100 | * context. |
8101 | * |
8102 | * \par |
8103 | * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
8104 | * specify the (device) base address of the source data and the bytes per row |
8105 | * to apply. ::srcArray is ignored. |
8106 | * |
8107 | * \par |
8108 | * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
8109 | * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
8110 | * ignored. |
8111 | * |
8112 | * \par |
8113 | * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
8114 | * specify the (unified virtual address space) base address of the destination |
8115 | * data and the bytes per row to apply. ::dstArray is ignored. |
8116 | * This value may be used only if unified addressing is supported in the calling |
8117 | * context. |
8118 | * |
8119 | * \par |
8120 | * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
8121 | * specify the (host) base address of the destination data and the bytes per |
8122 | * row to apply. ::dstArray is ignored. |
8123 | * |
8124 | * \par |
8125 | * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
8126 | * specify the (device) base address of the destination data and the bytes per |
8127 | * row to apply. ::dstArray is ignored. |
8128 | * |
8129 | * \par |
8130 | * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
8131 | * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
8132 | * ignored. |
8133 | * |
8134 | * - ::srcXInBytes and ::srcY specify the base address of the source data for |
8135 | * the copy. |
8136 | * |
8137 | * \par |
8138 | * For host pointers, the starting address is |
8139 | * \code |
8140 | void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
8141 | * \endcode |
8142 | * |
8143 | * \par |
8144 | * For device pointers, the starting address is |
8145 | * \code |
8146 | CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
8147 | * \endcode |
8148 | * |
8149 | * \par |
8150 | * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
8151 | * element size. |
8152 | * |
8153 | * - ::dstXInBytes and ::dstY specify the base address of the destination data |
8154 | * for the copy. |
8155 | * |
8156 | * \par |
8157 | * For host pointers, the base address is |
8158 | * \code |
8159 | void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
8160 | * \endcode |
8161 | * |
8162 | * \par |
8163 | * For device pointers, the starting address is |
8164 | * \code |
8165 | CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
8166 | * \endcode |
8167 | * |
8168 | * \par |
8169 | * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
8170 | * element size. |
8171 | * |
8172 | * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
8173 | * the 2D copy being performed. |
8174 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
8175 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
8176 | * ::WidthInBytes + dstXInBytes. |
8177 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
8178 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
8179 | * ::WidthInBytes + dstXInBytes. |
8180 | * - If specified, ::srcHeight must be greater than or equal to ::Height + |
8181 | * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. |
8182 | * |
8183 | * \par |
8184 | * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum |
8185 | * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
8186 | * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
8187 | * (device to device, CUDA array to device, CUDA array to CUDA array), |
8188 | * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). |
8189 | * |
8190 | * \param pCopy - Parameters for the memory copy |
8191 | * \param hStream - Stream identifier |
8192 | * |
8193 | * \return |
8194 | * ::CUDA_SUCCESS, |
8195 | * ::CUDA_ERROR_DEINITIALIZED, |
8196 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8197 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8198 | * ::CUDA_ERROR_INVALID_VALUE, |
8199 | * ::CUDA_ERROR_INVALID_HANDLE |
8200 | * \notefnerr |
8201 | * \note_async |
8202 | * \note_null_stream |
8203 | * |
8204 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8205 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8206 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, |
8207 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8208 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8209 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8210 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8211 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8212 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8213 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8214 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8215 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8216 | * ::cudaMemcpy2DAsync, |
8217 | * ::cudaMemcpy2DToArrayAsync, |
8218 | * ::cudaMemcpy2DFromArrayAsync |
8219 | */ |
8220 | CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); |
8221 | |
8222 | /** |
8223 | * \brief Copies memory for 3D arrays |
8224 | * |
8225 | * Perform a 3D memory copy according to the parameters specified in |
8226 | * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: |
8227 | * |
8228 | * \code |
8229 | typedef struct CUDA_MEMCPY3D_st { |
8230 | |
8231 | unsigned int srcXInBytes, srcY, srcZ; |
8232 | unsigned int srcLOD; |
8233 | CUmemorytype srcMemoryType; |
8234 | const void *srcHost; |
8235 | CUdeviceptr srcDevice; |
8236 | CUarray srcArray; |
8237 | unsigned int srcPitch; // ignored when src is array |
8238 | unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 |
8239 | |
8240 | unsigned int dstXInBytes, dstY, dstZ; |
8241 | unsigned int dstLOD; |
8242 | CUmemorytype dstMemoryType; |
8243 | void *dstHost; |
8244 | CUdeviceptr dstDevice; |
8245 | CUarray dstArray; |
8246 | unsigned int dstPitch; // ignored when dst is array |
8247 | unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 |
8248 | |
8249 | unsigned int WidthInBytes; |
8250 | unsigned int Height; |
8251 | unsigned int Depth; |
8252 | } CUDA_MEMCPY3D; |
8253 | * \endcode |
8254 | * where: |
8255 | * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
8256 | * source and destination, respectively; ::CUmemorytype_enum is defined as: |
8257 | * |
8258 | * \code |
8259 | typedef enum CUmemorytype_enum { |
8260 | CU_MEMORYTYPE_HOST = 0x01, |
8261 | CU_MEMORYTYPE_DEVICE = 0x02, |
8262 | CU_MEMORYTYPE_ARRAY = 0x03, |
8263 | CU_MEMORYTYPE_UNIFIED = 0x04 |
8264 | } CUmemorytype; |
8265 | * \endcode |
8266 | * |
8267 | * \par |
8268 | * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
8269 | * specify the (unified virtual address space) base address of the source data |
8270 | * and the bytes per row to apply. ::srcArray is ignored. |
8271 | * This value may be used only if unified addressing is supported in the calling |
8272 | * context. |
8273 | * |
8274 | * \par |
8275 | * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and |
8276 | * ::srcHeight specify the (host) base address of the source data, the bytes |
8277 | * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
8278 | * ignored. |
8279 | * |
8280 | * \par |
8281 | * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and |
8282 | * ::srcHeight specify the (device) base address of the source data, the bytes |
8283 | * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
8284 | * ignored. |
8285 | * |
8286 | * \par |
8287 | * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
8288 | * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and |
8289 | * ::srcHeight are ignored. |
8290 | * |
8291 | * \par |
8292 | * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
8293 | * specify the (unified virtual address space) base address of the destination |
8294 | * data and the bytes per row to apply. ::dstArray is ignored. |
8295 | * This value may be used only if unified addressing is supported in the calling |
8296 | * context. |
8297 | * |
8298 | * \par |
8299 | * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and |
8300 | * ::dstHeight specify the (host) base address of the destination data, the bytes |
8301 | * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. |
8302 | * |
8303 | * \par |
8304 | * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and |
8305 | * ::dstHeight specify the (device) base address of the destination data, the bytes |
8306 | * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. |
8307 | * |
8308 | * \par |
8309 | * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
8310 | * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and |
8311 | * ::dstHeight are ignored. |
8312 | * |
8313 | * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source |
8314 | * data for the copy. |
8315 | * |
8316 | * \par |
8317 | * For host pointers, the starting address is |
8318 | * \code |
8319 | void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); |
8320 | * \endcode |
8321 | * |
8322 | * \par |
8323 | * For device pointers, the starting address is |
8324 | * \code |
8325 | CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; |
8326 | * \endcode |
8327 | * |
8328 | * \par |
8329 | * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
8330 | * element size. |
8331 | * |
8332 | * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the |
8333 | * destination data for the copy. |
8334 | * |
8335 | * \par |
8336 | * For host pointers, the starting address is |
8337 | * \code |
8338 | void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); |
8339 | * \endcode |
8340 | * |
8341 | * \par |
8342 | * For device pointers, the starting address is |
8343 | * \code |
8344 | CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; |
8345 | * \endcode |
8346 | * |
8347 | * \par |
8348 | * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
8349 | * element size. |
8350 | * |
8351 | * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height |
8352 | * and depth of the 3D copy being performed. |
8353 | * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
8354 | * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
8355 | * ::WidthInBytes + ::dstXInBytes. |
8356 | * - If specified, ::srcHeight must be greater than or equal to ::Height + |
8357 | * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. |
8358 | * |
8359 | * \par |
8360 | * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum |
8361 | * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). |
8362 | * |
8363 | * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be |
8364 | * set to 0. |
8365 | * |
8366 | * \param pCopy - Parameters for the memory copy |
8367 | * \param hStream - Stream identifier |
8368 | * |
8369 | * \return |
8370 | * ::CUDA_SUCCESS, |
8371 | * ::CUDA_ERROR_DEINITIALIZED, |
8372 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8373 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8374 | * ::CUDA_ERROR_INVALID_VALUE, |
8375 | * ::CUDA_ERROR_INVALID_HANDLE |
8376 | * \notefnerr |
8377 | * \note_async |
8378 | * \note_null_stream |
8379 | * |
8380 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8381 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8382 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8383 | * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8384 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8385 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8386 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8387 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8388 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8389 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8390 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8391 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8392 | * ::cudaMemcpy3DAsync |
8393 | */ |
8394 | CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); |
8395 | |
8396 | /** |
8397 | * \brief Copies memory between contexts asynchronously. |
8398 | * |
8399 | * Perform a 3D memory copy according to the parameters specified in |
8400 | * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure |
8401 | * for documentation of its parameters. |
8402 | * |
8403 | * \param pCopy - Parameters for the memory copy (a ::CUDA_MEMCPY3D_PEER structure) |
8404 | * \param hStream - Stream identifier |
8405 | * |
8406 | * \return |
8407 | * ::CUDA_SUCCESS, |
8408 | * ::CUDA_ERROR_DEINITIALIZED, |
8409 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8410 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8411 | * ::CUDA_ERROR_INVALID_VALUE |
8412 | * \notefnerr |
8413 | * \note_async |
8414 | * \note_null_stream |
8415 | * |
8416 | * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
8417 | * ::cuMemcpy3DPeer, |
8418 | * ::cudaMemcpy3DPeerAsync |
8419 | */ |
8420 | CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); |
8421 | |
8422 | /** |
8423 | * \brief Initializes device memory |
8424 | * |
8425 | * Sets the memory range of \p N 8-bit values to the specified value |
8426 | * \p uc. |
8427 | * |
8428 | * \param dstDevice - Destination device pointer |
8429 | * \param uc - Value to set |
8430 | * \param N - Number of elements |
8431 | * |
8432 | * \return |
8433 | * ::CUDA_SUCCESS, |
8434 | * ::CUDA_ERROR_DEINITIALIZED, |
8435 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8436 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8437 | * ::CUDA_ERROR_INVALID_VALUE |
8438 | * \notefnerr |
8439 | * \note_memset |
8440 | * |
8441 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8442 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8443 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8444 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8445 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8446 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8447 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8448 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8449 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8450 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8451 | * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8452 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8453 | * ::cudaMemset |
8454 | */ |
8455 | CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); |
8456 | |
8457 | /** |
8458 | * \brief Initializes device memory |
8459 | * |
8460 | * Sets the memory range of \p N 16-bit values to the specified value |
8461 | * \p us. The \p dstDevice pointer must be two byte aligned. |
8462 | * |
8463 | * \param dstDevice - Destination device pointer |
8464 | * \param us - Value to set |
8465 | * \param N - Number of elements |
8466 | * |
8467 | * \return |
8468 | * ::CUDA_SUCCESS, |
8469 | * ::CUDA_ERROR_DEINITIALIZED, |
8470 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8471 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8472 | * ::CUDA_ERROR_INVALID_VALUE |
8473 | * \notefnerr |
8474 | * \note_memset |
8475 | * |
8476 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8477 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8478 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8479 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8480 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8481 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8482 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8483 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8484 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8485 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8486 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, |
8487 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8488 | * ::cudaMemset |
8489 | */ |
8490 | CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); |
8491 | |
8492 | /** |
8493 | * \brief Initializes device memory |
8494 | * |
8495 | * Sets the memory range of \p N 32-bit values to the specified value |
8496 | * \p ui. The \p dstDevice pointer must be four byte aligned. |
8497 | * |
8498 | * \param dstDevice - Destination device pointer |
8499 | * \param ui - Value to set |
8500 | * \param N - Number of elements |
8501 | * |
8502 | * \return |
8503 | * ::CUDA_SUCCESS, |
8504 | * ::CUDA_ERROR_DEINITIALIZED, |
8505 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8506 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8507 | * ::CUDA_ERROR_INVALID_VALUE |
8508 | * \notefnerr |
8509 | * \note_memset |
8510 | * |
8511 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8512 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8513 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8514 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8515 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8516 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8517 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8518 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8519 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8520 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8521 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8522 | * ::cuMemsetD32Async, |
8523 | * ::cudaMemset |
8524 | */ |
8525 | CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); |
8526 | |
8527 | /** |
8528 | * \brief Initializes device memory |
8529 | * |
8530 | * Sets the 2D memory range of \p Width 8-bit values to the specified value |
8531 | * \p uc. \p Height specifies the number of rows to set, and \p dstPitch |
8532 | * specifies the number of bytes between each row. This function performs |
8533 | * fastest when the pitch is one that has been passed back by |
8534 | * ::cuMemAllocPitch(). |
8535 | * |
8536 | * \param dstDevice - Destination device pointer |
8537 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8538 | * \param uc - Value to set |
8539 | * \param Width - Width of row |
8540 | * \param Height - Number of rows |
8541 | * |
8542 | * \return |
8543 | * ::CUDA_SUCCESS, |
8544 | * ::CUDA_ERROR_DEINITIALIZED, |
8545 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8546 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8547 | * ::CUDA_ERROR_INVALID_VALUE |
8548 | * \notefnerr |
8549 | * \note_memset |
8550 | * |
8551 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8552 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8553 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8554 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8555 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8556 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8557 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8558 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8559 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, |
8560 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8561 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8562 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8563 | * ::cudaMemset2D |
8564 | */ |
8565 | CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); |
8566 | |
8567 | /** |
8568 | * \brief Initializes device memory |
8569 | * |
8570 | * Sets the 2D memory range of \p Width 16-bit values to the specified value |
8571 | * \p us. \p Height specifies the number of rows to set, and \p dstPitch |
8572 | * specifies the number of bytes between each row. The \p dstDevice pointer |
8573 | * and \p dstPitch offset must be two byte aligned. This function performs |
8574 | * fastest when the pitch is one that has been passed back by |
8575 | * ::cuMemAllocPitch(). |
8576 | * |
8577 | * \param dstDevice - Destination device pointer |
8578 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8579 | * \param us - Value to set |
8580 | * \param Width - Width of row |
8581 | * \param Height - Number of rows |
8582 | * |
8583 | * \return |
8584 | * ::CUDA_SUCCESS, |
8585 | * ::CUDA_ERROR_DEINITIALIZED, |
8586 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8587 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8588 | * ::CUDA_ERROR_INVALID_VALUE |
8589 | * \notefnerr |
8590 | * \note_memset |
8591 | * |
8592 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8593 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8594 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8595 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8596 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8597 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8598 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8599 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8600 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8601 | * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8602 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8603 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8604 | * ::cudaMemset2D |
8605 | */ |
8606 | CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); |
8607 | |
8608 | /** |
8609 | * \brief Initializes device memory |
8610 | * |
8611 | * Sets the 2D memory range of \p Width 32-bit values to the specified value |
8612 | * \p ui. \p Height specifies the number of rows to set, and \p dstPitch |
8613 | * specifies the number of bytes between each row. The \p dstDevice pointer |
8614 | * and \p dstPitch offset must be four byte aligned. This function performs |
8615 | * fastest when the pitch is one that has been passed back by |
8616 | * ::cuMemAllocPitch(). |
8617 | * |
8618 | * \param dstDevice - Destination device pointer |
8619 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8620 | * \param ui - Value to set |
8621 | * \param Width - Width of row |
8622 | * \param Height - Number of rows |
8623 | * |
8624 | * \return |
8625 | * ::CUDA_SUCCESS, |
8626 | * ::CUDA_ERROR_DEINITIALIZED, |
8627 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8628 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8629 | * ::CUDA_ERROR_INVALID_VALUE |
8630 | * \notefnerr |
8631 | * \note_memset |
8632 | * |
8633 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8634 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8635 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8636 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8637 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8638 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8639 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8640 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8641 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8642 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, |
8643 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8644 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8645 | * ::cudaMemset2D |
8646 | */ |
8647 | CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); |
8648 | |
8649 | /** |
8650 | * \brief Sets device memory |
8651 | * |
8652 | * Sets the memory range of \p N 8-bit values to the specified value |
8653 | * \p uc. |
8654 | * |
8655 | * \param dstDevice - Destination device pointer |
8656 | * \param uc - Value to set |
8657 | * \param N - Number of elements |
8658 | * \param hStream - Stream identifier |
8659 | * |
8660 | * \return |
8661 | * ::CUDA_SUCCESS, |
8662 | * ::CUDA_ERROR_DEINITIALIZED, |
8663 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8664 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8665 | * ::CUDA_ERROR_INVALID_VALUE |
8666 | * \notefnerr |
8667 | * \note_memset |
8668 | * \note_null_stream |
8669 | * |
8670 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8671 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8672 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8673 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8674 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8675 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8676 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8677 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8678 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8679 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8680 | * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, |
8681 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8682 | * ::cudaMemsetAsync |
8683 | */ |
8684 | CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); |
8685 | |
8686 | /** |
8687 | * \brief Sets device memory |
8688 | * |
8689 | * Sets the memory range of \p N 16-bit values to the specified value |
8690 | * \p us. The \p dstDevice pointer must be two byte aligned. |
8691 | * |
8692 | * \param dstDevice - Destination device pointer |
8693 | * \param us - Value to set |
8694 | * \param N - Number of elements |
8695 | * \param hStream - Stream identifier |
8696 | * |
8697 | * \return |
8698 | * ::CUDA_SUCCESS, |
8699 | * ::CUDA_ERROR_DEINITIALIZED, |
8700 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8701 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8702 | * ::CUDA_ERROR_INVALID_VALUE |
8703 | * \notefnerr |
8704 | * \note_memset |
8705 | * \note_null_stream |
8706 | * |
8707 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8708 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8709 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8710 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8711 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8712 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8713 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8714 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8715 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8716 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8717 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, |
8718 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8719 | * ::cudaMemsetAsync |
8720 | */ |
8721 | CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); |
8722 | |
8723 | /** |
8724 | * \brief Sets device memory |
8725 | * |
8726 | * Sets the memory range of \p N 32-bit values to the specified value |
8727 | * \p ui. The \p dstDevice pointer must be four byte aligned. |
8728 | * |
8729 | * \param dstDevice - Destination device pointer |
8730 | * \param ui - Value to set |
8731 | * \param N - Number of elements |
8732 | * \param hStream - Stream identifier |
8733 | * |
8734 | * \return |
8735 | * ::CUDA_SUCCESS, |
8736 | * ::CUDA_ERROR_DEINITIALIZED, |
8737 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8738 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8739 | * ::CUDA_ERROR_INVALID_VALUE |
8740 | * \notefnerr |
8741 | * \note_memset |
8742 | * \note_null_stream |
8743 | * |
8744 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8745 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8746 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8747 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8748 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8749 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8750 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8751 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8752 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8753 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8754 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, |
8755 | * ::cudaMemsetAsync |
8756 | */ |
8757 | CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); |
8758 | |
8759 | /** |
8760 | * \brief Sets device memory |
8761 | * |
8762 | * Sets the 2D memory range of \p Width 8-bit values to the specified value |
8763 | * \p uc. \p Height specifies the number of rows to set, and \p dstPitch |
8764 | * specifies the number of bytes between each row. This function performs |
8765 | * fastest when the pitch is one that has been passed back by |
8766 | * ::cuMemAllocPitch(). |
8767 | * |
8768 | * \param dstDevice - Destination device pointer |
8769 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8770 | * \param uc - Value to set |
8771 | * \param Width - Width of row |
8772 | * \param Height - Number of rows |
8773 | * \param hStream - Stream identifier |
8774 | * |
8775 | * \return |
8776 | * ::CUDA_SUCCESS, |
8777 | * ::CUDA_ERROR_DEINITIALIZED, |
8778 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8779 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8780 | * ::CUDA_ERROR_INVALID_VALUE |
8781 | * \notefnerr |
8782 | * \note_memset |
8783 | * \note_null_stream |
8784 | * |
8785 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8786 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8787 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8788 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8789 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8790 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8791 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8792 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8793 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, |
8794 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8795 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8796 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8797 | * ::cudaMemset2DAsync |
8798 | */ |
8799 | CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); |
8800 | |
8801 | /** |
8802 | * \brief Sets device memory |
8803 | * |
8804 | * Sets the 2D memory range of \p Width 16-bit values to the specified value |
8805 | * \p us. \p Height specifies the number of rows to set, and \p dstPitch |
8806 | * specifies the number of bytes between each row. The \p dstDevice pointer |
8807 | * and \p dstPitch offset must be two byte aligned. This function performs |
8808 | * fastest when the pitch is one that has been passed back by |
8809 | * ::cuMemAllocPitch(). |
8810 | * |
8811 | * \param dstDevice - Destination device pointer |
8812 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8813 | * \param us - Value to set |
8814 | * \param Width - Width of row |
8815 | * \param Height - Number of rows |
8816 | * \param hStream - Stream identifier |
8817 | * |
8818 | * \return |
8819 | * ::CUDA_SUCCESS, |
8820 | * ::CUDA_ERROR_DEINITIALIZED, |
8821 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8822 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8823 | * ::CUDA_ERROR_INVALID_VALUE |
8824 | * \notefnerr |
8825 | * \note_memset |
8826 | * \note_null_stream |
8827 | * |
8828 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8829 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8830 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8831 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8832 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8833 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8834 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8835 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8836 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8837 | * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
8838 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8839 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8840 | * ::cudaMemset2DAsync |
8841 | */ |
8842 | CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); |
8843 | |
8844 | /** |
8845 | * \brief Sets device memory |
8846 | * |
8847 | * Sets the 2D memory range of \p Width 32-bit values to the specified value |
8848 | * \p ui. \p Height specifies the number of rows to set, and \p dstPitch |
8849 | * specifies the number of bytes between each row. The \p dstDevice pointer |
8850 | * and \p dstPitch offset must be four byte aligned. This function performs |
8851 | * fastest when the pitch is one that has been passed back by |
8852 | * ::cuMemAllocPitch(). |
8853 | * |
8854 | * \param dstDevice - Destination device pointer |
8855 | * \param dstPitch - Pitch of destination device pointer (Unused if \p Height is 1) |
8856 | * \param ui - Value to set |
8857 | * \param Width - Width of row |
8858 | * \param Height - Number of rows |
8859 | * \param hStream - Stream identifier |
8860 | * |
8861 | * \return |
8862 | * ::CUDA_SUCCESS, |
8863 | * ::CUDA_ERROR_DEINITIALIZED, |
8864 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8865 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8866 | * ::CUDA_ERROR_INVALID_VALUE |
8867 | * \notefnerr |
8868 | * \note_memset |
8869 | * \note_null_stream |
8870 | * |
8871 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
8872 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8873 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8874 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8875 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8876 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8877 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8878 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8879 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
8880 | * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, |
8881 | * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
8882 | * ::cuMemsetD32, ::cuMemsetD32Async, |
8883 | * ::cudaMemset2DAsync |
8884 | */ |
8885 | CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); |
8886 | |
8887 | /** |
8888 | * \brief Creates a 1D or 2D CUDA array |
8889 | * |
8890 | * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure |
8891 | * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. |
8892 | * The ::CUDA_ARRAY_DESCRIPTOR is defined as: |
8893 | * |
8894 | * \code |
8895 | typedef struct { |
8896 | unsigned int Width; |
8897 | unsigned int Height; |
8898 | CUarray_format Format; |
8899 | unsigned int NumChannels; |
8900 | } CUDA_ARRAY_DESCRIPTOR; |
8901 | * \endcode |
8902 | * where: |
8903 | * |
8904 | * - \p Width and \p Height are the width and height of the CUDA array (in |
8905 | * elements); the CUDA array is one-dimensional if \p Height is 0, two-dimensional |
8906 | * otherwise; |
8907 | * - \p Format specifies the format of the elements; ::CUarray_format is |
8908 | * defined as: |
8909 | * \code |
8910 | typedef enum CUarray_format_enum { |
8911 | CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
8912 | CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
8913 | CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
8914 | CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
8915 | CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
8916 | CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
8917 | CU_AD_FORMAT_HALF = 0x10, |
8918 | CU_AD_FORMAT_FLOAT = 0x20 |
8919 | } CUarray_format; |
8920 | * \endcode |
8921 | * - \p NumChannels specifies the number of packed components per CUDA array |
8922 | * element; it may be 1, 2, or 4; |
8923 | * |
8924 | * Here are examples of CUDA array descriptions: |
8925 | * |
8926 | * Description for a CUDA array of 2048 floats: |
8927 | * \code |
8928 | CUDA_ARRAY_DESCRIPTOR desc; |
8929 | desc.Format = CU_AD_FORMAT_FLOAT; |
8930 | desc.NumChannels = 1; |
8931 | desc.Width = 2048; |
8932 | desc.Height = 1; |
8933 | * \endcode |
8934 | * |
8935 | * Description for a 64 x 64 CUDA array of floats: |
8936 | * \code |
8937 | CUDA_ARRAY_DESCRIPTOR desc; |
8938 | desc.Format = CU_AD_FORMAT_FLOAT; |
8939 | desc.NumChannels = 1; |
8940 | desc.Width = 64; |
8941 | desc.Height = 64; |
8942 | * \endcode |
8943 | * |
8944 | * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit |
8945 | * float16's: |
8946 | * \code |
8947 | CUDA_ARRAY_DESCRIPTOR desc; |
8948 | desc.Format = CU_AD_FORMAT_HALF; |
8949 | desc.NumChannels = 4; |
8950 | desc.Width = width; |
8951 | desc.Height = height; |
8952 | * \endcode |
8953 | * |
8954 | * Description for a \p width x \p height CUDA array of 16-bit elements, each |
8955 | * of which is two 8-bit unsigned chars: |
8956 | * \code |
8957 | CUDA_ARRAY_DESCRIPTOR desc; |
8958 | desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; |
8959 | desc.NumChannels = 2; |
8960 | desc.Width = width; |
8961 | desc.Height = height; |
8962 | * \endcode |
8963 | * |
8964 | * \param pHandle - Returned array |
8965 | * \param pAllocateArray - Array descriptor |
8966 | * |
8967 | * \return |
8968 | * ::CUDA_SUCCESS, |
8969 | * ::CUDA_ERROR_DEINITIALIZED, |
8970 | * ::CUDA_ERROR_NOT_INITIALIZED, |
8971 | * ::CUDA_ERROR_INVALID_CONTEXT, |
8972 | * ::CUDA_ERROR_INVALID_VALUE, |
8973 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
8974 | * ::CUDA_ERROR_UNKNOWN |
8975 | * \notefnerr |
8976 | * |
8977 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, |
8978 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
8979 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
8980 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
8981 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
8982 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
8983 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
8984 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
8985 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
8986 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
8987 | * ::cudaMallocArray |
8988 | */ |
8989 | CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); |
8990 | |
8991 | /** |
8992 | * \brief Get a 1D or 2D CUDA array descriptor |
8993 | * |
8994 | * Returns in \p *pArrayDescriptor a descriptor containing information on the |
8995 | * format and dimensions of the CUDA array \p hArray. It is useful for |
8996 | * subroutines that have been passed a CUDA array, but need to know the CUDA |
8997 | * array parameters for validation or other purposes. |
8998 | * |
8999 | * \param pArrayDescriptor - Returned array descriptor |
9000 | * \param hArray - Array to get descriptor of |
9001 | * |
9002 | * \return |
9003 | * ::CUDA_SUCCESS, |
9004 | * ::CUDA_ERROR_DEINITIALIZED, |
9005 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9006 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9007 | * ::CUDA_ERROR_INVALID_VALUE, |
9008 | * ::CUDA_ERROR_INVALID_HANDLE |
9009 | * \notefnerr |
9010 | * |
9011 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
9012 | * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, |
9013 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
9014 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
9015 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
9016 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
9017 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
9018 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
9019 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
9020 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
9021 | * ::cudaArrayGetInfo |
9022 | */ |
9023 | CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); |
9024 | |
9025 | /** |
9026 | * \brief Returns the layout properties of a sparse CUDA array |
9027 | * |
9028 | * Returns the layout properties of a sparse CUDA array in \p sparseProperties. |
9029 | * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE, |
9030 | * ::CUDA_ERROR_INVALID_VALUE will be returned. |
9031 | * |
9032 | * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, |
9033 | * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. |
9034 | * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. |
9035 | * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained |
9036 | * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties |
9037 | * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs. |
9038 | * |
9039 | * \return |
9040 | * ::CUDA_SUCCESS, |
9041 | * ::CUDA_ERROR_INVALID_VALUE |
9042 | * |
9043 | * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES |
9044 | * \param[in] array - CUDA array to get the sparse properties of |
9045 | * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync |
9046 | */ |
9047 | CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); |
9048 | |
9049 | /** |
9050 | * \brief Returns the layout properties of a sparse CUDA mipmapped array |
9051 | * |
9052 | * Returns the sparse array layout properties in \p sparseProperties. |
9053 | * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE, |
9054 | * ::CUDA_ERROR_INVALID_VALUE will be returned. |
9055 | * |
9056 | * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the |
9057 | * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth |
9058 | * is less than that of the tile. |
9059 | * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, |
9060 | * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. |
9061 | * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. |
9062 | * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. |
9063 | * |
9064 | * \return |
9065 | * ::CUDA_SUCCESS, |
9066 | * ::CUDA_ERROR_INVALID_VALUE |
9067 | * |
9068 | * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES |
9069 | * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of |
9070 | * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync |
9071 | */ |
9072 | CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); |
9073 | |
9074 | /** |
9075 | * \brief Gets a CUDA array plane from a CUDA array |
9076 | * |
9077 | * Returns in \p pPlaneArray a CUDA array that represents a single format plane |
9078 | * of the CUDA array \p hArray. |
9079 | * |
9080 | * If \p planeIdx is greater than the maximum number of planes in this array, or if the array does |
9081 | * not have a multi-planar format (e.g. ::CU_AD_FORMAT_NV12), then ::CUDA_ERROR_INVALID_VALUE is returned. |
9082 | * |
9083 | * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns |
9084 | * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. |
9085 | * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width |
9086 | * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. |
9087 | * |
9088 | * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx |
9089 | * \param hArray - Multiplanar CUDA array |
9090 | * \param planeIdx - Plane index |
9091 | * |
9092 | * \return |
9093 | * ::CUDA_SUCCESS, |
9094 | * ::CUDA_ERROR_DEINITIALIZED, |
9095 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9096 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9097 | * ::CUDA_ERROR_INVALID_VALUE, |
9098 | * ::CUDA_ERROR_INVALID_HANDLE |
9099 | * \notefnerr |
9100 | * |
9101 | * \sa |
9102 | * ::cuArrayCreate, |
9103 | * ::cudaGetArrayPlane |
9104 | */ |
9105 | CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); |
9106 | |
9107 | /** |
9108 | * \brief Destroys a CUDA array |
9109 | * |
9110 | * Destroys the CUDA array \p hArray. |
9111 | * |
9112 | * \param hArray - Array to destroy |
9113 | * |
9114 | * \return |
9115 | * ::CUDA_SUCCESS, |
9116 | * ::CUDA_ERROR_DEINITIALIZED, |
9117 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9118 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9119 | * ::CUDA_ERROR_INVALID_HANDLE, |
9120 | * ::CUDA_ERROR_ARRAY_IS_MAPPED, |
9121 | * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
9122 | * \notefnerr |
9123 | * |
9124 | * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
9125 | * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
9126 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
9127 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
9128 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
9129 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
9130 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
9131 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
9132 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
9133 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
9134 | * ::cudaFreeArray |
9135 | */ |
9136 | CUresult CUDAAPI cuArrayDestroy(CUarray hArray); |
9137 | |
9138 | /** |
9139 | * \brief Creates a 3D CUDA array |
9140 | * |
9141 | * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure |
9142 | * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. |
9143 | * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: |
9144 | * |
9145 | * \code |
9146 | typedef struct { |
9147 | unsigned int Width; |
9148 | unsigned int Height; |
9149 | unsigned int Depth; |
9150 | CUarray_format Format; |
9151 | unsigned int NumChannels; |
9152 | unsigned int Flags; |
9153 | } CUDA_ARRAY3D_DESCRIPTOR; |
9154 | * \endcode |
9155 | * where: |
9156 | * |
9157 | * - \p Width, \p Height, and \p Depth are the width, height, and depth of the |
9158 | * CUDA array (in elements); the following types of CUDA arrays can be allocated: |
9159 | * - A 1D array is allocated if \p Height and \p Depth extents are both zero. |
9160 | * - A 2D array is allocated if only \p Depth extent is zero. |
9161 | * - A 3D array is allocated if all three extents are non-zero. |
9162 | * - A 1D layered CUDA array is allocated if only \p Height is zero and the |
9163 | * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number |
9164 | * of layers is determined by the depth extent. |
9165 | * - A 2D layered CUDA array is allocated if all three extents are non-zero and |
9166 | * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number |
9167 | * of layers is determined by the depth extent. |
9168 | * - A cubemap CUDA array is allocated if all three extents are non-zero and the |
9169 | * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and |
9170 | * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, |
9171 | * where the six layers represent the six faces of a cube. The order of the six |
9172 | * layers in memory is the same as that listed in ::CUarray_cubemap_face. |
9173 | * - A cubemap layered CUDA array is allocated if all three extents are non-zero, |
9174 | * and both the ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. |
9175 | * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. |
9176 | * A cubemap layered CUDA array is a special type of 2D layered CUDA array that |
9177 | * consists of a collection of cubemaps. The first six layers represent the first |
9178 | * cubemap, the next six layers form the second cubemap, and so on. |
9179 | * |
9180 | * - \p Format specifies the format of the elements; ::CUarray_format is |
9181 | * defined as: |
9182 | * \code |
9183 | typedef enum CUarray_format_enum { |
9184 | CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
9185 | CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
9186 | CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
9187 | CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
9188 | CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
9189 | CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
9190 | CU_AD_FORMAT_HALF = 0x10, |
9191 | CU_AD_FORMAT_FLOAT = 0x20 |
9192 | } CUarray_format; |
9193 | * \endcode |
9194 | * |
9195 | * - \p NumChannels specifies the number of packed components per CUDA array |
9196 | * element; it may be 1, 2, or 4; |
9197 | * |
9198 | * - \p Flags may be set to |
9199 | * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, |
9200 | * \p Depth specifies the number of layers, not the depth of a 3D array. |
9201 | * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. |
9202 | * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array |
9203 | * to a surface reference. |
9204 | * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be |
9205 | * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, |
9206 | * then \p Depth must be a multiple of six. |
9207 | * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. |
9208 | * Texture gather can only be performed on 2D CUDA arrays. |
9209 | * |
9210 | * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. |
9211 | * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute |
9212 | * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute |
9213 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. |
9214 | * |
9215 | * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag |
9216 | * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH |
9217 | * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. |
9218 | * |
9219 | * <table> |
9220 | * <tr><td><b>CUDA array type</b></td> |
9221 | * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), |
9222 | * (depth range)}</b></td> |
9223 | * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> |
9224 | * {(width range in elements), (height range), (depth range)}</b></td></tr> |
9225 | * <tr><td>1D</td> |
9226 | * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td> |
9227 | * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> |
9228 | * <tr><td>2D</td> |
9229 | * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td> |
9230 | * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> |
9231 | * <tr><td>3D</td> |
9232 | * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } |
9233 | * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), |
9234 | * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> |
9235 | * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), |
9236 | * (1,SURFACE3D_DEPTH) }</small></td></tr> |
9237 | * <tr><td>1D Layered</td> |
9238 | * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, |
9239 | * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> |
9240 | * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, |
9241 | * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> |
9242 | * <tr><td>2D Layered</td> |
9243 | * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), |
9244 | * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> |
9245 | * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), |
9246 | * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> |
9247 | * <tr><td>Cubemap</td> |
9248 | * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> |
9249 | * <td><small>{ (1,SURFACECUBEMAP_WIDTH), |
9250 | * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> |
9251 | * <tr><td>Cubemap Layered</td> |
9252 | * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), |
9253 | * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> |
9254 | * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), |
9255 | * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> |
9256 | * </table> |
9257 | * |
9258 | * Here are examples of CUDA array descriptions: |
9259 | * |
9260 | * Description for a CUDA array of 2048 floats: |
9261 | * \code |
9262 | CUDA_ARRAY3D_DESCRIPTOR desc; |
9263 | desc.Format = CU_AD_FORMAT_FLOAT; |
9264 | desc.NumChannels = 1; |
9265 | desc.Width = 2048; |
9266 | desc.Height = 0; |
9267 | desc.Depth = 0; |
9268 | * \endcode |
9269 | * |
9270 | * Description for a 64 x 64 CUDA array of floats: |
9271 | * \code |
9272 | CUDA_ARRAY3D_DESCRIPTOR desc; |
9273 | desc.Format = CU_AD_FORMAT_FLOAT; |
9274 | desc.NumChannels = 1; |
9275 | desc.Width = 64; |
9276 | desc.Height = 64; |
9277 | desc.Depth = 0; |
9278 | * \endcode |
9279 | * |
9280 | * Description for a \p width x \p height x \p depth CUDA array of 64-bit, |
9281 | * 4x16-bit float16's: |
9282 | * \code |
9283 | CUDA_ARRAY3D_DESCRIPTOR desc; |
9284 | desc.Format = CU_AD_FORMAT_HALF; |
9285 | desc.NumChannels = 4; |
9286 | desc.Width = width; |
9287 | desc.Height = height; |
9288 | desc.Depth = depth; |
9289 | * \endcode |
9290 | * |
9291 | * \param pHandle - Returned array |
9292 | * \param pAllocateArray - 3D array descriptor |
9293 | * |
9294 | * \return |
9295 | * ::CUDA_SUCCESS, |
9296 | * ::CUDA_ERROR_DEINITIALIZED, |
9297 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9298 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9299 | * ::CUDA_ERROR_INVALID_VALUE, |
9300 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
9301 | * ::CUDA_ERROR_UNKNOWN |
9302 | * \notefnerr |
9303 | * |
9304 | * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, |
9305 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
9306 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
9307 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
9308 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
9309 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
9310 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
9311 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
9312 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
9313 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
9314 | * ::cudaMalloc3DArray |
9315 | */ |
9316 | CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); |
9317 | |
9318 | /** |
9319 | * \brief Get a 3D CUDA array descriptor |
9320 | * |
9321 | * Returns in \p *pArrayDescriptor a descriptor containing information on the |
9322 | * format and dimensions of the CUDA array \p hArray. It is useful for |
9323 | * subroutines that have been passed a CUDA array, but need to know the CUDA |
9324 | * array parameters for validation or other purposes. |
9325 | * |
9326 | * This function may be called on 1D and 2D arrays, in which case the \p Height |
9327 | * and/or \p Depth members of the descriptor struct will be set to 0. |
9328 | * |
9329 | * \param pArrayDescriptor - Returned 3D array descriptor |
9330 | * \param hArray - 3D array to get descriptor of |
9331 | * |
9332 | * \return |
9333 | * ::CUDA_SUCCESS, |
9334 | * ::CUDA_ERROR_DEINITIALIZED, |
9335 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9336 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9337 | * ::CUDA_ERROR_INVALID_VALUE, |
9338 | * ::CUDA_ERROR_INVALID_HANDLE, |
9339 | * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
9340 | * \notefnerr |
9341 | * |
9342 | * \sa ::cuArray3DCreate, ::cuArrayCreate, |
9343 | * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
9344 | * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
9345 | * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
9346 | * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
9347 | * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
9348 | * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
9349 | * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
9350 | * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
9351 | * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
9352 | * ::cudaArrayGetInfo |
9353 | */ |
9354 | CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); |
9355 | |
9356 | /** |
9357 | * \brief Creates a CUDA mipmapped array |
9358 | * |
9359 | * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure |
9360 | * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. |
9361 | * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is |
9362 | * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. |
9363 | * |
9364 | * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: |
9365 | * |
9366 | * \code |
9367 | typedef struct { |
9368 | unsigned int Width; |
9369 | unsigned int Height; |
9370 | unsigned int Depth; |
9371 | CUarray_format Format; |
9372 | unsigned int NumChannels; |
9373 | unsigned int Flags; |
9374 | } CUDA_ARRAY3D_DESCRIPTOR; |
9375 | * \endcode |
9376 | * where: |
9377 | * |
9378 | * - \p Width, \p Height, and \p Depth are the width, height, and depth of the |
9379 | * CUDA array (in elements); the following types of CUDA arrays can be allocated: |
9380 | * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. |
9381 | * - A 2D mipmapped array is allocated if only \p Depth extent is zero. |
9382 | * - A 3D mipmapped array is allocated if all three extents are non-zero. |
9383 | * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the |
9384 | * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number |
9385 | * of layers is determined by the depth extent. |
9386 | * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and |
9387 | * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number |
9388 | * of layers is determined by the depth extent. |
9389 | * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the |
9390 | * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and |
9391 | * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, |
9392 | * where the six layers represent the six faces of a cube. The order of the six |
9393 | * layers in memory is the same as that listed in ::CUarray_cubemap_face. |
9394 | * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, |
9395 | * and both the ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. |
9396 | * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. |
9397 | * A cubemap layered CUDA array is a special type of 2D layered CUDA array that |
9398 | * consists of a collection of cubemaps. The first six layers represent the first |
9399 | * cubemap, the next six layers form the second cubemap, and so on. |
9400 | * |
9401 | * - \p Format specifies the format of the elements; ::CUarray_format is |
9402 | * defined as: |
9403 | * \code |
9404 | typedef enum CUarray_format_enum { |
9405 | CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
9406 | CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
9407 | CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
9408 | CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
9409 | CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
9410 | CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
9411 | CU_AD_FORMAT_HALF = 0x10, |
9412 | CU_AD_FORMAT_FLOAT = 0x20 |
9413 | } CUarray_format; |
9414 | * \endcode |
9415 | * |
9416 | * - \p NumChannels specifies the number of packed components per CUDA array |
9417 | * element; it may be 1, 2, or 4; |
9418 | * |
9419 | * - \p Flags may be set to |
9420 | * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, |
9421 | * \p Depth specifies the number of layers, not the depth of a 3D array. |
9422 | * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of |
9423 | * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to |
9424 | * bind a mipmap level of the CUDA mipmapped array to a surface reference. |
9425 | * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be |
9426 | * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, |
9427 | * then \p Depth must be a multiple of six. |
9428 | * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. |
9429 | * Texture gather can only be performed on 2D CUDA mipmapped arrays. |
9430 | * |
9431 | * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. |
9432 | * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute |
9433 | * is not specified. For example, TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute |
9434 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. |
9435 | * |
9436 | * <table> |
9437 | * <tr><td><b>CUDA array type</b></td> |
9438 | * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), |
9439 | * (depth range)}</b></td> |
9440 | * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> |
9441 | * {(width range in elements), (height range), (depth range)}</b></td></tr> |
9442 | * <tr><td>1D</td> |
9443 | * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td> |
9444 | * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> |
9445 | * <tr><td>2D</td> |
9446 | * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td> |
9447 | * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> |
9448 | * <tr><td>3D</td> |
9449 | * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } |
9450 | * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), |
9451 | * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> |
9452 | * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), |
9453 | * (1,SURFACE3D_DEPTH) }</small></td></tr> |
9454 | * <tr><td>1D Layered</td> |
9455 | * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, |
9456 | * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> |
9457 | * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, |
9458 | * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> |
9459 | * <tr><td>2D Layered</td> |
9460 | * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), |
9461 | * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> |
9462 | * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), |
9463 | * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> |
9464 | * <tr><td>Cubemap</td> |
9465 | * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> |
9466 | * <td><small>{ (1,SURFACECUBEMAP_WIDTH), |
9467 | * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> |
9468 | * <tr><td>Cubemap Layered</td> |
9469 | * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), |
9470 | * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> |
9471 | * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), |
9472 | * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> |
9473 | * </table> |
9474 | * |
9475 | * |
9476 | * \param pHandle - Returned mipmapped array |
9477 | * \param pMipmappedArrayDesc - mipmapped array descriptor |
9478 | * \param numMipmapLevels - Number of mipmap levels |
9479 | * |
9480 | * \return |
9481 | * ::CUDA_SUCCESS, |
9482 | * ::CUDA_ERROR_DEINITIALIZED, |
9483 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9484 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9485 | * ::CUDA_ERROR_INVALID_VALUE, |
9486 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
9487 | * ::CUDA_ERROR_UNKNOWN |
9488 | * \notefnerr |
9489 | * |
9490 | * \sa |
9491 | * ::cuMipmappedArrayDestroy, |
9492 | * ::cuMipmappedArrayGetLevel, |
9493 | * ::cuArrayCreate, |
9494 | * ::cudaMallocMipmappedArray |
9495 | */ |
9496 | CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); |
9497 | |
9498 | /** |
9499 | * \brief Gets a mipmap level of a CUDA mipmapped array |
9500 | * |
9501 | * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level |
9502 | * of the CUDA mipmapped array \p hMipmappedArray. |
9503 | * |
9504 | * If \p level is greater than the maximum number of levels in this mipmapped array, |
9505 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
9506 | * |
9507 | * \param pLevelArray - Returned mipmap level CUDA array |
9508 | * \param hMipmappedArray - CUDA mipmapped array |
9509 | * \param level - Mipmap level |
9510 | * |
9511 | * \return |
9512 | * ::CUDA_SUCCESS, |
9513 | * ::CUDA_ERROR_DEINITIALIZED, |
9514 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9515 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9516 | * ::CUDA_ERROR_INVALID_VALUE, |
9517 | * ::CUDA_ERROR_INVALID_HANDLE |
9518 | * \notefnerr |
9519 | * |
9520 | * \sa |
9521 | * ::cuMipmappedArrayCreate, |
9522 | * ::cuMipmappedArrayDestroy, |
9523 | * ::cuArrayCreate, |
9524 | * ::cudaGetMipmappedArrayLevel |
9525 | */ |
9526 | CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); |
9527 | |
9528 | /** |
9529 | * \brief Destroys a CUDA mipmapped array |
9530 | * |
9531 | * Destroys the CUDA mipmapped array \p hMipmappedArray. |
9532 | * |
9533 | * \param hMipmappedArray - Mipmapped array to destroy |
9534 | * |
9535 | * \return |
9536 | * ::CUDA_SUCCESS, |
9537 | * ::CUDA_ERROR_DEINITIALIZED, |
9538 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9539 | * ::CUDA_ERROR_INVALID_CONTEXT, |
9540 | * ::CUDA_ERROR_INVALID_HANDLE, |
9541 | * ::CUDA_ERROR_ARRAY_IS_MAPPED, |
9542 | * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
9543 | * \notefnerr |
9544 | * |
9545 | * \sa |
9546 | * ::cuMipmappedArrayCreate, |
9547 | * ::cuMipmappedArrayGetLevel, |
9548 | * ::cuArrayCreate, |
9549 | * ::cudaFreeMipmappedArray |
9550 | */ |
9551 | CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); |
9552 | |
9553 | /** @} */ /* END CUDA_MEM */ |
9554 | |
9555 | /** |
9556 | * \defgroup CUDA_VA Virtual Memory Management |
9557 | * |
9558 | * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API |
9559 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
9560 | * |
9561 | * This section describes the virtual memory management functions of the low-level CUDA |
9562 | * driver application programming interface. |
9563 | * |
9564 | * @{ |
9565 | */ |
9566 | |
9567 | /** |
9568 | * \brief Allocate an address range reservation. |
9569 | * |
9570 | * Reserves a virtual address range based on the given parameters, giving |
9571 | * the starting address of the range in \p ptr. This API requires a system that |
9572 | * supports UVA. The size and address parameters must be a multiple of the |
9573 | * host page size and the alignment must be a power of two or zero for default |
9574 | * alignment. |
9575 | * |
9576 | * \param[out] ptr - Resulting pointer to start of virtual address range allocated |
9577 | * \param[in] size - Size of the reserved virtual address range requested |
9578 | * \param[in] alignment - Alignment of the reserved virtual address range requested |
9579 | * \param[in] addr - Fixed starting address range requested |
9580 | * \param[in] flags - Currently unused, must be zero |
9581 | * \return |
9582 | * ::CUDA_SUCCESS, |
9583 | * ::CUDA_ERROR_INVALID_VALUE, |
9584 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
9585 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9586 | * ::CUDA_ERROR_DEINITIALIZED, |
9587 | * ::CUDA_ERROR_NOT_PERMITTED, |
9588 | * ::CUDA_ERROR_NOT_SUPPORTED |
9589 | * |
9590 | * \sa ::cuMemAddressFree |
9591 | */ |
9592 | CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); |
9593 | |
9594 | /** |
9595 | * \brief Free an address range reservation. |
9596 | * |
9597 | * Frees a virtual address range reserved by ::cuMemAddressReserve. The \p size |
9598 | * must match what was given to ::cuMemAddressReserve and the \p ptr given must |
9599 | * match what was returned from ::cuMemAddressReserve. |
9600 | * |
9601 | * \param[in] ptr - Starting address of the virtual address range to free |
9602 | * \param[in] size - Size of the virtual address region to free |
9603 | * \return |
9604 | * ::CUDA_SUCCESS, |
9605 | * ::CUDA_ERROR_INVALID_VALUE, |
9606 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9607 | * ::CUDA_ERROR_DEINITIALIZED, |
9608 | * ::CUDA_ERROR_NOT_PERMITTED, |
9609 | * ::CUDA_ERROR_NOT_SUPPORTED |
9610 | * |
9611 | * \sa ::cuMemAddressReserve |
9612 | */ |
9613 | CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); |
9614 | |
9615 | /** |
9616 | * \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties |
9617 | * |
9618 | * This creates a memory allocation on the target device specified through the |
9619 | * \p prop structure. The created allocation will not have any device or host |
9620 | * mappings. The generic memory \p handle for the allocation can be |
9621 | * mapped to the address space of calling process via ::cuMemMap. This handle |
9622 | * cannot be transmitted directly to other processes (see |
9623 | * ::cuMemExportToShareableHandle). On Windows, the caller must also pass |
9624 | * an LPSECURITY_ATTRIBUTES in \p prop to be associated with this handle which |
9625 | * limits or allows access to this handle for a recipient process (see |
9626 | * ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this |
9627 | * allocation must be a multiple of the value given via |
9628 | * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM |
9629 | * flag. |
9630 | * If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then |
9631 | * the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays |
9632 | * and sparse CUDA mipmapped arrays. |
9633 | * (see ::cuMemMapArrayAsync). |
9634 | * |
9635 | * \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. |
9636 | * \param[in] size - Size of the allocation requested |
9637 | * \param[in] prop - Properties of the allocation to create. |
9638 | * \param[in] flags - flags for future use, must be zero now. |
9639 | * \return |
9640 | * ::CUDA_SUCCESS, |
9641 | * ::CUDA_ERROR_INVALID_VALUE, |
9642 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
9643 | * ::CUDA_ERROR_INVALID_DEVICE, |
9644 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9645 | * ::CUDA_ERROR_DEINITIALIZED, |
9646 | * ::CUDA_ERROR_NOT_PERMITTED, |
9647 | * ::CUDA_ERROR_NOT_SUPPORTED |
9648 | * \notefnerr |
9649 | * |
9650 | * \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle |
9651 | */ |
9652 | CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); |
9653 | |
9654 | /** |
9655 | * \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. |
9656 | * |
9657 | * Frees the memory that was allocated on a device through cuMemCreate. |
9658 | * |
9659 | * The memory allocation will be freed when all outstanding mappings to the memory |
9660 | * are unmapped and when all outstanding references to the handle (including its |
9661 | * shareable counterparts) are also released. The generic memory handle can be |
9662 | * freed when there are still outstanding mappings made with this handle. Each |
9663 | * time a recipient process imports a shareable handle, it needs to pair it with |
9664 | * ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle |
9665 | * the behavior is undefined. |
9666 | * |
9667 | * \param[in] handle Value of handle which was returned previously by cuMemCreate. |
9668 | * \return |
9669 | * ::CUDA_SUCCESS, |
9670 | * ::CUDA_ERROR_INVALID_VALUE, |
9671 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9672 | * ::CUDA_ERROR_DEINITIALIZED, |
9673 | * ::CUDA_ERROR_NOT_PERMITTED, |
9674 | * ::CUDA_ERROR_NOT_SUPPORTED |
9675 | * \notefnerr |
9676 | * |
9677 | * \sa ::cuMemCreate |
9678 | */ |
9679 | CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); |
9680 | |
9681 | /** |
9682 | * \brief Maps an allocation handle to a reserved virtual address range. |
9683 | * |
9684 | * Maps \p size bytes of the memory represented by \p handle, starting at byte |
9685 | * \p offset, to the address range [\p ptr, \p ptr + \p size]. This range must be an |
9686 | * address reservation previously reserved with ::cuMemAddressReserve, and |
9687 | * \p offset + \p size must be less than the size of the memory allocation. |
9688 | * \p ptr, \p size, and \p offset must each be a multiple of the value given via |
9689 | * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. |
9690 | * |
9691 | * Please note calling ::cuMemMap does not make the address accessible, |
9692 | * the caller needs to update accessibility of a contiguous mapped VA |
9693 | * range by calling ::cuMemSetAccess. |
9694 | * |
9695 | * Once a recipient process obtains a shareable memory handle |
9696 | * from ::cuMemImportFromShareableHandle, the process must |
9697 | * use ::cuMemMap to map the memory into its address ranges before |
9698 | * setting accessibility with ::cuMemSetAccess. |
9699 | * |
9700 | * ::cuMemMap can only create mappings on VA range reservations |
9701 | * that are not currently mapped. |
9702 | * |
9703 | * \param[in] ptr - Address where memory will be mapped. |
9704 | * \param[in] size - Size of the memory mapping. |
9705 | * \param[in] offset - Offset into the memory represented by |
9706 | * - \p handle from which to start mapping |
9707 | * - Note: currently must be zero. |
9708 | * \param[in] handle - Handle to a shareable memory |
9709 | * \param[in] flags - flags for future use, must be zero now. |
9710 | * \return |
9711 | * ::CUDA_SUCCESS, |
9712 | * ::CUDA_ERROR_INVALID_VALUE, |
9713 | * ::CUDA_ERROR_INVALID_DEVICE, |
9714 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
9715 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9716 | * ::CUDA_ERROR_DEINITIALIZED, |
9717 | * ::CUDA_ERROR_NOT_PERMITTED, |
9718 | * ::CUDA_ERROR_NOT_SUPPORTED |
9719 | * \notefnerr |
9720 | * |
9721 | * \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle |
9722 | */ |
9723 | CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); |
9724 | |
9725 | /** |
9726 | * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays |
9727 | * |
9728 | * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. |
9729 | * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. |
9730 | * The structure ::CUarrayMapInfo is defined as follows: |
9731 | \code |
9732 | typedef struct CUarrayMapInfo_st { |
9733 | CUresourcetype resourceType; |
9734 | union { |
9735 | CUmipmappedArray mipmap; |
9736 | CUarray array; |
9737 | } resource; |
9738 | |
9739 | CUarraySparseSubresourceType subresourceType; |
9740 | union { |
9741 | struct { |
9742 | unsigned int level; |
9743 | unsigned int layer; |
9744 | unsigned int offsetX; |
9745 | unsigned int offsetY; |
9746 | unsigned int offsetZ; |
9747 | unsigned int extentWidth; |
9748 | unsigned int extentHeight; |
9749 | unsigned int extentDepth; |
9750 | } sparseLevel; |
9751 | struct { |
9752 | unsigned int layer; |
9753 | unsigned long long offset; |
9754 | unsigned long long size; |
9755 | } miptail; |
9756 | } subresource; |
9757 | |
9758 | CUmemOperationType memOperationType; |
9759 | |
9760 | CUmemHandleType memHandleType; |
9761 | union { |
9762 | CUmemGenericAllocationHandle memHandle; |
9763 | } memHandle; |
9764 | |
9765 | unsigned long long offset; |
9766 | unsigned int deviceBitMask; |
9767 | unsigned int flags; |
9768 | unsigned int reserved[2]; |
9769 | } CUarrayMapInfo; |
9770 | \endcode |
9771 | * |
9772 | * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. |
9773 | * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then |
9774 | * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. |
9775 | * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using |
9776 | * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. |
9777 | * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. |
9778 | * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY |
9779 | * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. |
9780 | * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been |
9781 | * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE. |
9782 | * |
9783 | * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. |
9784 | * ::CUarraySparseSubresourceType_enum is defined as: |
9785 | \code |
9786 | typedef enum CUarraySparseSubresourceType_enum { |
9787 | CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, |
9788 | CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 |
9789 | } CUarraySparseSubresourceType; |
9790 | \endcode |
9791 | * |
9792 | * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a |
9793 | * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which |
9794 | * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by |
9795 | * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. |
9796 | * |
9797 | * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL |
9798 | * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. |
9799 | * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY |
9800 | * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. |
9801 | * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight |
9802 | * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. |
9803 | * These offsets and extents must be aligned to the corresponding tile dimension. |
9804 | * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, |
9805 | * must be zero. |
9806 | * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, |
9807 | * must be zero. |
9808 | * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth |
9809 | * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. |
9810 | * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties |
9811 | * |
9812 | * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL |
9813 | * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in |
9814 | * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. |
9815 | * Both the mip tail offset and the mip tail size must be aligned to the tile size. |
9816 | * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags |
9817 | * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. |
9818 | * Otherwise, must be zero. |
9819 | * |
9820 | * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: |
9821 | \code |
9822 | typedef enum CUmemOperationType_enum { |
9823 | CU_MEM_OPERATION_TYPE_MAP = 1, |
9824 | CU_MEM_OPERATION_TYPE_UNMAP = 2 |
9825 | } CUmemOperationType; |
9826 | \endcode |
9827 | * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource |
9828 | * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. |
9829 | * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, |
9830 | * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. |
9831 | * |
9832 | * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation |
9833 | * is performed. ::CUarrayMapInfo::memHandle must be NULL. |
9834 | * |
9835 | * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. |
9836 | * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. |
9837 | * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match |
9838 | * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. |
9839 | * |
9840 | * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. |
9841 | * |
9842 | * \return |
9843 | * ::CUDA_SUCCESS, |
9844 | * ::CUDA_ERROR_INVALID_VALUE, |
9845 | * ::CUDA_ERROR_INVALID_HANDLE |
9846 | * |
9847 | * \param[in] mapInfoList - List of ::CUarrayMapInfo |
9848 | * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList |
9849 | * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations |
9850 | * |
9851 | * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties |
9852 | */ |
9853 | CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); |
9854 | |
9855 | /** |
9856 | * \brief Unmap the backing memory of a given address range. |
9857 | * |
9858 | * The range must be the entire contiguous address range that was mapped to. In |
9859 | * other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped |
9860 | * by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed |
9861 | * if there are no existing mappings and there are no unreleased memory handles. |
9862 | * |
9863 | * When ::cuMemUnmap returns successfully the address range is converted to an |
9864 | * address reservation and can be used for future calls to ::cuMemMap. Any new |
9865 | * mapping to this virtual address will need to have access granted through |
9866 | * ::cuMemSetAccess, as all mappings start with no accessibility setup. |
9867 | * |
9868 | * \param[in] ptr - Starting address for the virtual address range to unmap |
9869 | * \param[in] size - Size of the virtual address range to unmap |
9870 | * \returns |
9871 | * ::CUDA_SUCCESS, |
9872 | * ::CUDA_ERROR_INVALID_VALUE, |
9873 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9874 | * ::CUDA_ERROR_DEINITIALIZED, |
9875 | * ::CUDA_ERROR_NOT_PERMITTED, |
9876 | * ::CUDA_ERROR_NOT_SUPPORTED |
9877 | * \notefnerr |
9878 | * \note_sync |
9879 | * |
9880 | * \sa ::cuMemCreate, ::cuMemAddressReserve |
9881 | */ |
9882 | CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); |
9883 | |
9884 | /** |
9885 | * \brief Set the access flags for each location specified in \p desc for the given virtual address range |
9886 | * |
9887 | * Given the virtual address range via \p ptr and \p size, and the locations |
9888 | * in the array given by \p desc and \p count, set the access flags for the |
9889 | * target locations. The range must be a fully mapped address range |
9890 | * containing all allocations created by ::cuMemMap / ::cuMemCreate. |
9891 | * |
9892 | * \param[in] ptr - Starting address for the virtual address range |
9893 | * \param[in] size - Length of the virtual address range |
9894 | * \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the |
9895 | * - mapping for each location specified |
9896 | * \param[in] count - Number of ::CUmemAccessDesc in \p desc |
9897 | * \returns |
9898 | * ::CUDA_SUCCESS, |
9899 | * ::CUDA_ERROR_INVALID_VALUE, |
9900 | * ::CUDA_ERROR_INVALID_DEVICE, |
9901 | * ::CUDA_ERROR_NOT_SUPPORTED |
9902 | * \notefnerr |
9903 | * \note_sync |
9904 | * |
9905 | * \sa ::cuMemGetAccess, ::cuMemCreate, ::cuMemMap |
9906 | */ |
9907 | CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); |
9908 | |
9909 | /** |
9910 | * \brief Get the access \p flags set for the given \p location and \p ptr |
9911 | * |
9912 | * \param[out] flags - Flags set for this location |
9913 | * \param[in] location - Location for which to check the flags |
9914 | * \param[in] ptr - Address for which to check the access flags |
9915 | * \returns |
9916 | * ::CUDA_SUCCESS, |
9917 | * ::CUDA_ERROR_INVALID_VALUE, |
9918 | * ::CUDA_ERROR_INVALID_DEVICE, |
9919 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9920 | * ::CUDA_ERROR_DEINITIALIZED, |
9921 | * ::CUDA_ERROR_NOT_PERMITTED, |
9922 | * ::CUDA_ERROR_NOT_SUPPORTED |
9923 | * |
9924 | * \sa ::cuMemSetAccess |
9925 | */ |
9926 | CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); |
9927 | |
9928 | /** |
9929 | * \brief Exports an allocation to a requested shareable handle type |
9930 | * |
9931 | * Given a CUDA memory handle, create a shareable memory |
9932 | * allocation handle that can be used to share the memory with other |
9933 | * processes. The recipient process can convert the shareable handle back into a |
9934 | * CUDA memory handle using ::cuMemImportFromShareableHandle and map |
9935 | * it with ::cuMemMap. The implementation of what this handle is and how it |
9936 | * can be transferred is defined by the requested handle type in \p handleType |
9937 | * |
9938 | * Once all shareable handles are closed and the allocation is released, the allocated |
9939 | * memory referenced will be released back to the OS and uses of the CUDA handle afterward |
9940 | * will lead to undefined behavior. |
9941 | * |
9942 | * This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) |
9943 | * that support importing memory from the shareable type |
9944 | * |
9945 | * \param[out] shareableHandle - Pointer to the location in which to store the requested handle type |
9946 | * \param[in] handle - CUDA handle for the memory allocation |
9947 | * \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) |
9948 | * \param[in] flags - Reserved, must be zero |
9949 | * \returns |
9950 | * ::CUDA_SUCCESS, |
9951 | * ::CUDA_ERROR_INVALID_VALUE, |
9952 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9953 | * ::CUDA_ERROR_DEINITIALIZED, |
9954 | * ::CUDA_ERROR_NOT_PERMITTED, |
9955 | * ::CUDA_ERROR_NOT_SUPPORTED |
9956 | * |
9957 | * \sa ::cuMemImportFromShareableHandle |
9958 | */ |
9959 | CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); |
9960 | |
9961 | /** |
9962 | * \brief Imports an allocation from a requested shareable handle type. |
9963 | * |
9964 | * If the current process cannot support the memory described by this shareable |
9965 | * handle, this API will fail with ::CUDA_ERROR_NOT_SUPPORTED. |
9966 | * |
9967 | * \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.) |
9968 | * created on devices under an SLI group may not be supported, and thus this API will |
9969 | * return ::CUDA_ERROR_NOT_SUPPORTED. |
9970 | * There is no guarantee that the contents of \p handle will be the same CUDA memory handle |
9971 | * for the same given OS shareable handle, or the same underlying allocation. |
9972 | * |
9973 | * \param[out] handle - CUDA Memory handle for the memory allocation. |
9974 | * \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. |
9975 | * \param[in] shHandleType - Handle type of the exported handle (a ::CUmemAllocationHandleType value). |
9976 | * \returns |
9977 | * ::CUDA_SUCCESS, |
9978 | * ::CUDA_ERROR_INVALID_VALUE, |
9979 | * ::CUDA_ERROR_NOT_INITIALIZED, |
9980 | * ::CUDA_ERROR_DEINITIALIZED, |
9981 | * ::CUDA_ERROR_NOT_PERMITTED, |
9982 | * ::CUDA_ERROR_NOT_SUPPORTED |
9983 | * |
9984 | * \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease |
9985 | */ |
9986 | CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); |
9987 | |
9988 | /** |
9989 | * \brief Calculates either the minimal or recommended granularity |
9990 | * |
9991 | * Calculates either the minimal or recommended granularity |
9992 | * for a given allocation specification and returns it in \p granularity. This |
9993 | * granularity can be used as a multiple for alignment, size, or address mapping. |
9994 | * |
9995 | * \param[out] granularity - Returned granularity. |
9996 | * \param[in] prop - Property for which to determine the granularity |
9997 | * \param[in] option - Determines which granularity to return |
9998 | * \returns |
9999 | * ::CUDA_SUCCESS, |
10000 | * ::CUDA_ERROR_INVALID_VALUE, |
10001 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10002 | * ::CUDA_ERROR_DEINITIALIZED, |
10003 | * ::CUDA_ERROR_NOT_PERMITTED, |
10004 | * ::CUDA_ERROR_NOT_SUPPORTED |
10005 | * |
10006 | * \sa ::cuMemCreate, ::cuMemMap |
10007 | */ |
10008 | CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); |
10009 | |
10010 | /** |
10011 | * \brief Retrieve the contents of the property structure defining properties for this handle |
10012 | * |
10013 | * \param[out] prop - Pointer to a properties structure which will hold the information about this handle |
10014 | * \param[in] handle - Handle which to perform the query on |
10015 | * \returns |
10016 | * ::CUDA_SUCCESS, |
10017 | * ::CUDA_ERROR_INVALID_VALUE, |
10018 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10019 | * ::CUDA_ERROR_DEINITIALIZED, |
10020 | * ::CUDA_ERROR_NOT_PERMITTED, |
10021 | * ::CUDA_ERROR_NOT_SUPPORTED |
10022 | * |
10023 | * \sa ::cuMemCreate, ::cuMemImportFromShareableHandle |
10024 | */ |
10025 | CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); |
10026 | |
10027 | /** |
10028 | * \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. |
10029 | * |
10030 | * The handle is guaranteed to be the same handle value used to map the memory. If the address |
10031 | * requested is not mapped, the function will fail. The returned handle must be released with |
10032 | * a corresponding number of calls to ::cuMemRelease. |
10033 | * |
10034 | * \note The address \p addr, can be any address in a range previously mapped |
10035 | * by ::cuMemMap, and not necessarily the start address. |
10036 | * |
10037 | * \param[out] handle CUDA Memory handle for the backing memory allocation. |
10038 | * \param[in] addr Memory address to query, that has been mapped previously. |
10039 | * \returns |
10040 | * ::CUDA_SUCCESS, |
10041 | * ::CUDA_ERROR_INVALID_VALUE, |
10042 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10043 | * ::CUDA_ERROR_DEINITIALIZED, |
10044 | * ::CUDA_ERROR_NOT_PERMITTED, |
10045 | * ::CUDA_ERROR_NOT_SUPPORTED |
10046 | * |
10047 | * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap |
10048 | */ |
10049 | CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); |
10050 | |
10051 | /** @} */ /* END CUDA_VA */ |
10052 | |
10053 | /** |
10054 | * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator |
10055 | * |
10056 | * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. |
10057 | * Functions for controlling the behavior of the underlying allocator. |
10058 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
10059 | * |
10060 | * This section describes the stream ordered memory allocator exposed by the |
10061 | * low-level CUDA driver application programming interface. |
10062 | * |
10063 | * @{ |
10064 | * |
10065 | * \section CUDA_MALLOC_ASYNC_overview overview |
10066 | * |
10067 | * The asynchronous allocator allows the user to allocate and free in stream order. |
10068 | * All asynchronous accesses of the allocation must happen between |
10069 | * the stream executions of the allocation and the free. If the memory is accessed |
10070 | * outside of the promised stream order, a use before allocation / use after free error |
10071 | * will cause undefined behavior. |
10072 | * |
10073 | * The allocator is free to reallocate the memory as long as it can guarantee |
10074 | * that compliant memory accesses will not overlap temporally. |
10075 | * The allocator may refer to internal stream ordering as well as inter-stream dependencies |
10076 | * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. |
10077 | * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. |
10078 | * |
10079 | * \section CUDA_MALLOC_ASYNC_support Supported Platforms |
10080 | * |
10081 | * Whether or not a device supports the integrated stream ordered memory allocator |
10082 | * may be queried by calling ::cuDeviceGetAttribute() with the device attribute |
10083 | * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED |
10084 | */ |
10085 | |
10086 | /** |
10087 | * \brief Frees memory with stream ordered semantics |
10088 | * |
10089 | * Inserts a free operation into \p hStream. |
10090 | * The allocation must not be accessed after stream execution reaches the free. |
10091 | * After this API returns, accessing the memory from any subsequent work launched on the GPU |
10092 | * or querying its pointer attributes results in undefined behavior. |
10093 | * |
10094 | * \note During stream capture, this function results in the creation of a free node and |
10095 | * must therefore be passed the address of a graph allocation. |
10096 | * |
10097 | * \param dptr - memory to free |
10098 | * \param hStream - The stream establishing the stream ordering contract. |
10099 | * \returns |
10100 | * ::CUDA_SUCCESS, |
10101 | * ::CUDA_ERROR_INVALID_VALUE, |
10102 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10103 | * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), |
10104 | * ::CUDA_ERROR_NOT_SUPPORTED |
10105 | */ |
10106 | CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); |
10107 | |
10108 | /** |
10109 | * \brief Allocates memory with stream ordered semantics |
10110 | * |
10111 | * Inserts an allocation operation into \p hStream. |
10112 | * A pointer to the allocated memory is returned immediately in \p *dptr. |
10113 | * The allocation must not be accessed until the allocation operation completes. |
10114 | * The allocation comes from the memory pool current to the stream's device. |
10115 | * |
10116 | * \note The default memory pool of a device contains device memory from that device. |
10117 | * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. |
10118 | * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation |
10119 | * operation completes before work submitted in a separate stream runs. |
10120 | * \note During stream capture, this function results in the creation of an allocation node. In this case, |
10121 | * the allocation is owned by the graph instead of the memory pool. The memory pool's properties |
10122 | * are used to set the node's creation parameters. |
10123 | * |
10124 | * \param[out] dptr - Returned device pointer |
10125 | * \param[in] bytesize - Number of bytes to allocate |
10126 | * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from |
10127 | * \returns |
10128 | * ::CUDA_SUCCESS, |
10129 | * ::CUDA_ERROR_INVALID_VALUE, |
10130 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10131 | * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), |
10132 | * ::CUDA_ERROR_NOT_SUPPORTED, |
10133 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10134 | * |
10135 | * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, |
10136 | * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, |
10137 | * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute |
10138 | */ |
10139 | CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); |
10140 | |
10141 | /** |
10142 | * \brief Tries to release memory back to the OS |
10143 | * |
10144 | * Releases memory back to the OS until the pool contains fewer than \p minBytesToKeep |
10145 | * reserved bytes, or there is no more memory that the allocator can safely release. |
10146 | * The allocator cannot release OS allocations that back outstanding asynchronous allocations. |
10147 | * The OS allocations may happen at different granularity from the user allocations. |
10148 | * |
10149 | * \note Allocations that have not been freed count as outstanding. |
10150 | * \note Allocations that have been asynchronously freed but whose completion has |
10151 | * not been observed on the host (e.g., by a synchronize) can count as outstanding. |
10152 | * |
10153 | * \param[in] pool - The memory pool to trim |
10154 | * \param[in] minBytesToKeep - If the pool has fewer than \p minBytesToKeep bytes reserved, |
10155 | * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have |
10156 | * at least \p minBytesToKeep bytes reserved after the operation. |
10157 | * \returns |
10158 | * ::CUDA_SUCCESS, |
10159 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10160 | * ::CUDA_ERROR_INVALID_VALUE |
10161 | * |
10162 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10163 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate |
10164 | */ |
10165 | CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); |
10166 | |
10167 | /** |
10168 | * \brief Sets attributes of a memory pool |
10169 | * |
10170 | * Supported attributes are: |
10171 | * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) |
10172 | * Amount of reserved memory in bytes to hold onto before trying |
10173 | * to release memory back to the OS. When more than the release |
10174 | * threshold bytes of memory are held by the memory pool, the |
10175 | * allocator will try to release memory back to the OS on the |
10176 | * next call to stream, event or context synchronize. (default 0) |
10177 | * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) |
10178 | * Allow ::cuMemAllocAsync to use memory asynchronously freed |
10179 | * in another stream as long as a stream ordering dependency |
10180 | * of the allocating stream on the free action exists. |
10181 | * CUDA events and null stream interactions can create the required |
10182 | * stream ordered dependencies. (default enabled) |
10183 | * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) |
10184 | * Allow reuse of already completed frees when there is no dependency |
10185 | * between the free and allocation. (default enabled) |
10186 | * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) |
10187 | * Allow ::cuMemAllocAsync to insert new stream dependencies |
10188 | * in order to establish the stream ordering required to reuse |
10189 | * a piece of memory released by ::cuMemFreeAsync. (default enabled) |
10190 | * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) |
10191 | * Reset the high watermark that tracks the amount of backing memory that was |
10192 | * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. |
10193 | * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) |
10194 | * Reset the high watermark that tracks the amount of used memory that was |
10195 | * allocated for the memory pool. |
10196 | * |
10197 | * \param[in] pool - The memory pool to modify |
10198 | * \param[in] attr - The attribute to modify |
10199 | * \param[in] value - Pointer to the value to assign; its pointee type is the value type listed for \p attr |
10200 | * |
10201 | * \returns |
10202 | * ::CUDA_SUCCESS, |
10203 | * ::CUDA_ERROR_INVALID_VALUE |
10204 | * |
10205 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10206 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate |
10207 | */ |
10208 | CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); |
10209 | |
10210 | /** |
10211 | * \brief Gets attributes of a memory pool |
10212 | * |
10213 | * Supported attributes are: |
10214 | * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) |
10215 | * Amount of reserved memory in bytes to hold onto before trying |
10216 | * to release memory back to the OS. When more than the release |
10217 | * threshold bytes of memory are held by the memory pool, the |
10218 | * allocator will try to release memory back to the OS on the |
10219 | * next call to stream, event or context synchronize. (default 0) |
10220 | * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) |
10221 | * Allow ::cuMemAllocAsync to use memory asynchronously freed |
10222 | * in another stream as long as a stream ordering dependency |
10223 | * of the allocating stream on the free action exists. |
10224 | * CUDA events and null stream interactions can create the required |
10225 | * stream ordered dependencies. (default enabled) |
10226 | * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) |
10227 | * Allow reuse of already completed frees when there is no dependency |
10228 | * between the free and allocation. (default enabled) |
10229 | * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) |
10230 | * Allow ::cuMemAllocAsync to insert new stream dependencies |
10231 | * in order to establish the stream ordering required to reuse |
10232 | * a piece of memory released by ::cuMemFreeAsync. (default enabled) |
10233 | * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) |
10234 | * Amount of backing memory currently allocated for the mempool. |
10235 | * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) |
10236 | * High watermark of backing memory allocated for the mempool since the |
10237 | * last time it was reset. |
10238 | * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) |
10239 | * Amount of memory from the pool that is currently in use by the application. |
10240 | * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) |
10241 | * High watermark of the amount of memory from the pool that was in use by the application. |
10242 | * |
10243 | * \param[in] pool - The memory pool to get attributes of |
10244 | * \param[in] attr - The attribute to get |
10245 | * \param[out] value - Retrieved value; its pointee type is the value type listed for \p attr |
10246 | * |
10247 | * \returns |
10248 | * ::CUDA_SUCCESS, |
10249 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10250 | * ::CUDA_ERROR_INVALID_VALUE |
10251 | * |
10252 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10253 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate |
10254 | */ |
10255 | CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); |
10256 | |
10257 | /** |
10258 | * \brief Controls visibility of pools between devices |
10259 | * |
10260 | * \param[in] pool - The pool being modified |
10261 | * \param[in] map - Array of access descriptors. Each descriptor specifies the access to enable for a single GPU. |
10262 | * \param[in] count - Number of descriptors in the \p map array. |
10263 | * |
10264 | * \returns |
10265 | * ::CUDA_SUCCESS, |
10266 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10267 | * ::CUDA_ERROR_INVALID_VALUE |
10268 | * |
10269 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10270 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate |
10271 | */ |
10272 | CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); |
10273 | |
10274 | /** |
10275 | * \brief Returns the accessibility of a pool from a device |
10276 | * |
10277 | * Returns in \p *flags the accessibility of the pool's memory from the specified \p location. |
10278 | * |
10279 | * \param[out] flags - The accessibility of the pool from the specified location |
10280 | * \param[in] memPool - The pool being queried |
10281 | * \param[in] location - The location accessing the pool |
10282 | * |
10283 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10284 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate |
10285 | */ |
10286 | CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); |
10287 | |
10288 | /** |
10289 | * \brief Creates a memory pool |
10290 | * |
10291 | * Creates a CUDA memory pool and returns the handle in \p pool. \p poolProps determines |
10292 | * the properties of the pool such as the backing device and IPC capabilities. |
10293 | * |
10294 | * By default, the pool's memory will be accessible from the device it is allocated on. |
10295 | * |
10296 | * \note Specifying ::CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. |
10297 | * |
10298 | * \returns |
10299 | * ::CUDA_SUCCESS, |
10300 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10301 | * ::CUDA_ERROR_INVALID_VALUE, |
10302 | * ::CUDA_ERROR_OUT_OF_MEMORY, |
10303 | * ::CUDA_ERROR_NOT_SUPPORTED |
10304 | * |
10305 | * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, |
10306 | * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle |
10307 | */ |
10308 | CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); |
10309 | |
10310 | /** |
10311 | * \brief Destroys the specified memory pool |
10312 | * |
10313 | * If any pointers obtained from this pool have not been freed or |
10314 | * the pool has free operations that have not completed |
10315 | * when ::cuMemPoolDestroy is invoked, the function will return immediately and the |
10316 | * resources associated with the pool will be released automatically |
10317 | * once there are no more outstanding allocations. |
10318 | * |
10319 | * Destroying the current mempool of a device sets the default mempool of |
10320 | * that device as the current mempool for that device. |
10321 | * |
10322 | * \note A device's default memory pool cannot be destroyed. |
10323 | * |
10324 | * \returns |
10325 | * ::CUDA_SUCCESS, |
10326 | * ::CUDA_ERROR_INVALID_VALUE |
10327 | * |
10328 | * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, |
10329 | * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate |
10330 | */ |
10331 | CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); |
10332 | |
10333 | /** |
10334 | * \brief Allocates memory from a specified pool with stream ordered semantics. |
10335 | * |
10336 | * Inserts an allocation operation into \p hStream. |
10337 | * A pointer to the allocated memory is returned immediately in \p *dptr. |
10338 | * The allocation must not be accessed until the allocation operation completes. |
10339 | * The allocation comes from the specified memory pool. |
10340 | * |
10341 | * \note |
10342 | * - The specified memory pool may be from a device different than that of the specified \p hStream. |
10343 | * |
10344 | * - Basic stream ordering allows future work submitted into the same stream to use the allocation. |
10345 | * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation |
10346 | * operation completes before work submitted in a separate stream runs. |
10347 | * |
10348 | * \note During stream capture, this function results in the creation of an allocation node. In this case, |
10349 | * the allocation is owned by the graph instead of the memory pool. The memory pool's properties |
10350 | * are used to set the node's creation parameters. |
10351 | * |
10352 | * \param[out] dptr - Returned device pointer |
10353 | * \param[in] bytesize - Number of bytes to allocate |
10354 | * \param[in] pool - The pool to allocate from |
10355 | * \param[in] hStream - The stream establishing the stream ordering semantics |
10356 | * |
10357 | * \returns |
10358 | * ::CUDA_SUCCESS, |
10359 | * ::CUDA_ERROR_INVALID_VALUE, |
10360 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10361 | * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), |
10362 | * ::CUDA_ERROR_NOT_SUPPORTED, |
10363 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10364 | * |
10365 | * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, |
10366 | * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, |
10367 | * ::cuMemPoolSetAttribute |
10368 | */ |
10369 | CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); |
10370 | |
10371 | /** |
10372 | * \brief Exports a memory pool to the requested handle type. |
10373 | * |
10374 | * Given an IPC capable mempool, create an OS handle to share the pool with another process. |
10375 | * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. |
10376 | * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. |
10377 | * The implementation of what the shareable handle is and how it can be transferred is defined by the requested |
10378 | * handle type. |
10379 | * |
10380 | * \note To create an IPC capable mempool, create a mempool with a ::CUmemAllocationHandleType other than ::CU_MEM_HANDLE_TYPE_NONE. |
10381 | * |
10382 | * \param[out] handle_out - Returned OS handle |
10383 | * \param[in] pool - Pool to export |
10384 | * \param[in] handleType - The type of handle to create |
10385 | * \param[in] flags - Must be 0 |
10386 | * |
10387 | * \returns |
10388 | * ::CUDA_SUCCESS, |
10389 | * ::CUDA_ERROR_INVALID_VALUE, |
10390 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10391 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10392 | * |
10393 | * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, |
10394 | * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, |
10395 | * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, |
10396 | * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute |
10397 | */ |
10398 | CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); |
10399 | |
10400 | /** |
10401 | * \brief Imports a memory pool from a shared handle. |
10402 | * |
10403 | * Specific allocations can be imported from the imported pool with ::cuMemPoolImportPointer. |
10404 | * |
10405 | * \note Imported memory pools do not support creating new allocations. |
10406 | * As such, imported memory pools may not be used in ::cuDeviceSetMemPool |
10407 | * or ::cuMemAllocFromPoolAsync calls. |
10408 | * |
10409 | * \param[out] pool_out - Returned memory pool |
10410 | * \param[in] handle - OS handle of the pool to open |
10411 | * \param[in] handleType - The type of handle being imported |
10412 | * \param[in] flags - Must be 0 |
10413 | * |
10414 | * \returns |
10415 | * ::CUDA_SUCCESS, |
10416 | * ::CUDA_ERROR_INVALID_VALUE, |
10417 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10418 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10419 | * |
10420 | * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer |
10421 | */ |
10422 | CUresult CUDAAPI cuMemPoolImportFromShareableHandle( |
10423 | CUmemoryPool *pool_out, |
10424 | void *handle, |
10425 | CUmemAllocationHandleType handleType, |
10426 | unsigned long long flags); |
10427 | |
10428 | /** |
10429 | * \brief Export data to share a memory pool allocation between processes. |
10430 | * |
10431 | * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. |
10432 | * The recipient process can import the allocation with the ::cuMemPoolImportPointer API. |
10433 | * The data is not a handle and may be shared through any IPC mechanism. |
10434 | * |
10435 | * \param[out] shareData_out - Returned export data |
10436 | * \param[in] ptr - Pointer to memory being exported |
10437 | * |
10438 | * \returns |
10439 | * ::CUDA_SUCCESS, |
10440 | * ::CUDA_ERROR_INVALID_VALUE, |
10441 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10442 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10443 | * |
10444 | * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer |
10445 | */ |
10446 | CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); |
10447 | |
10448 | /** |
10449 | * \brief Import a memory pool allocation from another process. |
10450 | * |
10451 | * Returns in \p ptr_out a pointer to the imported memory. |
10452 | * The imported memory must not be accessed before the allocation operation completes |
10453 | * in the exporting process. The imported memory must be freed from all importing processes before |
10454 | * being freed in the exporting process. The pointer may be freed with ::cuMemFree |
10455 | * or ::cuMemFreeAsync. If ::cuMemFreeAsync is used, the free must be completed |
10456 | * on the importing process before the free operation on the exporting process. |
10457 | * |
10458 | * \note The ::cuMemFreeAsync API may be used in the exporting process before |
10459 | * the ::cuMemFreeAsync operation completes in its stream as long as the |
10460 | * ::cuMemFreeAsync in the exporting process specifies a stream with |
10461 | * a stream dependency on the importing process's ::cuMemFreeAsync. |
10462 | * |
10463 | * \param[out] ptr_out - Pointer to imported memory |
10464 | * \param[in] pool - Pool from which to import |
10465 | * \param[in] shareData - Data specifying the memory to import |
10466 | * |
10467 | * \returns |
10468 | * ::CUDA_SUCCESS, |
10469 | * ::CUDA_ERROR_INVALID_VALUE, |
10470 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10471 | * ::CUDA_ERROR_OUT_OF_MEMORY |
10472 | * |
10473 | * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer |
10474 | */ |
10475 | CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); |
10476 | |
10477 | /** @} */ /* END CUDA_MALLOC_ASYNC */ |
10478 | |
10479 | /** |
10480 | * \defgroup CUDA_UNIFIED Unified Addressing |
10481 | * |
10482 | * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver |
10483 | * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
10484 | * |
10485 | * This section describes the unified addressing functions of the |
10486 | * low-level CUDA driver application programming interface. |
10487 | * |
10488 | * @{ |
10489 | * |
10490 | * \section CUDA_UNIFIED_overview Overview |
10491 | * |
10492 | * CUDA devices can share a unified address space with the host. |
10493 | * For these devices there is no distinction between a device |
10494 | * pointer and a host pointer -- the same pointer value may be |
10495 | * used to access memory from the host program and from a kernel |
10496 | * running on the device (with exceptions enumerated below). |
10497 | * |
10498 | * \section CUDA_UNIFIED_support Supported Platforms |
10499 | * |
10500 | * Whether or not a device supports unified addressing may be |
10501 | * queried by calling ::cuDeviceGetAttribute() with the device |
10502 | * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. |
10503 | * |
10504 | * Unified addressing is automatically enabled in 64-bit processes. |
10505 | * |
10506 | * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values |
10507 | * |
10508 | * It is possible to look up information about the memory which backs a |
10509 | * pointer value. For instance, one may want to know if a pointer points |
10510 | * to host or device memory. As another example, in the case of device |
10511 | * memory, one may want to know on which CUDA device the memory |
10512 | * resides. These properties may be queried using the function |
10513 | * ::cuPointerGetAttribute(). |
10514 | * |
10515 | * Since pointers are unique, it is not necessary to specify information |
10516 | * about the pointers specified to the various copy functions in the |
10517 | * CUDA API. The function ::cuMemcpy() may be used to perform a copy |
10518 | * between two pointers, ignoring whether they point to host or device |
10519 | * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() |
10520 | * unnecessary for devices supporting unified addressing). For |
10521 | * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be |
10522 | * used to specify that the CUDA driver should infer the location of the |
10523 | * pointer from its value. |
10524 | * |
10525 | * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory |
10526 | * |
10527 | * All host memory allocated in all contexts using ::cuMemAllocHost() and |
10528 | * ::cuMemHostAlloc() is always directly accessible from all contexts on |
10529 | * all devices that support unified addressing. This is the case regardless |
10530 | * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and |
10531 | * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. |
10532 | * |
10533 | * The pointer value through which allocated host memory may be accessed |
10534 | * in kernels on all devices that support unified addressing is the same |
10535 | * as the pointer value through which that memory is accessed on the host, |
10536 | * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device |
10537 | * pointer for these allocations. |
10538 | * |
10539 | * Note that this is not the case for memory allocated using the flag |
10540 | * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. |
10541 | * |
10542 | * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory |
10543 | * |
10544 | * Upon enabling direct access from a context that supports unified addressing |
10545 | * to another peer context that supports unified addressing using |
10546 | * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using |
10547 | * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible |
10548 | * by the current context. The device pointer value through |
10549 | * which any peer memory may be accessed in the current context |
10550 | * is the same pointer value through which that memory may be |
10551 | * accessed in the peer context. |
10552 | * |
10553 | * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing |
10554 | * |
10555 | * Not all memory may be accessed on devices through the same pointer |
10556 | * value through which they are accessed on the host. These exceptions |
10557 | * are host memory registered using ::cuMemHostRegister() and host memory |
10558 | * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these |
10559 | * exceptions, there exists a distinct host and device address for the |
10560 | * memory. The device address is guaranteed to not overlap any valid host |
10561 | * pointer range and is guaranteed to have the same value across all |
10562 | * contexts that support unified addressing. |
10563 | * |
10564 | * This device address may be queried using ::cuMemHostGetDevicePointer() |
10565 | * when a context using unified addressing is current. Either the host |
10566 | * or the unified device pointer value may be used to refer to this memory |
10567 | * through ::cuMemcpy() and similar functions using the |
10568 | * ::CU_MEMORYTYPE_UNIFIED memory type. |
10569 | * |
10570 | */ |
10571 | |
10572 | /** |
10573 | * \brief Returns information about a pointer |
10574 | * |
10575 | * The supported attributes are: |
10576 | * |
10577 | * - ::CU_POINTER_ATTRIBUTE_CONTEXT: |
10578 | * |
10579 | * Returns in \p *data the ::CUcontext in which \p ptr was allocated or |
10580 | * registered. |
10581 | * The type of \p data must be ::CUcontext *. |
10582 | * |
10583 | * If \p ptr was not allocated by, mapped by, or registered with |
10584 | * a ::CUcontext which uses unified virtual addressing then |
10585 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
10586 | * |
10587 | * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: |
10588 | * |
10589 | * Returns in \p *data the physical memory type of the memory that |
10590 | * \p ptr addresses as a ::CUmemorytype enumerated value. |
10591 | * The type of \p data must be unsigned int *. |
10592 | * |
10593 | * If \p ptr addresses device memory then \p *data is set to |
10594 | * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the |
10595 | * memory resides is the ::CUdevice of the ::CUcontext returned by the |
10596 | * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. |
10597 | * |
10598 | * If \p ptr addresses host memory then \p *data is set to |
10599 | * ::CU_MEMORYTYPE_HOST. |
10600 | * |
10601 | * If \p ptr was not allocated by, mapped by, or registered with |
10602 | * a ::CUcontext which uses unified virtual addressing then |
10603 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
10604 | * |
10605 | * If the current ::CUcontext does not support unified virtual |
10606 | * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. |
10607 | * |
10608 | * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: |
10609 | * |
10610 | * Returns in \p *data the device pointer value through which |
10611 | * \p ptr may be accessed by kernels running in the current |
10612 | * ::CUcontext. |
10613 | * The type of \p data must be CUdeviceptr *. |
10614 | * |
10615 | * If there exists no device pointer value through which |
10616 | * kernels running in the current ::CUcontext may access |
10617 | * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. |
10618 | * |
10619 | * If there is no current ::CUcontext then |
10620 | * ::CUDA_ERROR_INVALID_CONTEXT is returned. |
10621 | * |
10622 | * Except in the exceptional disjoint addressing cases discussed |
10623 | * below, the value returned in \p *data will equal the input |
10624 | * value \p ptr. |
10625 | * |
10626 | * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: |
10627 | * |
10628 | * Returns in \p *data the host pointer value through which |
10629 | * \p ptr may be accessed by the host program. |
10630 | * The type of \p data must be void **. |
10631 | * If there exists no host pointer value through which |
10632 | * the host program may directly access \p ptr then |
10633 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
10634 | * |
10635 | * Except in the exceptional disjoint addressing cases discussed |
10636 | * below, the value returned in \p *data will equal the input |
10637 | * value \p ptr. |
10638 | * |
10639 | * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: |
10640 | * |
10641 | * Returns in \p *data two tokens for use with the nv-p2p.h Linux |
10642 | * kernel interface. \p data must point to a struct of type |
10643 | * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. |
10644 | * |
10645 | * \p ptr must be a pointer to memory obtained from ::cuMemAlloc(). |
10646 | * Note that p2pToken and vaSpaceToken are only valid for the |
10647 | * lifetime of the source allocation. A subsequent allocation at |
10648 | * the same address may return completely different tokens. |
10649 | * Querying this attribute has a side effect of setting the attribute |
10650 | * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that |
10651 | * \p ptr points to. |
10652 | * |
10653 | * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: |
10654 | * |
10655 | * A boolean attribute which, when set, ensures that synchronous memory operations |
10656 | * initiated on the region of memory that \p ptr points to will always synchronize. |
10657 | * See further documentation in the section titled "API synchronization behavior" |
10658 | * to learn more about cases when synchronous memory operations can |
10659 | * exhibit asynchronous behavior. |
10660 | * |
10661 | * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: |
10662 | * |
10663 | * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. |
10664 | * \p data must point to an unsigned long long. |
10665 | * |
10666 | * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. |
10667 | * Every memory allocation from any of the CUDA memory allocation APIs will |
10668 | * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs |
10669 | * from previous freed allocations. IDs are only unique within a single process. |
10670 | * |
10671 | * |
10672 | * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: |
10673 | * |
10674 | * Returns in \p *data a boolean that indicates whether the pointer points to |
10675 | * managed memory or not. |
10676 | * |
10677 | * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. |
10678 | * |
10679 | * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: |
10680 | * |
10681 | * Returns in \p *data an integer representing a device ordinal of a device against |
10682 | * which the memory was allocated or registered. |
10683 | * |
10684 | * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: |
10685 | * |
10686 | * Returns in \p *data a boolean that indicates if this pointer maps to |
10687 | * an allocation that is suitable for ::cudaIpcGetMemHandle. |
10688 | * |
10689 | * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: |
10690 | * |
10691 | * Returns in \p *data the starting address for the allocation referenced |
10692 | * by the device pointer \p ptr. Note that this is not necessarily the |
10693 | * address of the mapped region, but the address of the mappable address |
10694 | * range \p ptr references (e.g. from ::cuMemAddressReserve). |
10695 | * |
10696 | * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: |
10697 | * |
10698 | * Returns in \p *data the size for the allocation referenced by the device |
10699 | * pointer \p ptr. Note that this is not necessarily the size of the mapped |
10700 | * region, but the size of the mappable address range \p ptr references |
10701 | * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped |
10702 | * region, see ::cuMemGetAddressRange. |
10703 | * |
10704 | * - ::CU_POINTER_ATTRIBUTE_MAPPED: |
10705 | * |
10706 | * Returns in \p *data a boolean that indicates if this pointer is in a |
10707 | * valid address range that is mapped to a backing allocation. |
10708 | * |
10709 | * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: |
10710 | * |
10711 | * Returns a bitmask of the allowed handle types for an allocation that may |
10712 | * be passed to ::cuMemExportToShareableHandle. |
10713 | * |
10714 | * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: |
10715 | * |
10716 | * Returns in \p *data the handle to the mempool that the allocation was obtained from. |
10717 | * |
10718 | * \par |
10719 | * |
10720 | * Note that for most allocations in the unified virtual address space |
10721 | * the host and device pointer for accessing the allocation will be the |
10722 | * same. The exceptions to this are |
10723 | * - user memory registered using ::cuMemHostRegister |
10724 | * - host memory allocated using ::cuMemHostAlloc with the |
10725 | * ::CU_MEMHOSTALLOC_WRITECOMBINED flag |
10726 | * For these types of allocation there will exist separate, disjoint host |
10727 | * and device addresses for accessing the allocation. In particular |
10728 | * - The host address will correspond to an invalid unmapped device address |
10729 | * (which will result in an exception if accessed from the device) |
10730 | * - The device address will correspond to an invalid unmapped host address |
10731 | * (which will result in an exception if accessed from the host). |
10732 | * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER |
10733 | * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host |
10734 | * and device addresses from either address. |
10735 | * |
10736 | * \param data - Returned pointer attribute value |
10737 | * \param attribute - Pointer attribute to query |
10738 | * \param ptr - Pointer |
10739 | * |
10740 | * \return |
10741 | * ::CUDA_SUCCESS, |
10742 | * ::CUDA_ERROR_DEINITIALIZED, |
10743 | * ::CUDA_ERROR_NOT_INITIALIZED, |
10744 | * ::CUDA_ERROR_INVALID_CONTEXT, |
10745 | * ::CUDA_ERROR_INVALID_VALUE, |
10746 | * ::CUDA_ERROR_INVALID_DEVICE |
10747 | * \notefnerr |
10748 | * |
10749 | * \sa |
10750 | * ::cuPointerSetAttribute, |
10751 | * ::cuMemAlloc, |
10752 | * ::cuMemFree, |
10753 | * ::cuMemAllocHost, |
10754 | * ::cuMemFreeHost, |
10755 | * ::cuMemHostAlloc, |
10756 | * ::cuMemHostRegister, |
10757 | * ::cuMemHostUnregister, |
10758 | * ::cudaPointerGetAttributes |
10759 | */ |
10760 | CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); |
10761 | |
10762 | /** |
10763 | * \brief Prefetches memory to the specified destination device |
10764 | * |
10765 | * Prefetches memory to the specified destination device. \p devPtr is the |
10766 | * base device pointer of the memory to be prefetched and \p dstDevice is the |
10767 | * destination device. \p count specifies the number of bytes to prefetch. \p hStream |
10768 | * is the stream in which the operation is enqueued. The memory range must refer |
10769 | * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. |
10770 | * |
10771 | * Passing in ::CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If |
10772 | * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS |
10773 | * must be non-zero. Additionally, \p hStream must be associated with a device that has a |
10774 | * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
10775 | * |
10776 | * The start address and end address of the memory range will be rounded down and rounded up |
10777 | * respectively to be aligned to CPU page size before the prefetch operation is enqueued |
10778 | * in the stream. |
10779 | * |
10780 | * If no physical memory has been allocated for this region, then this memory region |
10781 | * will be populated and mapped on the destination device. If there's insufficient |
10782 | * memory to prefetch the desired region, the Unified Memory driver may evict pages from other |
10783 | * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory |
10784 | * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. |
10785 | * |
10786 | * By default, any mappings to the previous location of the migrated pages are removed and |
10787 | * mappings for the new location are only set up on \p dstDevice. The exact behavior however |
10788 | * also depends on the settings applied to this memory range via ::cuMemAdvise as described |
10789 | * below: |
10790 | * |
10791 | * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, |
10792 | * then that subset will create a read-only copy of the pages on \p dstDevice. |
10793 | * |
10794 | * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory |
10795 | * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the |
10796 | * preferred location of any pages in the memory range. |
10797 | * |
10798 | * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, |
10799 | * then mappings to those pages from all the appropriate processors are updated to |
10800 | * refer to the new location if establishing such a mapping is possible. Otherwise, |
10801 | * those mappings are cleared. |
10802 | * |
10803 | * Note that this API is not required for functionality and only serves to improve performance |
10804 | * by allowing the application to migrate data to a suitable location before it is accessed. |
10805 | * Memory accesses to this range are always coherent and are allowed even when the data is |
10806 | * actively being migrated. |
10807 | * |
10808 | * Note that this function is asynchronous with respect to the host and all work |
10809 | * on other devices. |
10810 | * |
10811 | * \param devPtr - Pointer to be prefetched |
10812 | * \param count - Size in bytes |
10813 | * \param dstDevice - Destination device to prefetch to |
10814 | * \param hStream - Stream to enqueue prefetch operation |
10815 | * |
10816 | * \return |
10817 | * ::CUDA_SUCCESS, |
10818 | * ::CUDA_ERROR_INVALID_VALUE, |
10819 | * ::CUDA_ERROR_INVALID_DEVICE |
10820 | * \notefnerr |
10821 | * \note_async |
10822 | * \note_null_stream |
10823 | * |
10824 | * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, |
10825 | * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, |
10826 | * ::cudaMemPrefetchAsync |
10827 | */ |
10828 | CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); |
10829 | |
10830 | /** |
10831 | * \brief Advise about the usage of a given memory range |
10832 | * |
10833 | * Advise the Unified Memory subsystem about the usage pattern for the memory range |
10834 | * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory |
10835 | * range will be rounded down and rounded up respectively to be aligned to CPU page size before the |
10836 | * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged |
10837 | * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable |
10838 | * memory provided it represents a valid, host-accessible region of memory and all additional constraints |
10839 | * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable |
10840 | * memory range results in an error being returned. |
10841 | * |
10842 | * The \p advice parameter can take the following values: |
10843 | * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read |
10844 | * from and only occasionally written to. Any read accesses from any processor to this region will create a |
10845 | * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync |
10846 | * is called on this region, it will create a read-only copy of the data on the destination processor. |
10847 | * If any processor writes to this region, all copies of the corresponding page will be invalidated |
10848 | * except for the one where the write occurred. The \p device argument is ignored for this advice. |
10849 | * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU |
10850 | * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
10851 | * Also, if a context is created on a device that does not have the device attribute |
10852 | * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until |
10853 | * all such contexts are destroyed. |
10854 | * If the memory region refers to valid system-allocated pageable memory, then the accessing device must |
10855 | * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only |
10856 | * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the |
10857 | * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice |
10858 | * will not create a read-only copy when that device accesses this memory region. |
10859 | * |
10860 | * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the |
10861 | * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated |
10862 | * copies of the data will be collapsed into a single copy. The location for the collapsed |
10863 | * copy will be the preferred location if the page has a preferred location and one of the read-duplicated |
10864 | * copies was resident at that location. Otherwise, the location chosen is arbitrary. |
10865 | * |
10866 | * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the |
10867 | * data to be the memory belonging to \p device. Passing in ::CU_DEVICE_CPU for \p device sets the |
10868 | * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the |
10869 | * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location |
10870 | * does not cause data to migrate to that location immediately. Instead, it guides the migration policy |
10871 | * when a fault occurs on that memory region. If the data is already in its preferred location and the |
10872 | * faulting processor can establish a mapping without requiring the data to be migrated, then |
10873 | * data migration will be avoided. On the other hand, if the data is not in its preferred location |
10874 | * or if a direct mapping cannot be established, then it will be migrated to the processor accessing |
10875 | * it. It is important to note that setting the preferred location does not prevent data prefetching |
10876 | * done using ::cuMemPrefetchAsync. |
10877 | * Having a preferred location can override the page thrash detection and resolution logic in the Unified |
10878 | * Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device |
10879 | * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But |
10880 | * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. |
10881 | * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the |
10882 | * policies associated with that advice will override the policies of this advice, unless read accesses from |
10883 | * \p device will not result in a read-only copy being created on that device as outlined in the description for |
10884 | * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. |
10885 | * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
10886 | * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
10887 | * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
10888 | * then this call has no effect. Note however that this behavior may change in the future. |
10889 | * |
10890 | * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION |
10891 | * and changes the preferred location to none. |
10892 | * |
10893 | * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. |
10894 | * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then |
10895 | * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. |
10896 | * This advice does not cause data migration and has no impact on the location of the data per se. Instead, |
10897 | * it causes the data to always be mapped in the specified processor's page tables, as long as the |
10898 | * location of the data permits a mapping to be established. If the data gets migrated for any reason, |
10899 | * the mappings are updated accordingly. |
10900 | * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. |
10901 | * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the |
10902 | * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data |
10903 | * over to the other GPUs is not as important because the accesses are infrequent and the overhead of |
10904 | * migration may be too high. But preventing faults can still help improve performance, and so having |
10905 | * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated |
10906 | * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the |
10907 | * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the |
10908 | * page in host memory. |
10909 | * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the |
10910 | * policies associated with that advice will override the policies of this advice. Additionally, if the |
10911 | * preferred location of this memory region or any subset of it is also \p device, then the policies |
10912 | * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. |
10913 | * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
10914 | * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
10915 | * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
10916 | * then this call has no effect. |
10917 | * |
10918 | * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to |
10919 | * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. |
10920 | * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
10921 | * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
10922 | * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
10923 | * then this call has no effect. |
10924 | * |
10925 | * \param devPtr - Pointer to memory to set the advice for |
10926 | * \param count - Size in bytes of the memory range |
10927 | * \param advice - Advice to be applied for the specified memory range |
10928 | * \param device - Device to apply the advice for |
10929 | * |
10930 | * \return |
10931 | * ::CUDA_SUCCESS, |
10932 | * ::CUDA_ERROR_INVALID_VALUE, |
10933 | * ::CUDA_ERROR_INVALID_DEVICE |
10934 | * \notefnerr |
10935 | * \note_async |
10936 | * \note_null_stream |
10937 | * |
10938 | * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, |
10939 | * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, |
10940 | * ::cudaMemAdvise |
10941 | */ |
10942 | CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); |
10943 | |
10944 | /** |
10945 | * \brief Query an attribute of a given memory range |
10946 | * |
10947 | * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The |
10948 | * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via |
10949 | * __managed__ variables. |
10950 | * |
10951 | * The \p attribute parameter can take the following values: |
10952 | * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted |
10953 | * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given |
10954 | * memory range have read-duplication enabled, or 0 otherwise. |
10955 | * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be |
10956 | * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device |
10957 | * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU |
10958 | * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID |
10959 | * if either all the pages don't have the same preferred location or some of the pages don't have a |
10960 | * preferred location at all. Note that the actual location of the pages in the memory range at the time of |
10961 | * the query may be different from the preferred location. |
10962 | * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted |
10963 | * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned |
10964 | * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. |
10965 | * If any device does not have that advice set for the entire memory range, that device will not be included. |
10966 | * If \p data is larger than the number of devices that have that advice set for that memory range, |
10967 | * CU_DEVICE_INVALID will be returned in all the extra space provided. For example, if \p dataSize is 12 |
10968 | * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be |
10969 | * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have |
10970 | * that advice set, then only as many devices will be returned as can fit in the array. There is no |
10971 | * guarantee on which specific devices will be returned, however. |
10972 | * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be |
10973 | * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location |
10974 | * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be |
10975 | * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU |
10976 | * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not |
10977 | * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the |
10978 | * last location that the application requested to prefetch the memory range to. It gives no indication as to |
10979 | * whether the prefetch operation to that location has completed or even begun. |
10980 | * |
10981 | * \param data - A pointer to a memory location where the result |
10982 | * of the attribute query will be written to. |
10983 | * \param dataSize - Size in bytes of the memory pointed to by \p data |
10984 | * \param attribute - The attribute to query |
10985 | * \param devPtr - Start of the range to query |
10986 | * \param count - Size of the range to query |
10987 | * |
10988 | * \return |
10989 | * ::CUDA_SUCCESS, |
10990 | * ::CUDA_ERROR_INVALID_VALUE, |
10991 | * ::CUDA_ERROR_INVALID_DEVICE |
10992 | * \notefnerr |
10993 | * \note_async |
10994 | * \note_null_stream |
10995 | * |
10996 | * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, |
10997 | * ::cuMemAdvise, |
10998 | * ::cudaMemRangeGetAttribute |
10999 | */ |
11000 | CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); |
11001 | |
11002 | /** |
11003 | * \brief Query attributes of a given memory range. |
11004 | * |
11005 | * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The |
11006 | * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via |
11007 | * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes |
11008 | * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. |
11009 | * The results of the query will be stored in \p data. |
11010 | * |
11011 | * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for |
11012 | * attribute descriptions and restrictions. |
11013 | * |
11014 | * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY |
11015 | * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION |
11016 | * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY |
11017 | * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION |
11018 | * |
11019 | * \param data - An array containing pointers to memory |
11020 | * locations where the result of each attribute query will be written to. |
11021 | * \param dataSizes - Array containing the size of each result |
11022 | * \param attributes - An array of attributes to query |
11023 | * (numAttributes and the number of attributes in this array should match) |
11024 | * \param numAttributes - Number of attributes to query |
11025 | * \param devPtr - Start of the range to query |
11026 | * \param count - Size of the range to query |
11027 | * |
11028 | * \return |
11029 | * ::CUDA_SUCCESS, |
11030 | * ::CUDA_ERROR_DEINITIALIZED, |
11031 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11032 | * ::CUDA_ERROR_INVALID_VALUE, |
11033 | * ::CUDA_ERROR_INVALID_DEVICE |
11034 | * \notefnerr |
11035 | * |
11036 | * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, |
11037 | * ::cuMemPrefetchAsync, |
11038 | * ::cudaMemRangeGetAttributes |
11039 | */ |
11040 | CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); |
11041 | |
11042 | /** |
11043 | * \brief Set attributes on a previously allocated memory region |
11044 | * |
11045 | * The supported attributes are: |
11046 | * |
11047 | * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: |
11048 | * |
11049 | * A boolean attribute that can either be set (1) or unset (0). When set, |
11050 | * the region of memory that \p ptr points to is guaranteed to always synchronize |
11051 | * memory operations that are synchronous. If there are some previously initiated |
11052 | * synchronous memory operations that are pending when this attribute is set, the |
11053 | * function does not return until those memory operations are complete. |
11054 | * See further documentation in the section titled "API synchronization behavior" |
11055 | * to learn more about cases when synchronous memory operations can |
11056 | * exhibit asynchronous behavior. |
11057 | * \p value is interpreted as a pointer to an unsigned integer holding the value to which this attribute is to be set. |
11058 | * |
11059 | * \param value - Pointer to memory containing the value to be set |
11060 | * \param attribute - Pointer attribute to set |
11061 | * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs |
11062 | * |
11063 | * \return |
11064 | * ::CUDA_SUCCESS, |
11065 | * ::CUDA_ERROR_DEINITIALIZED, |
11066 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11067 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11068 | * ::CUDA_ERROR_INVALID_VALUE, |
11069 | * ::CUDA_ERROR_INVALID_DEVICE |
11070 | * \notefnerr |
11071 | * |
11072 | * \sa ::cuPointerGetAttribute, |
11073 | * ::cuPointerGetAttributes, |
11074 | * ::cuMemAlloc, |
11075 | * ::cuMemFree, |
11076 | * ::cuMemAllocHost, |
11077 | * ::cuMemFreeHost, |
11078 | * ::cuMemHostAlloc, |
11079 | * ::cuMemHostRegister, |
11080 | * ::cuMemHostUnregister |
11081 | */ |
11082 | CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); |
11083 | |
11084 | /** |
11085 | * \brief Returns information about a pointer. |
11086 | * |
11087 | * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): |
11088 | * |
11089 | * - ::CU_POINTER_ATTRIBUTE_CONTEXT |
11090 | * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE |
11091 | * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER |
11092 | * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER |
11093 | * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS |
11094 | * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID |
11095 | * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED |
11096 | * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL |
11097 | * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR |
11098 | * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE |
11099 | * - ::CU_POINTER_ATTRIBUTE_MAPPED |
11100 | * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE |
11101 | * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES |
11102 | * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE |
11103 | * |
11104 | * \param numAttributes - Number of attributes to query |
11105 | * \param attributes - An array of attributes to query |
11106 | * (numAttributes and the number of attributes in this array should match) |
11107 | * \param data - An array containing pointers to memory |
11108 | * locations where the result of each attribute query will be written to. |
11109 | * \param ptr - Pointer to query |
11110 | * |
11111 | * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr |
11112 | * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values |
11113 | * and ::CUDA_SUCCESS is returned. |
11114 | * |
11115 | * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA |
11116 | * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. |
11117 | * |
11118 | * \return |
11119 | * ::CUDA_SUCCESS, |
11120 | * ::CUDA_ERROR_DEINITIALIZED, |
11121 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11122 | * ::CUDA_ERROR_INVALID_VALUE, |
11123 | * ::CUDA_ERROR_INVALID_DEVICE |
11124 | * \notefnerr |
11125 | * |
11126 | * \sa |
11127 | * ::cuPointerGetAttribute, |
11128 | * ::cuPointerSetAttribute, |
11129 | * ::cudaPointerGetAttributes |
11130 | */ |
11131 | CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); |
11132 | |
11133 | /** @} */ /* END CUDA_UNIFIED */ |
11134 | |
11135 | /** |
11136 | * \defgroup CUDA_STREAM Stream Management |
11137 | * |
11138 | * ___MANBRIEF___ stream management functions of the low-level CUDA driver API |
11139 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
11140 | * |
11141 | * This section describes the stream management functions of the low-level CUDA |
11142 | * driver application programming interface. |
11143 | * |
11144 | * @{ |
11145 | */ |
11146 | |
11147 | /** |
11148 | * \brief Create a stream |
11149 | * |
11150 | * Creates a stream and returns a handle in \p phStream. The \p Flags argument |
11151 | * determines the behavior of the stream. |
11152 | * |
11153 | * Valid values for \p Flags are: |
11154 | * - ::CU_STREAM_DEFAULT: Default stream creation flag. |
11155 | * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created |
11156 | * stream may run concurrently with work in stream 0 (the NULL stream), and that |
11157 | * the created stream should perform no implicit synchronization with stream 0. |
11158 | * |
11159 | * \param phStream - Returned newly created stream |
11160 | * \param Flags - Parameters for stream creation |
11161 | * |
11162 | * \return |
11163 | * ::CUDA_SUCCESS, |
11164 | * ::CUDA_ERROR_DEINITIALIZED, |
11165 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11166 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11167 | * ::CUDA_ERROR_INVALID_VALUE, |
11168 | * ::CUDA_ERROR_OUT_OF_MEMORY |
11169 | * \notefnerr |
11170 | * |
11171 | * \sa ::cuStreamDestroy, |
11172 | * ::cuStreamCreateWithPriority, |
11173 | * ::cuStreamGetPriority, |
11174 | * ::cuStreamGetFlags, |
11175 | * ::cuStreamWaitEvent, |
11176 | * ::cuStreamQuery, |
11177 | * ::cuStreamSynchronize, |
11178 | * ::cuStreamAddCallback, |
11179 | * ::cudaStreamCreate, |
11180 | * ::cudaStreamCreateWithFlags |
11181 | */ |
11182 | CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); |
11183 | |
11184 | /** |
11185 | * \brief Create a stream with the given priority |
11186 | * |
11187 | * Creates a stream with the specified priority and returns a handle in \p phStream. |
11188 | * This API alters the scheduler priority of work in the stream. Work in a higher |
11189 | * priority stream may preempt work already executing in a lower priority stream. |
11190 | * |
11191 | * \p priority follows a convention where lower numbers represent higher priorities. |
11192 | * '0' represents the default priority. The range of meaningful numerical priorities can |
11193 | * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is |
11194 | * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, |
11195 | * it will automatically be clamped to the lowest or the highest number in the range. |
11196 | * |
11197 | * \param phStream - Returned newly created stream |
11198 | * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of |
11199 | * valid flags |
11200 | * \param priority - Stream priority. Lower numbers represent higher priorities. |
11201 | * See ::cuCtxGetStreamPriorityRange for more information about |
11202 | * meaningful stream priorities that can be passed. |
11203 | * |
11204 | * \return |
11205 | * ::CUDA_SUCCESS, |
11206 | * ::CUDA_ERROR_DEINITIALIZED, |
11207 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11208 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11209 | * ::CUDA_ERROR_INVALID_VALUE, |
11210 | * ::CUDA_ERROR_OUT_OF_MEMORY |
11211 | * \notefnerr |
11212 | * |
11213 | * \note Stream priorities are supported only on GPUs |
11214 | * with compute capability 3.5 or higher. |
11215 | * |
11216 | * \note In the current implementation, only compute kernels launched in |
11217 | * priority streams are affected by the stream's priority. Stream priorities have |
11218 | * no effect on host-to-device and device-to-host memory operations. |
11219 | * |
11220 | * \sa ::cuStreamDestroy, |
11221 | * ::cuStreamCreate, |
11222 | * ::cuStreamGetPriority, |
11223 | * ::cuCtxGetStreamPriorityRange, |
11224 | * ::cuStreamGetFlags, |
11225 | * ::cuStreamWaitEvent, |
11226 | * ::cuStreamQuery, |
11227 | * ::cuStreamSynchronize, |
11228 | * ::cuStreamAddCallback, |
11229 | * ::cudaStreamCreateWithPriority |
11230 | */ |
11231 | CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); |
11232 | |
11233 | |
11234 | /** |
11235 | * \brief Query the priority of a given stream |
11236 | * |
11237 | * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority |
11238 | * and return it in \p priority. Note that if the stream was created with a |
11239 | * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, |
11240 | * this function returns the clamped priority. |
11241 | * See ::cuStreamCreateWithPriority for details about priority clamping. |
11242 | * |
11243 | * \param hStream - Handle to the stream to be queried |
11244 | * \param priority - Pointer to a signed integer in which the stream's priority is returned |
11245 | * \return |
11246 | * ::CUDA_SUCCESS, |
11247 | * ::CUDA_ERROR_DEINITIALIZED, |
11248 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11249 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11250 | * ::CUDA_ERROR_INVALID_VALUE, |
11251 | * ::CUDA_ERROR_INVALID_HANDLE, |
11252 | * ::CUDA_ERROR_OUT_OF_MEMORY |
11253 | * \notefnerr |
11254 | * |
11255 | * \sa ::cuStreamDestroy, |
11256 | * ::cuStreamCreate, |
11257 | * ::cuStreamCreateWithPriority, |
11258 | * ::cuCtxGetStreamPriorityRange, |
11259 | * ::cuStreamGetFlags, |
11260 | * ::cudaStreamGetPriority |
11261 | */ |
11262 | CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); |
11263 | |
11264 | /** |
11265 | * \brief Query the flags of a given stream |
11266 | * |
11267 | * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority |
11268 | * and return the flags in \p flags. |
11269 | * |
11270 | * \param hStream - Handle to the stream to be queried |
11271 | * \param flags - Pointer to an unsigned integer in which the stream's flags are returned |
11272 | * The value returned in \p flags is a bitwise 'OR' of all flags that |
11273 | * were used while creating this stream. See ::cuStreamCreate for the list |
11274 | * of valid flags |
11275 | * \return |
11276 | * ::CUDA_SUCCESS, |
11277 | * ::CUDA_ERROR_DEINITIALIZED, |
11278 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11279 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11280 | * ::CUDA_ERROR_INVALID_VALUE, |
11281 | * ::CUDA_ERROR_INVALID_HANDLE, |
11282 | * ::CUDA_ERROR_OUT_OF_MEMORY |
11283 | * \notefnerr |
11284 | * |
11285 | * \sa ::cuStreamDestroy, |
11286 | * ::cuStreamCreate, |
11287 | * ::cuStreamGetPriority, |
11288 | * ::cudaStreamGetFlags |
11289 | */ |
11290 | CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); |
11291 | |
11292 | /** |
11293 | * \brief Query the context associated with a stream |
11294 | * |
11295 | * Returns the CUDA context that the stream is associated with. |
11296 | * |
11297 | * The stream handle \p hStream can refer to any of the following: |
11298 | * <ul> |
11299 | * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate |
11300 | * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as |
11301 | * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. |
11302 | * The returned context is the context that was active in the calling thread when the |
11303 | * stream was created. Passing an invalid handle will result in undefined behavior.</li> |
11304 | * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and |
11305 | * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, |
11306 | * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. |
11307 | * Specifying any of the special handles will return the context current to the |
11308 | * calling thread. If no context is current to the calling thread, |
11309 | * ::CUDA_ERROR_INVALID_CONTEXT is returned.</li> |
11310 | * </ul> |
11311 | * |
11312 | * \param hStream - Handle to the stream to be queried |
11313 | * \param pctx - Returned context associated with the stream |
11314 | * |
11315 | * \return |
11316 | * ::CUDA_SUCCESS, |
11317 | * ::CUDA_ERROR_DEINITIALIZED, |
11318 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11319 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11320 | * ::CUDA_ERROR_INVALID_HANDLE |
11321 | * \notefnerr |
11322 | * |
11323 | * \sa ::cuStreamDestroy, |
11324 | * ::cuStreamCreateWithPriority, |
11325 | * ::cuStreamGetPriority, |
11326 | * ::cuStreamGetFlags, |
11327 | * ::cuStreamWaitEvent, |
11328 | * ::cuStreamQuery, |
11329 | * ::cuStreamSynchronize, |
11330 | * ::cuStreamAddCallback, |
11331 | * ::cudaStreamCreate, |
11332 | * ::cudaStreamCreateWithFlags |
11333 | */ |
11334 | CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); |
11335 | |
11336 | /** |
11337 | * \brief Make a compute stream wait on an event |
11338 | * |
11339 | * Makes all future work submitted to \p hStream wait for all work captured in |
11340 | * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. |
11341 | * The synchronization will be performed efficiently on the device when applicable. |
11342 | * \p hEvent may be from a different context or device than \p hStream. |
11343 | * |
11344 | * flags include: |
11345 | * - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag. |
11346 | * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external |
11347 | * event node when performing stream capture. This flag is invalid outside |
11348 | * of stream capture. |
11349 | * |
11350 | * \param hStream - Stream to wait |
11351 | * \param hEvent - Event to wait on (may not be NULL) |
11352 | * \param Flags - See ::CUevent_wait_flags |
11353 | * |
11354 | * \return |
11355 | * ::CUDA_SUCCESS, |
11356 | * ::CUDA_ERROR_DEINITIALIZED, |
11357 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11358 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11359 | * ::CUDA_ERROR_INVALID_HANDLE |
11360 | * \note_null_stream |
11361 | * \notefnerr |
11362 | * |
11363 | * \sa ::cuStreamCreate, |
11364 | * ::cuEventRecord, |
11365 | * ::cuStreamQuery, |
11366 | * ::cuStreamSynchronize, |
11367 | * ::cuStreamAddCallback, |
11368 | * ::cuStreamDestroy, |
11369 | * ::cudaStreamWaitEvent |
11370 | */ |
11371 | CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); |
11372 | |
11373 | /** |
11374 | * \brief Add a callback to a compute stream |
11375 | * |
11376 | * \note This function is slated for eventual deprecation and removal. If |
11377 | * you do not require the callback to execute in case of a device error, |
11378 | * consider using ::cuLaunchHostFunc. Additionally, this function is not |
11379 | * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike |
11380 | * ::cuLaunchHostFunc. |
11381 | * |
11382 | * Adds a callback to be called on the host after all currently enqueued |
11383 | * items in the stream have completed. For each |
11384 | * cuStreamAddCallback call, the callback will be executed exactly once. |
11385 | * The callback will block later work in the stream until it is finished. |
11386 | * |
11387 | * The callback may be passed ::CUDA_SUCCESS or an error code. In the event |
11388 | * of a device error, all subsequently executed callbacks will receive an |
11389 | * appropriate ::CUresult. |
11390 | * |
11391 | * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API |
11392 | * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any |
11393 | * synchronization that may depend on outstanding device work or other callbacks |
11394 | * that are not mandated to run earlier. Callbacks without a mandated order |
11395 | * (in independent streams) execute in undefined order and may be serialized. |
11396 | * |
11397 | * For the purposes of Unified Memory, callback execution makes a number of |
11398 | * guarantees: |
11399 | * <ul> |
11400 | * <li>The callback stream is considered idle for the duration of the |
11401 | * callback. Thus, for example, a callback may always use memory attached |
11402 | * to the callback stream.</li> |
11403 | * <li>The start of execution of a callback has the same effect as |
11404 | * synchronizing an event recorded in the same stream immediately prior to |
11405 | * the callback. It thus synchronizes streams which have been "joined" |
11406 | * prior to the callback.</li> |
11407 | * <li>Adding device work to any stream does not have the effect of making |
11408 | * the stream active until all preceding host functions and stream callbacks |
11409 | * have executed. Thus, for |
11410 | * example, a callback might use global attached memory even if work has |
11411 | * been added to another stream, if the work has been ordered behind the |
11412 | * callback with an event.</li> |
11413 | * <li>Completion of a callback does not cause a stream to become |
11414 | * active except as described above. The callback stream will remain idle |
11415 | * if no device work follows the callback, and will remain idle across |
11416 | * consecutive callbacks without device work in between. Thus, for example, |
11417 | * stream synchronization can be done by signaling from a callback at the |
11418 | * end of the stream.</li> |
11419 | * </ul> |
11420 | * |
11421 | * \param hStream - Stream to add callback to |
11422 | * \param callback - The function to call once preceding stream operations are complete |
11423 | * \param userData - User specified data to be passed to the callback function |
11424 | * \param flags - Reserved for future use, must be 0 |
11425 | * |
11426 | * \return |
11427 | * ::CUDA_SUCCESS, |
11428 | * ::CUDA_ERROR_DEINITIALIZED, |
11429 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11430 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11431 | * ::CUDA_ERROR_INVALID_HANDLE, |
11432 | * ::CUDA_ERROR_NOT_SUPPORTED |
11433 | * \note_null_stream |
11434 | * \notefnerr |
11435 | * |
11436 | * \sa ::cuStreamCreate, |
11437 | * ::cuStreamQuery, |
11438 | * ::cuStreamSynchronize, |
11439 | * ::cuStreamWaitEvent, |
11440 | * ::cuStreamDestroy, |
11441 | * ::cuMemAllocManaged, |
11442 | * ::cuStreamAttachMemAsync, |
11443 | * ::cuLaunchHostFunc, |
11444 | * ::cudaStreamAddCallback |
11445 | */ |
11446 | CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); |
11447 | |
11448 | /** |
11449 | * \brief Begins graph capture on a stream |
11450 | * |
11451 | * Begin graph capture on \p hStream. When a stream is in capture mode, all operations |
11452 | * pushed into the stream will not be executed, but will instead be captured into |
11453 | * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated |
11454 | * if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which |
11455 | * it was initiated, and it may only be initiated if the stream is not already in capture |
11456 | * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id |
11457 | * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. |
11458 | * |
11459 | * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be |
11460 | * called on this stream from the same thread. |
11461 | * |
11462 | * \param hStream - Stream in which to initiate capture |
11463 | * \param mode - Controls the interaction of this capture sequence with other API |
11464 | * calls that are potentially unsafe. For more details see |
11465 | * ::cuThreadExchangeStreamCaptureMode. |
11466 | * |
11467 | * \note Kernels captured using this API must not use texture and surface references. |
11468 | * Reading or writing through any texture or surface reference is undefined |
11469 | * behavior. This restriction does not apply to texture and surface objects. |
11470 | * |
11471 | * \return |
11472 | * ::CUDA_SUCCESS, |
11473 | * ::CUDA_ERROR_DEINITIALIZED, |
11474 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11475 | * ::CUDA_ERROR_INVALID_VALUE |
11476 | * \notefnerr |
11477 | * |
11478 | * \sa |
11479 | * ::cuStreamCreate, |
11480 | * ::cuStreamIsCapturing, |
11481 | * ::cuStreamEndCapture, |
11482 | * ::cuThreadExchangeStreamCaptureMode |
11483 | */ |
11484 | CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); |
11485 | |
11486 | /** |
11487 | * \brief Swaps the stream capture interaction mode for a thread |
11488 | * |
11489 | * Sets the calling thread's stream capture interaction mode to the value contained |
11490 | * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To |
11491 | * facilitate deterministic behavior across function or module boundaries, callers |
11492 | * are encouraged to use this API in a push-pop fashion: \code |
11493 | CUstreamCaptureMode mode = desiredMode; |
11494 | cuThreadExchangeStreamCaptureMode(&mode); |
11495 | ... |
11496 | cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode |
11497 | * \endcode |
11498 | * |
11499 | * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call |
11500 | * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is |
11501 | * not enqueued asynchronously to a stream, and is not observed by stream capture. |
11502 | * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture |
11503 | * depended on the allocation being replayed whenever the graph is launched, the |
11504 | * captured graph would be invalid. |
11505 | * |
11506 | * Therefore, stream capture places restrictions on API calls that can be made within |
11507 | * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This |
11508 | * behavior can be controlled via this API and the \p mode argument to ::cuStreamBeginCapture. |
11509 | * |
11510 | * A thread's mode is one of the following: |
11511 | * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has |
11512 | * an ongoing capture sequence that was not initiated with |
11513 | * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread |
11514 | * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, |
11515 | * this thread is prohibited from potentially unsafe API calls. |
11516 | * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture |
11517 | * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited |
11518 | * from potentially unsafe API calls. Concurrent capture sequences in other threads |
11519 | * are ignored. |
11520 | * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially |
11521 | * unsafe API calls. Note that the thread is still prohibited from API calls which |
11522 | * necessarily conflict with stream capture, for example, attempting ::cuEventQuery |
11523 | * on an event that was last recorded inside a capture sequence. |
11524 | * |
11525 | * \param mode - Pointer to mode value to swap with the current mode |
11526 | * |
11527 | * \return |
11528 | * ::CUDA_SUCCESS, |
11529 | * ::CUDA_ERROR_DEINITIALIZED, |
11530 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11531 | * ::CUDA_ERROR_INVALID_VALUE |
11532 | * \notefnerr |
11533 | * |
11534 | * \sa |
11535 | * ::cuStreamBeginCapture |
11536 | */ |
11537 | CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); |
11538 | |
11539 | /** |
11540 | * \brief Ends capture on a stream, returning the captured graph |
11541 | * |
11542 | * End capture on \p hStream, returning the captured graph via \p phGraph. |
11543 | * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. |
11544 | * If capture was invalidated, due to a violation of the rules of stream capture, then |
11545 | * a NULL graph will be returned. |
11546 | * |
11547 | * If the \p mode argument to ::cuStreamBeginCapture was not |
11548 | * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as |
11549 | * ::cuStreamBeginCapture. |
11550 | * |
11551 | * \param hStream - Stream on which to end capture |
11552 | * \param phGraph - The captured graph |
11553 | * |
11554 | * \return |
11555 | * ::CUDA_SUCCESS, |
11556 | * ::CUDA_ERROR_DEINITIALIZED, |
11557 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11558 | * ::CUDA_ERROR_INVALID_VALUE, |
11559 | * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD |
11560 | * \notefnerr |
11561 | * |
11562 | * \sa |
11563 | * ::cuStreamCreate, |
11564 | * ::cuStreamBeginCapture, |
11565 | * ::cuStreamIsCapturing |
11566 | */ |
11567 | CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); |
11568 | |
11569 | /** |
11570 | * \brief Returns a stream's capture status |
11571 | * |
11572 | * Return the capture status of \p hStream via \p captureStatus. After a successful |
11573 | * call, \p *captureStatus will contain one of the following: |
11574 | * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. |
11575 | * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. |
11576 | * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error |
11577 | * has invalidated the capture sequence. The capture sequence must be terminated |
11578 | * with ::cuStreamEndCapture on the stream where it was initiated in order to |
11579 | * continue using \p hStream. |
11580 | * |
11581 | * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while |
11582 | * a blocking stream in the same context is capturing, it will return |
11583 | * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified |
11584 | * after the call. The blocking stream capture is not invalidated. |
11585 | * |
11586 | * When a blocking stream is capturing, the legacy stream is in an |
11587 | * unusable state until the blocking stream capture is terminated. The legacy |
11588 | * stream is not supported for stream capture, but attempted use would have an |
11589 | * implicit dependency on the capturing stream(s). |
11590 | * |
11591 | * \param hStream - Stream to query |
11592 | * \param captureStatus - Returns the stream's capture status |
11593 | * |
11594 | * \return |
11595 | * ::CUDA_SUCCESS, |
11596 | * ::CUDA_ERROR_DEINITIALIZED, |
11597 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11598 | * ::CUDA_ERROR_INVALID_VALUE, |
11599 | * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT |
11600 | * \notefnerr |
11601 | * |
11602 | * \sa |
11603 | * ::cuStreamCreate, |
11604 | * ::cuStreamBeginCapture, |
11605 | * ::cuStreamEndCapture |
11606 | */ |
11607 | CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); |
11608 | |
11609 | /** |
11610 | * \brief Query capture status of a stream |
11611 | * |
11612 | * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2, which will |
11613 | * supplant this version in 12.0. This version is retained for minor version compatibility. |
11614 | * |
11615 | * Query the capture status of a stream and get an id for |
11616 | * the capture sequence, which is unique over the lifetime of the process. |
11617 | * |
11618 | * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created |
11619 | * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. |
11620 | * |
11621 | * A valid id is returned only if both of the following are true: |
11622 | * - the call returns CUDA_SUCCESS |
11623 | * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE |
11624 | * |
11625 | * \return |
11626 | * ::CUDA_SUCCESS, |
11627 | * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT |
11628 | * \notefnerr |
11629 | * |
11630 | * \sa |
11631 | * ::cuStreamGetCaptureInfo_v2, |
11632 | * ::cuStreamBeginCapture, |
11633 | * ::cuStreamIsCapturing |
11634 | */ |
11635 | CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); |
11636 | |
11637 | /** |
11638 | * \brief Query a stream's capture state (11.3+) |
11639 | * |
11640 | * Query stream state related to stream capture. |
11641 | * |
11642 | * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created |
11643 | * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. |
11644 | * |
11645 | * Valid data (other than capture status) is returned only if both of the following are true: |
11646 | * - the call returns CUDA_SUCCESS |
11647 | * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE |
11648 | * |
11649 | * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the |
11650 | * previous version in 12.0. Developers requiring compatibility across minor versions to |
11651 | * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback |
11652 | * path. |
11653 | * |
11654 | * \param hStream - The stream to query |
11655 | * \param captureStatus_out - Location to return the capture status of the stream; required |
11656 | * \param id_out - Optional location to return an id for the capture sequence, which is |
11657 | * unique over the lifetime of the process |
11658 | * \param graph_out - Optional location to return the graph being captured into. All |
11659 | * operations other than destroy and node removal are permitted on the graph |
11660 | * while the capture sequence is in progress. This API does not transfer |
11661 | * ownership of the graph, which is transferred or destroyed at |
11662 | * ::cuStreamEndCapture. Note that the graph handle may be invalidated before |
11663 | * end of capture for certain errors. Nodes that are or become |
11664 | * unreachable from the original stream at ::cuStreamEndCapture due to direct |
11665 | * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. |
11666 | * \param dependencies_out - Optional location to store a pointer to an array of nodes. |
11667 | * The next node to be captured in the stream will depend on this set of nodes, |
11668 | * absent operations such as event wait which modify this set. The array pointer |
11669 | * is valid until the next API call which operates on the stream or until end of |
11670 | * capture. The node handles may be copied out and are valid until they or the |
11671 | * graph is destroyed. The driver-owned array may also be passed directly to |
11672 | * APIs that operate on the graph (not the stream) without copying. |
11673 | * \param numDependencies_out - Optional location to store the size of the array |
11674 | * returned in dependencies_out. |
11675 | * |
11676 | * \return |
11677 | * ::CUDA_SUCCESS, |
11678 | * ::CUDA_ERROR_INVALID_VALUE, |
11679 | * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT |
11680 | * \note_graph_thread_safety |
11681 | * \notefnerr |
11682 | * |
11683 | * \sa |
11684 | * ::cuStreamGetCaptureInfo, |
11685 | * ::cuStreamBeginCapture, |
11686 | * ::cuStreamIsCapturing, |
11687 | * ::cuStreamUpdateCaptureDependencies |
11688 | */ |
11689 | CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, |
11690 | cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); |
11691 | |
11692 | /** |
11693 | * \brief Update the set of dependencies in a capturing stream (11.3+) |
11694 | * |
11695 | * Modifies the dependency set of a capturing stream. The dependency set is the set |
11696 | * of nodes that the next captured node in the stream will depend on. |
11697 | * |
11698 | * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and |
11699 | * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to |
11700 | * the API is added to the existing set or replaces it. A flags value of 0 defaults |
11701 | * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. |
11702 | * |
11703 | * Nodes that are removed from the dependency set via this API do not result in |
11704 | * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at |
11705 | * ::cuStreamEndCapture. |
11706 | * |
11707 | * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. |
11708 | * |
11709 | * This API is new in CUDA 11.3. Developers requiring compatibility across minor |
11710 | * versions to CUDA 11.0 should either avoid this API or provide a fallback path. |
11711 | * |
11712 | * \return |
11713 | * ::CUDA_SUCCESS, |
11714 | * ::CUDA_ERROR_INVALID_VALUE, |
11715 | * ::CUDA_ERROR_ILLEGAL_STATE |
11716 | * |
11717 | * \sa |
11718 | * ::cuStreamBeginCapture, |
11719 | * ::cuStreamGetCaptureInfo, |
11720 | * ::cuStreamGetCaptureInfo_v2 |
11721 | */ |
11722 | CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); |
11723 | |
11724 | /** |
11725 | * \brief Attach memory to a stream asynchronously |
11726 | * |
11727 | * Enqueues an operation in \p hStream to specify stream association of |
11728 | * \p length bytes of memory starting from \p dptr. This function is a |
11729 | * stream-ordered operation, meaning that it is dependent on, and will |
11730 | * only take effect when, previous work in the stream has completed. Any |
11731 | * previous association is automatically replaced. |
11732 | * |
11733 | * \p dptr must point to one of the following types of memories: |
11734 | * - managed memory declared using the __managed__ keyword or allocated with |
11735 | * ::cuMemAllocManaged. |
11736 | * - a valid host-accessible region of system-allocated pageable memory. This |
11737 | * type of memory may only be specified if the device associated with the |
11738 | * stream reports a non-zero value for the device attribute |
11739 | * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. |
11740 | * |
11741 | * For managed allocations, \p length must be either zero or the entire |
11742 | * allocation's size. Both indicate that the entire allocation's stream |
11743 | * association is being changed. Currently, it is not possible to change stream |
11744 | * association for a portion of a managed allocation. |
11745 | * |
11746 | * For pageable host allocations, \p length must be non-zero. |
11747 | * |
11748 | * The stream association is specified using \p flags which must be |
11749 | * one of ::CUmemAttach_flags. |
11750 | * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed |
11751 | * by any stream on any device. |
11752 | * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee |
11753 | * that it won't access the memory on the device from any stream on a device that |
11754 | * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
11755 | * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with |
11756 | * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, |
11757 | * the program makes a guarantee that it will only access the memory on the device |
11758 | * from \p hStream. It is illegal to attach singly to the NULL stream, because the |
11759 | * NULL stream is a virtual global stream and not a specific stream. An error will |
11760 | * be returned in this case. |
11761 | * |
11762 | * When memory is associated with a single stream, the Unified Memory system will |
11763 | * allow CPU access to this memory region so long as all operations in \p hStream |
11764 | * have completed, regardless of whether other streams are active. In effect, |
11765 | * this constrains exclusive ownership of the managed memory region by |
11766 | * an active GPU to per-stream activity instead of whole-GPU activity. |
11767 | * |
11768 | * Accessing memory on the device from streams that are not associated with |
11769 | * it will produce undefined results. No error checking is performed by the |
11770 | * Unified Memory system to ensure that kernels launched into other streams |
11771 | * do not access this region. |
11772 | * |
11773 | * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync |
11774 | * via events, synchronization or other means to ensure legal access to memory |
11775 | * at all times. Data visibility and coherency will be changed appropriately |
11776 | * for all kernels which follow a stream-association change. |
11777 | * |
11778 | * If \p hStream is destroyed while data is associated with it, the association is |
11779 | * removed and the association reverts to the default visibility of the allocation |
11780 | * as specified at ::cuMemAllocManaged. For __managed__ variables, the default |
11781 | * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an |
11782 | * asynchronous operation, and as a result, the change to default association won't |
11783 | * happen until all work in the stream has completed. |
11784 | * |
11785 | * \param hStream - Stream in which to enqueue the attach operation |
11786 | * \param dptr - Pointer to memory (must be a pointer to managed memory or |
11787 | * to a valid host-accessible region of system-allocated |
11788 | * pageable memory) |
11789 | * \param length - Length of memory |
11790 | * \param flags - Must be one of ::CUmemAttach_flags |
11791 | * |
11792 | * \return |
11793 | * ::CUDA_SUCCESS, |
11794 | * ::CUDA_ERROR_DEINITIALIZED, |
11795 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11796 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11797 | * ::CUDA_ERROR_INVALID_HANDLE, |
11798 | * ::CUDA_ERROR_NOT_SUPPORTED |
11799 | * \note_null_stream |
11800 | * \notefnerr |
11801 | * |
11802 | * \sa ::cuStreamCreate, |
11803 | * ::cuStreamQuery, |
11804 | * ::cuStreamSynchronize, |
11805 | * ::cuStreamWaitEvent, |
11806 | * ::cuStreamDestroy, |
11807 | * ::cuMemAllocManaged, |
11808 | * ::cudaStreamAttachMemAsync |
11809 | */ |
11810 | CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); |
11811 | |
11812 | /** |
11813 | * \brief Determine status of a compute stream |
11814 | * |
11815 | * Returns ::CUDA_SUCCESS if all operations in the stream specified by |
11816 | * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. |
11817 | * |
11818 | * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS |
11819 | * is equivalent to having called ::cuStreamSynchronize(). |
11820 | * |
11821 | * \param hStream - Stream to query status of |
11822 | * |
11823 | * \return |
11824 | * ::CUDA_SUCCESS, |
11825 | * ::CUDA_ERROR_DEINITIALIZED, |
11826 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11827 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11828 | * ::CUDA_ERROR_INVALID_HANDLE, |
11829 | * ::CUDA_ERROR_NOT_READY |
11830 | * \note_null_stream |
11831 | * \notefnerr |
11832 | * |
11833 | * \sa ::cuStreamCreate, |
11834 | * ::cuStreamWaitEvent, |
11835 | * ::cuStreamDestroy, |
11836 | * ::cuStreamSynchronize, |
11837 | * ::cuStreamAddCallback, |
11838 | * ::cudaStreamQuery |
11839 | */ |
11840 | CUresult CUDAAPI cuStreamQuery(CUstream hStream); |
11841 | |
11842 | /** |
11843 | * \brief Wait until a stream's tasks are completed |
11844 | * |
11845 | * Waits until the device has completed all operations in the stream specified |
11846 | * by \p hStream. If the context was created with the |
11847 | * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the |
11848 | * stream is finished with all of its tasks. |
11849 | * |
11850 | * \param hStream - Stream to wait for |
11851 | * |
11852 | * \return |
11853 | * ::CUDA_SUCCESS, |
11854 | * ::CUDA_ERROR_DEINITIALIZED, |
11855 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11856 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11857 | * ::CUDA_ERROR_INVALID_HANDLE |
11858 | * |
11859 | * \note_null_stream |
11860 | * \notefnerr |
11861 | * |
11862 | * \sa ::cuStreamCreate, |
11863 | * ::cuStreamDestroy, |
11864 | * ::cuStreamWaitEvent, |
11865 | * ::cuStreamQuery, |
11866 | * ::cuStreamAddCallback, |
11867 | * ::cudaStreamSynchronize |
11868 | */ |
11869 | CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); |
11870 | |
11871 | /** |
11872 | * \brief Destroys a stream |
11873 | * |
11874 | * Destroys the stream specified by \p hStream. |
11875 | * |
11876 | * In case the device is still doing work in the stream \p hStream |
11877 | * when ::cuStreamDestroy() is called, the function will return immediately |
11878 | * and the resources associated with \p hStream will be released automatically |
11879 | * once the device has completed all work in \p hStream. |
11880 | * |
11881 | * \param hStream - Stream to destroy |
11882 | * |
11883 | * \return |
11884 | * ::CUDA_SUCCESS, |
11885 | * ::CUDA_ERROR_DEINITIALIZED, |
11886 | * ::CUDA_ERROR_NOT_INITIALIZED, |
11887 | * ::CUDA_ERROR_INVALID_CONTEXT, |
11888 | * ::CUDA_ERROR_INVALID_VALUE, |
11889 | * ::CUDA_ERROR_INVALID_HANDLE |
11890 | * \notefnerr |
11891 | * |
11892 | * \sa ::cuStreamCreate, |
11893 | * ::cuStreamWaitEvent, |
11894 | * ::cuStreamQuery, |
11895 | * ::cuStreamSynchronize, |
11896 | * ::cuStreamAddCallback, |
11897 | * ::cudaStreamDestroy |
11898 | */ |
11899 | CUresult CUDAAPI cuStreamDestroy(CUstream hStream); |
11900 | |
11901 | /** |
11902 | * \brief Copies attributes from source stream to destination stream. |
11903 | * |
11904 | * Copies attributes from source stream \p src to destination stream \p dst. |
11905 | * Both streams must have the same context. |
11906 | * |
11907 | * \param[out] dst Destination stream |
11908 | * \param[in] src Source stream |
11909 | * For the list of attributes see ::CUstreamAttrID |
11910 | * |
11911 | * \return |
11912 | * ::CUDA_SUCCESS, |
11913 | * ::CUDA_ERROR_INVALID_VALUE |
11914 | * \notefnerr |
11915 | * |
11916 | * \sa |
11917 | * ::CUaccessPolicyWindow |
11918 | */ |
11919 | CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); |
11920 | |
11921 | /** |
11922 | * \brief Queries a stream attribute. |
11923 | * |
11924 | * Queries attribute \p attr from \p hStream and stores it in the corresponding |
11925 | * member of \p value_out. |
11926 | * |
11927 | * \param[in] hStream Stream to query |
11928 | * \param[in] attr Attribute to query |
11929 | * \param[out] value_out Receives the value of the queried attribute |
11930 | * |
11931 | * \return |
11932 | * ::CUDA_SUCCESS, |
11933 | * ::CUDA_ERROR_INVALID_VALUE, |
11934 | * ::CUDA_ERROR_INVALID_HANDLE |
11935 | * \notefnerr |
11936 | * |
11937 | * \sa |
11938 | * ::CUaccessPolicyWindow |
11939 | */ |
11940 | CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, |
11941 | CUstreamAttrValue *value_out); |
11942 | |
11943 | /** |
11944 | * \brief Sets a stream attribute. |
11945 | * |
11946 | * Sets attribute \p attr on \p hStream from the corresponding attribute of |
11947 | * \p value. The updated attribute will be applied to subsequent work |
11948 | * submitted to the stream. It will not affect previously submitted work. |
11949 | * |
11950 | * \param[out] hStream Stream on which the attribute is set |
11951 | * \param[in] attr Attribute to set |
11952 | * \param[in] value Source of the new attribute value |
11953 | * |
11954 | * \return |
11955 | * ::CUDA_SUCCESS, |
11956 | * ::CUDA_ERROR_INVALID_VALUE, |
11957 | * ::CUDA_ERROR_INVALID_HANDLE |
11958 | * \notefnerr |
11959 | * |
11960 | * \sa |
11961 | * ::CUaccessPolicyWindow |
11962 | */ |
11963 | CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, |
11964 | const CUstreamAttrValue *value); |
11965 | |
11966 | /** @} */ /* END CUDA_STREAM */ |
11967 | |
11968 | |
11969 | /** |
11970 | * \defgroup CUDA_EVENT Event Management |
11971 | * |
11972 | * ___MANBRIEF___ event management functions of the low-level CUDA driver API |
11973 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
11974 | * |
11975 | * This section describes the event management functions of the low-level CUDA |
11976 | * driver application programming interface. |
11977 | * |
11978 | * @{ |
11979 | */ |
11980 | |
11981 | /** |
11982 | * \brief Creates an event |
11983 | * |
11984 | * Creates an event \p *phEvent for the current context with the flags specified |
11985 | * via \p Flags. Valid flags include: |
11986 | * - ::CU_EVENT_DEFAULT: Default event creation flag. |
11987 | * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking |
11988 | * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on |
11989 | * an event created with this flag will block until the event has actually |
11990 | * been recorded. |
11991 | * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need |
11992 | * to record timing data. Events created with this flag specified and |
11993 | * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best |
11994 | * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). |
11995 | * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an |
11996 | * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must |
11997 | * be specified along with ::CU_EVENT_DISABLE_TIMING. |
11998 | * |
11999 | * \param phEvent - Returns newly created event |
12000 | * \param Flags - Event creation flags |
12001 | * |
12002 | * \return |
12003 | * ::CUDA_SUCCESS, |
12004 | * ::CUDA_ERROR_DEINITIALIZED, |
12005 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12006 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12007 | * ::CUDA_ERROR_INVALID_VALUE, |
12008 | * ::CUDA_ERROR_OUT_OF_MEMORY |
12009 | * \notefnerr |
12010 | * |
12011 | * \sa |
12012 | * ::cuEventRecord, |
12013 | * ::cuEventQuery, |
12014 | * ::cuEventSynchronize, |
12015 | * ::cuEventDestroy, |
12016 | * ::cuEventElapsedTime, |
12017 | * ::cudaEventCreate, |
12018 | * ::cudaEventCreateWithFlags |
12019 | */ |
12020 | CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); |
12021 | |
12022 | /** |
12023 | * \brief Records an event |
12024 | * |
12025 | * Captures in \p hEvent the contents of \p hStream at the time of this call. |
12026 | * \p hEvent and \p hStream must be from the same context. |
12027 | * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then |
12028 | * examine or wait for completion of the work that was captured. Uses of |
12029 | * \p hStream after this call do not modify \p hEvent. See note on default |
12030 | * stream behavior for what is captured in the default case. |
12031 | * |
12032 | * ::cuEventRecord() can be called multiple times on the same event and |
12033 | * will overwrite the previously captured state. Other APIs such as |
12034 | * ::cuStreamWaitEvent() use the most recently captured state at the time |
12035 | * of the API call, and are not affected by later calls to |
12036 | * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an |
12037 | * event represents an empty set of work, so, for example, ::cuEventQuery() |
12038 | * would return ::CUDA_SUCCESS. |
12039 | * |
12040 | * \param hEvent - Event to record |
12041 | * \param hStream - Stream to record event for |
12042 | * |
12043 | * \return |
12044 | * ::CUDA_SUCCESS, |
12045 | * ::CUDA_ERROR_DEINITIALIZED, |
12046 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12047 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12048 | * ::CUDA_ERROR_INVALID_HANDLE, |
12049 | * ::CUDA_ERROR_INVALID_VALUE |
12050 | * \note_null_stream |
12051 | * \notefnerr |
12052 | * |
12053 | * \sa ::cuEventCreate, |
12054 | * ::cuEventQuery, |
12055 | * ::cuEventSynchronize, |
12056 | * ::cuStreamWaitEvent, |
12057 | * ::cuEventDestroy, |
12058 | * ::cuEventElapsedTime, |
12059 | * ::cudaEventRecord, |
12060 | * ::cuEventRecordWithFlags |
12061 | */ |
12062 | CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); |
12063 | |
12064 | /** |
12065 | * \brief Records an event |
12066 | * |
12067 | * Captures in \p hEvent the contents of \p hStream at the time of this call. |
12068 | * \p hEvent and \p hStream must be from the same context. |
12069 | * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then |
12070 | * examine or wait for completion of the work that was captured. Uses of |
12071 | * \p hStream after this call do not modify \p hEvent. See note on default |
12072 | * stream behavior for what is captured in the default case. |
12073 | * |
12074 | * ::cuEventRecordWithFlags() can be called multiple times on the same event and |
12075 | * will overwrite the previously captured state. Other APIs such as |
12076 | * ::cuStreamWaitEvent() use the most recently captured state at the time |
12077 | * of the API call, and are not affected by later calls to |
12078 | * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an |
12079 | * event represents an empty set of work, so, for example, ::cuEventQuery() |
12080 | * would return ::CUDA_SUCCESS. |
12081 | * |
12082 | * Valid flags include: |
12083 | * - ::CU_EVENT_RECORD_DEFAULT: Default event record flag. |
12084 | * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external |
12085 | * event node when performing stream capture. This flag is invalid outside |
12086 | * of stream capture. |
12087 | * |
12088 | * \param hEvent - Event to record |
12089 | * \param hStream - Stream to record event for |
12090 | * \param flags - See ::CUevent_record_flags |
12091 | * |
12092 | * \return |
12093 | * ::CUDA_SUCCESS, |
12094 | * ::CUDA_ERROR_DEINITIALIZED, |
12095 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12096 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12097 | * ::CUDA_ERROR_INVALID_HANDLE, |
12098 | * ::CUDA_ERROR_INVALID_VALUE |
12099 | * \note_null_stream |
12100 | * \notefnerr |
12101 | * |
12102 | * \sa ::cuEventCreate, |
12103 | * ::cuEventQuery, |
12104 | * ::cuEventSynchronize, |
12105 | * ::cuStreamWaitEvent, |
12106 | * ::cuEventDestroy, |
12107 | * ::cuEventElapsedTime, |
12108 | * ::cuEventRecord, |
12109 | * ::cudaEventRecord |
12110 | */ |
12111 | CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); |
12112 | |
12113 | /** |
12114 | * \brief Queries an event's status |
12115 | * |
12116 | * Queries the status of all work currently captured by \p hEvent. See |
12117 | * ::cuEventRecord() for details on what is captured by an event. |
12118 | * |
12119 | * Returns ::CUDA_SUCCESS if all captured work has completed, or |
12120 | * ::CUDA_ERROR_NOT_READY if any captured work is still incomplete. |
12121 | * |
12122 | * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS |
12123 | * is equivalent to having called ::cuEventSynchronize(). |
12124 | * |
12125 | * \param hEvent - Event to query |
12126 | * |
12127 | * \return |
12128 | * ::CUDA_SUCCESS, |
12129 | * ::CUDA_ERROR_DEINITIALIZED, |
12130 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12131 | * ::CUDA_ERROR_INVALID_HANDLE, |
12132 | * ::CUDA_ERROR_INVALID_VALUE, |
12133 | * ::CUDA_ERROR_NOT_READY |
12134 | * \notefnerr |
12135 | * |
12136 | * \sa ::cuEventCreate, |
12137 | * ::cuEventRecord, |
12138 | * ::cuEventSynchronize, |
12139 | * ::cuEventDestroy, |
12140 | * ::cuEventElapsedTime, |
12141 | * ::cudaEventQuery |
12142 | */ |
12143 | CUresult CUDAAPI cuEventQuery(CUevent hEvent); |
12144 | |
12145 | /** |
12146 | * \brief Waits for an event to complete |
12147 | * |
12148 | * Waits until all work currently captured in \p hEvent has completed. |
12149 | * See ::cuEventRecord() for details on what is captured by an event. |
12150 | * |
12151 | * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC |
12152 | * flag will cause the calling CPU thread to block until the event has |
12153 | * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has |
12154 | * not been set, then the CPU thread will busy-wait until the event has |
12155 | * been completed by the device. |
12156 | * |
12157 | * \param hEvent - Event to wait for |
12158 | * |
12159 | * \return |
12160 | * ::CUDA_SUCCESS, |
12161 | * ::CUDA_ERROR_DEINITIALIZED, |
12162 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12163 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12164 | * ::CUDA_ERROR_INVALID_HANDLE |
12165 | * \notefnerr |
12166 | * |
12167 | * \sa ::cuEventCreate, |
12168 | * ::cuEventRecord, |
12169 | * ::cuEventQuery, |
12170 | * ::cuEventDestroy, |
12171 | * ::cuEventElapsedTime, |
12172 | * ::cudaEventSynchronize |
12173 | */ |
12174 | CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); |
12175 | |
12176 | /** |
12177 | * \brief Destroys an event |
12178 | * |
12179 | * Destroys the event specified by \p hEvent. |
12180 | * |
12181 | * An event may be destroyed before it is complete (i.e., while |
12182 | * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the |
12183 | * call does not block on completion of the event, and any associated |
12184 | * resources are released automatically and asynchronously at completion. |
12185 | * |
12186 | * \param hEvent - Event to destroy |
12187 | * |
12188 | * \return |
12189 | * ::CUDA_SUCCESS, |
12190 | * ::CUDA_ERROR_DEINITIALIZED, |
12191 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12192 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12193 | * ::CUDA_ERROR_INVALID_HANDLE |
12194 | * \notefnerr |
12195 | * |
12196 | * \sa ::cuEventCreate, |
12197 | * ::cuEventRecord, |
12198 | * ::cuEventQuery, |
12199 | * ::cuEventSynchronize, |
12200 | * ::cuEventElapsedTime, |
12201 | * ::cudaEventDestroy |
12202 | */ |
12203 | CUresult CUDAAPI cuEventDestroy(CUevent hEvent); |
12204 | |
12205 | /** |
12206 | * \brief Computes the elapsed time between two events |
12207 | * |
12208 | * Computes the elapsed time between two events (in milliseconds with a |
12209 | * resolution of around 0.5 microseconds). |
12210 | * |
12211 | * If either event was last recorded in a non-NULL stream, the resulting time |
12212 | * may be greater than expected (even if both used the same stream handle). This |
12213 | * happens because the ::cuEventRecord() operation takes place asynchronously |
12214 | * and there is no guarantee that the measured latency is actually just between |
12215 | * the two events. Any number of other different stream operations could execute |
12216 | * in between the two measured events, thus altering the timing in a significant |
12217 | * way. |
12218 | * |
12219 | * If ::cuEventRecord() has not been called on either event, then |
12220 | * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called |
12221 | * on both events but one or both of them have not yet completed (that is, |
12222 | * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the |
12223 | * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with |
12224 | * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return |
12225 | * ::CUDA_ERROR_INVALID_HANDLE. |
12226 | * |
12227 | * \param pMilliseconds - Time between \p hStart and \p hEnd in ms |
12228 | * \param hStart - Starting event |
12229 | * \param hEnd - Ending event |
12230 | * |
12231 | * \return |
12232 | * ::CUDA_SUCCESS, |
12233 | * ::CUDA_ERROR_DEINITIALIZED, |
12234 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12235 | * ::CUDA_ERROR_INVALID_CONTEXT, |
12236 | * ::CUDA_ERROR_INVALID_HANDLE, |
12237 | * ::CUDA_ERROR_NOT_READY |
12238 | * \notefnerr |
12239 | * |
12240 | * \sa ::cuEventCreate, |
12241 | * ::cuEventRecord, |
12242 | * ::cuEventQuery, |
12243 | * ::cuEventSynchronize, |
12244 | * ::cuEventDestroy, |
12245 | * ::cudaEventElapsedTime |
12246 | */ |
12247 | CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); |
12248 | |
12249 | /** @} */ /* END CUDA_EVENT */ |
12250 | |
12251 | /** |
12252 | * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability |
12253 | * |
12254 | * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API |
12255 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
12256 | * |
12257 | * This section describes the external resource interoperability functions of the low-level CUDA |
12258 | * driver application programming interface. |
12259 | * |
12260 | * @{ |
12261 | */ |
12262 | |
12263 | /** |
12264 | * \brief Imports an external memory object |
12265 | * |
12266 | * Imports an externally allocated memory object and returns |
12267 | * a handle to it in \p extMem_out. |
12268 | * |
12269 | * The properties of the handle being imported must be described in |
12270 | * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure |
12271 | * is defined as follows: |
12272 | * |
12273 | * \code |
12274 | typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { |
12275 | CUexternalMemoryHandleType type; |
12276 | union { |
12277 | int fd; |
12278 | struct { |
12279 | void *handle; |
12280 | const void *name; |
12281 | } win32; |
12282 | const void *nvSciBufObject; |
12283 | } handle; |
12284 | unsigned long long size; |
12285 | unsigned int flags; |
12286 | } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; |
12287 | * \endcode |
12288 | * |
12289 | * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type |
12290 | * of handle being imported. ::CUexternalMemoryHandleType is |
12291 | * defined as: |
12292 | * |
12293 | * \code |
12294 | typedef enum CUexternalMemoryHandleType_enum { |
12295 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, |
12296 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
12297 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
12298 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, |
12299 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, |
12300 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, |
12301 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, |
12302 | CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 |
12303 | } CUexternalMemoryHandleType; |
12304 | * \endcode |
12305 | * |
12306 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12307 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then |
12308 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid |
12309 | * file descriptor referencing a memory object. Ownership of |
12310 | * the file descriptor is transferred to the CUDA driver when the |
12311 | * handle is imported successfully. Performing any operations on the |
12312 | * file descriptor after it is imported results in undefined behavior. |
12313 | * |
12314 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12315 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one |
12316 | * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
12317 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
12318 | * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
12319 | * is not NULL, then it must represent a valid shared NT handle that |
12320 | * references a memory object. Ownership of this handle is |
12321 | * not transferred to CUDA after the import operation, so the |
12322 | * application must release the handle using the appropriate system |
12323 | * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12324 | * is not NULL, then it must point to a NULL-terminated array of |
12325 | * UTF-16 characters that refers to a memory object. |
12326 | * |
12327 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12328 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then |
12329 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
12330 | * be non-NULL and |
12331 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12332 | * must be NULL. The handle specified must be a globally shared KMT |
12333 | * handle. This handle does not hold a reference to the underlying |
12334 | * object, and thus will be invalid when all references to the |
12335 | * memory object are destroyed. |
12336 | * |
12337 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12338 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one |
12339 | * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
12340 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
12341 | * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
12342 | * is not NULL, then it must represent a valid shared NT handle that |
12343 | * is returned by ID3D12Device::CreateSharedHandle when referring to an |
12344 | * ID3D12Heap object. This handle holds a reference to the underlying |
12345 | * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12346 | * is not NULL, then it must point to a NULL-terminated array of |
12347 | * UTF-16 characters that refers to an ID3D12Heap object. |
12348 | * |
12349 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12350 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one |
12351 | * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
12352 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
12353 | * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
12354 | * is not NULL, then it must represent a valid shared NT handle that |
12355 | * is returned by ID3D12Device::CreateSharedHandle when referring to an |
12356 | * ID3D12Resource object. This handle holds a reference to the |
12357 | * underlying object. If |
12358 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12359 | * is not NULL, then it must point to a NULL-terminated array of |
12360 | * UTF-16 characters that refers to an ID3D12Resource object. |
12361 | * |
12362 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12363 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then |
12364 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
12365 | * represent a valid shared NT handle that is returned by |
12366 | * IDXGIResource1::CreateSharedHandle when referring to an |
12367 | * ID3D11Resource object. If |
12368 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12369 | * is not NULL, then it must point to a NULL-terminated array of |
12370 | * UTF-16 characters that refers to an ID3D11Resource object. |
12371 | * |
12372 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12373 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then |
12374 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
12375 | * represent a valid shared KMT handle that is returned by |
12376 | * IDXGIResource::GetSharedHandle when referring to an |
12377 | * ID3D11Resource object and |
12378 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
12379 | * must be NULL. |
12380 | * |
12381 | * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
12382 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then |
12383 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL |
12384 | * and reference a valid NvSciBuf object. |
12385 | * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the |
12386 | * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync |
12387 | * as appropriate barriers to maintain coherence between CUDA and the other drivers. |
12388 | * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC |
12389 | * for memory synchronization. |
12390 | * |
12391 | * |
12392 | * The size of the memory object must be specified in |
12393 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. |
12394 | * |
12395 | * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in |
12396 | * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the |
12397 | * resource is a dedicated resource. The definition of what constitutes a |
12398 | * dedicated resource is outside the scope of this extension. |
12399 | * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type |
12400 | * is one of the following: |
12401 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE |
12402 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE |
12403 | * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
12404 | * |
12405 | * \param extMem_out - Returned handle to an external memory object |
12406 | * \param memHandleDesc - Memory import handle descriptor |
12407 | * |
12408 | * \return |
12409 | * ::CUDA_SUCCESS, |
12410 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12411 | * ::CUDA_ERROR_INVALID_VALUE, |
12412 | * ::CUDA_ERROR_INVALID_HANDLE |
12413 | * \notefnerr |
12414 | * |
12415 | * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the |
12416 | * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges |
12417 | * as well as appropriate Vulkan pipeline barriers to maintain coherence between |
12418 | * CPU and GPU. For more information on these APIs, please refer to "Synchronization |
12419 | * and Cache Control" chapter from Vulkan specification. |
12420 | * |
12421 | * \sa ::cuDestroyExternalMemory, |
12422 | * ::cuExternalMemoryGetMappedBuffer, |
12423 | * ::cuExternalMemoryGetMappedMipmappedArray |
12424 | */ |
12425 | CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); |
12426 | |
12427 | /** |
12428 | * \brief Maps a buffer onto an imported memory object |
12429 | * |
12430 | * Maps a buffer onto an imported memory object and returns a device |
12431 | * pointer in \p devPtr. |
12432 | * |
12433 | * The properties of the buffer being mapped must be described in |
12434 | * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is |
12435 | * defined as follows: |
12436 | * |
12437 | * \code |
12438 | typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { |
12439 | unsigned long long offset; |
12440 | unsigned long long size; |
12441 | unsigned int flags; |
12442 | } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; |
12443 | * \endcode |
12444 | * |
12445 | * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in |
12446 | * the memory object at which the buffer's base address lies. |
12447 | * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. |
12448 | * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. |
12449 | * |
12450 | * The offset and size have to be suitably aligned to match the |
12451 | * requirements of the external API. Mapping two buffers whose ranges |
12452 | * overlap may or may not result in the same virtual address being |
12453 | * returned for the overlapped portion. In such cases, the application |
12454 | * must ensure that all accesses to that region from the GPU are |
12455 | * volatile. Otherwise writes made via one address are not guaranteed |
12456 | * to be visible via the other address, even if they're issued by the |
12457 | * same thread. It is recommended that applications map the combined |
12458 | * range instead of mapping separate buffers and then apply the |
12459 | * appropriate offsets to the returned pointer to derive the |
12460 | * individual buffers. |
12461 | * |
12462 | * The returned pointer \p devPtr must be freed using ::cuMemFree. |
12463 | * |
12464 | * \param devPtr - Returned device pointer to buffer |
12465 | * \param extMem - Handle to external memory object |
12466 | * \param bufferDesc - Buffer descriptor |
12467 | * |
12468 | * \return |
12469 | * ::CUDA_SUCCESS, |
12470 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12471 | * ::CUDA_ERROR_INVALID_VALUE, |
12472 | * ::CUDA_ERROR_INVALID_HANDLE |
12473 | * \notefnerr |
12474 | * |
12475 | * \sa ::cuImportExternalMemory, |
12476 | * ::cuDestroyExternalMemory, |
12477 | * ::cuExternalMemoryGetMappedMipmappedArray |
12478 | */ |
12479 | CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); |
12480 | |
12481 | /** |
12482 | * \brief Maps a CUDA mipmapped array onto an external memory object |
12483 | * |
12484 | * Maps a CUDA mipmapped array onto an external memory object and returns a |
12485 | * handle to it in \p mipmap. |
12486 | * |
12487 | * The properties of the CUDA mipmapped array being mapped must be |
12488 | * described in \p mipmapDesc. The structure |
12489 | * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: |
12490 | * |
12491 | * \code |
12492 | typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { |
12493 | unsigned long long offset; |
12494 | CUDA_ARRAY3D_DESCRIPTOR arrayDesc; |
12495 | unsigned int numLevels; |
12496 | } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; |
12497 | * \endcode |
12498 | * |
12499 | * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the |
12500 | * offset in the memory object where the base level of the mipmap |
12501 | * chain is. |
12502 | * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes |
12503 | * the format, dimensions and type of the base level of the mipmap |
12504 | * chain. For further details on these parameters, please refer to the |
12505 | * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped |
12506 | * array is bound as a color target in the graphics API, then the flag |
12507 | * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in |
12508 | * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. |
12509 | * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies |
12510 | * the total number of levels in the mipmap chain. |
12511 | * |
12512 | * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then |
12513 | * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. |
12514 | * |
12515 | * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. |
12516 | * |
12517 | * \param mipmap - Returned CUDA mipmapped array |
12518 | * \param extMem - Handle to external memory object |
12519 | * \param mipmapDesc - CUDA mipmapped array descriptor |
12520 | * |
12521 | * \return |
12522 | * ::CUDA_SUCCESS, |
12523 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12524 | * ::CUDA_ERROR_INVALID_VALUE, |
12525 | * ::CUDA_ERROR_INVALID_HANDLE |
12526 | * \notefnerr |
12527 | * |
12528 | * \sa ::cuImportExternalMemory, |
12529 | * ::cuDestroyExternalMemory, |
12530 | * ::cuExternalMemoryGetMappedBuffer |
12531 | */ |
12532 | CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); |
12533 | |
12534 | /** |
12535 | * \brief Destroys an external memory object |
12536 | * |
12537 | * Destroys the specified external memory object. Any existing buffers |
12538 | * and CUDA mipmapped arrays mapped onto this object must no longer be |
12539 | * used and must be explicitly freed using ::cuMemFree and |
12540 | * ::cuMipmappedArrayDestroy, respectively. |
12541 | * |
12542 | * \param extMem - External memory object to be destroyed |
12543 | * |
12544 | * \return |
12545 | * ::CUDA_SUCCESS, |
12546 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12547 | * ::CUDA_ERROR_INVALID_HANDLE |
12548 | * \notefnerr |
12549 | * |
12550 | * \sa ::cuImportExternalMemory, |
12551 | * ::cuExternalMemoryGetMappedBuffer, |
12552 | * ::cuExternalMemoryGetMappedMipmappedArray |
12553 | */ |
12554 | CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); |
12555 | |
12556 | /** |
12557 | * \brief Imports an external semaphore |
12558 | * |
12559 | * Imports an externally allocated synchronization object and returns |
12560 | * a handle to that in \p extSem_out. |
12561 | * |
12562 | * The properties of the handle being imported must be described in |
12563 | * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is |
12564 | * defined as follows: |
12565 | * |
12566 | * \code |
12567 | typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { |
12568 | CUexternalSemaphoreHandleType type; |
12569 | union { |
12570 | int fd; |
12571 | struct { |
12572 | void *handle; |
12573 | const void *name; |
12574 | } win32; |
12575 | const void* NvSciSyncObj; |
12576 | } handle; |
12577 | unsigned int flags; |
12578 | } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; |
12579 | * \endcode |
12580 | * |
12581 | * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of |
12582 | * handle being imported. ::CUexternalSemaphoreHandleType is defined |
12583 | * as: |
12584 | * |
12585 | * \code |
12586 | typedef enum CUexternalSemaphoreHandleType_enum { |
12587 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, |
12588 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
12589 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
12590 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, |
12591 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, |
12592 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, |
12593 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, |
12594 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, |
12595 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, |
12596 | CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 |
12597 | } CUexternalSemaphoreHandleType; |
12598 | * \endcode |
12599 | * |
12600 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12601 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then |
12602 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid |
12603 | * file descriptor referencing a synchronization object. Ownership of |
12604 | * the file descriptor is transferred to the CUDA driver when the |
12605 | * handle is imported successfully. Performing any operations on the |
12606 | * file descriptor after it is imported results in undefined behavior. |
12607 | * |
12608 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12609 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one |
12610 | * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and |
12611 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be |
12612 | * NULL. If |
12613 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12614 | * is not NULL, then it must represent a valid shared NT handle that |
12615 | * references a synchronization object. Ownership of this handle is |
12616 | * not transferred to CUDA after the import operation, so the |
12617 | * application must release the handle using the appropriate system |
12618 | * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12619 | * is not NULL, then it must name a valid synchronization object. |
12620 | * |
12621 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12622 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then |
12623 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must |
12624 | * be non-NULL and |
12625 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12626 | * must be NULL. The handle specified must be a globally shared KMT |
12627 | * handle. This handle does not hold a reference to the underlying |
12628 | * object, and thus will be invalid when all references to the |
12629 | * synchronization object are destroyed. |
12630 | * |
12631 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12632 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one |
12633 | * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and |
12634 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be |
12635 | * NULL. If |
12636 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12637 | * is not NULL, then it must represent a valid shared NT handle that |
12638 | * is returned by ID3D12Device::CreateSharedHandle when referring to a |
12639 | * ID3D12Fence object. This handle holds a reference to the underlying |
12640 | * object. If |
12641 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12642 | * is not NULL, then it must name a valid synchronization object that |
12643 | * refers to a valid ID3D12Fence object. |
12644 | * |
12645 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12646 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then |
12647 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12648 | * represents a valid shared NT handle that is returned by |
12649 | * ID3D11Fence::CreateSharedHandle. If |
12650 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12651 | * is not NULL, then it must name a valid synchronization object that |
12652 | * refers to a valid ID3D11Fence object. |
12653 | * |
12654 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12655 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then |
12656 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj |
12657 | * represents a valid NvSciSyncObj. |
12658 | * |
12659 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then |
12660 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12661 | * represents a valid shared NT handle that |
12662 | * is returned by IDXGIResource1::CreateSharedHandle when referring to |
12663 | * a IDXGIKeyedMutex object. If |
12664 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12665 | * is not NULL, then it must name a valid synchronization object that |
12666 | * refers to a valid IDXGIKeyedMutex object. |
12667 | * |
12668 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12669 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then |
12670 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12671 | * represents a valid shared KMT handle that |
12672 | * is returned by IDXGIResource::GetSharedHandle when referring to |
12673 | * a IDXGIKeyedMutex object and |
12674 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. |
12675 | * |
12676 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12677 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then |
12678 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid |
12679 | * file descriptor referencing a synchronization object. Ownership of |
12680 | * the file descriptor is transferred to the CUDA driver when the |
12681 | * handle is imported successfully. Performing any operations on the |
12682 | * file descriptor after it is imported results in undefined behavior. |
12683 | * |
12684 | * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
12685 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one |
12686 | * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and |
12687 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be |
12688 | * NULL. If |
12689 | * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
12690 | * is not NULL, then it must represent a valid shared NT handle that |
12691 | * references a synchronization object. Ownership of this handle is |
12692 | * not transferred to CUDA after the import operation, so the |
12693 | * application must release the handle using the appropriate system |
12694 | * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
12695 | * is not NULL, then it must name a valid synchronization object. |
12696 | * |
12697 | * \param extSem_out - Returned handle to an external semaphore |
12698 | * \param semHandleDesc - Semaphore import handle descriptor |
12699 | * |
12700 | * \return |
12701 | * ::CUDA_SUCCESS, |
12702 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12703 | * ::CUDA_ERROR_NOT_SUPPORTED, |
12704 | * ::CUDA_ERROR_INVALID_HANDLE |
12705 | * \notefnerr |
12706 | * |
12707 | * \sa ::cuDestroyExternalSemaphore, |
12708 | * ::cuSignalExternalSemaphoresAsync, |
12709 | * ::cuWaitExternalSemaphoresAsync |
12710 | */ |
12711 | CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); |
12712 | |
12713 | /** |
12714 | * \brief Signals a set of external semaphore objects |
12715 | * |
12716 | * Enqueues a signal operation on a set of externally allocated |
12717 | * semaphore objects in the specified stream. The operations will be |
12718 | * executed when all prior operations in the stream complete. |
12719 | * |
12720 | * The exact semantics of signaling a semaphore depends on the type of |
12721 | * the object. |
12722 | * |
12723 | * If the semaphore object is any one of the following types: |
12724 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, |
12725 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, |
12726 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
12727 | * then signaling the semaphore will set it to the signaled state. |
12728 | * |
12729 | * If the semaphore object is any one of the following types: |
12730 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, |
12731 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, |
12732 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, |
12733 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 |
12734 | * then the semaphore will be set to the value specified in |
12735 | * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. |
12736 | * |
12737 | * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC |
12738 | * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence |
12739 | * to a value that can be used by subsequent waiters of the same NvSciSync object |
12740 | * to order operations with those currently submitted in \p stream. Such an update |
12741 | * will overwrite previous contents of |
12742 | * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, |
12743 | * signaling such an external semaphore object causes appropriate memory synchronization |
12744 | * operations to be performed over all external memory objects that are imported as |
12745 | * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses |
12746 | * made by other importers of the same set of NvSciBuf memory object(s) are coherent. |
12747 | * These operations can be skipped by specifying the flag |
12748 | * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a |
12749 | * performance optimization when data coherency is not required. But specifying this |
12750 | * flag in scenarios where data coherency is required results in undefined behavior. |
12751 | * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
12752 | * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in |
12753 | * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return |
12754 | * CUDA_ERROR_NOT_SUPPORTED. |
12755 | * |
12756 | * If the semaphore object is any one of the following types: |
12757 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, |
12758 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
12759 | * then the keyed mutex will be released with the key specified in |
12760 | * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key. |
12761 | * |
12762 | * \param extSemArray - Set of external semaphores to be signaled |
12763 | * \param paramsArray - Array of semaphore parameters |
12764 | * \param numExtSems - Number of semaphores to signal |
12765 | * \param stream - Stream to enqueue the signal operations in |
12766 | * |
12767 | * \return |
12768 | * ::CUDA_SUCCESS, |
12769 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12770 | * ::CUDA_ERROR_INVALID_HANDLE, |
12771 | * ::CUDA_ERROR_NOT_SUPPORTED |
12772 | * \notefnerr |
12773 | * |
12774 | * \sa ::cuImportExternalSemaphore, |
12775 | * ::cuDestroyExternalSemaphore, |
12776 | * ::cuWaitExternalSemaphoresAsync |
12777 | */ |
12778 | CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
12779 | |
12780 | /** |
12781 | * \brief Waits on a set of external semaphore objects |
12782 | * |
12783 | * Enqueues a wait operation on a set of externally allocated |
12784 | * semaphore objects in the specified stream. The operations will be |
12785 | * executed when all prior operations in the stream complete. |
12786 | * |
12787 | * The exact semantics of waiting on a semaphore depends on the type |
12788 | * of the object. |
12789 | * |
12790 | * If the semaphore object is any one of the following types: |
12791 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, |
12792 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, |
12793 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
12794 | * then waiting on the semaphore will wait until the semaphore reaches |
12795 | * the signaled state. The semaphore will then be reset to the |
12796 | * unsignaled state. Therefore for every signal operation, there can |
12797 | * only be one wait operation. |
12798 | * |
12799 | * If the semaphore object is any one of the following types: |
12800 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, |
12801 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, |
12802 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, |
12803 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 |
12804 | * then waiting on the semaphore will wait until the value of the |
12805 | * semaphore is greater than or equal to |
12806 | * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. |
12807 | * |
12808 | * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC |
12809 | * then, waiting on the semaphore will wait until the |
12810 | * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::nvSciSync::fence is signaled by the |
12811 | * signaler of the NvSciSyncObj that was associated with this semaphore object. |
12812 | * By default, waiting on such an external semaphore object causes appropriate |
12813 | * memory synchronization operations to be performed over all external memory objects |
12814 | * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that |
12815 | * any subsequent accesses made by other importers of the same set of NvSciBuf memory |
12816 | * object(s) are coherent. These operations can be skipped by specifying the flag |
12817 | * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a |
12818 | * performance optimization when data coherency is not required. But specifying this |
12819 | * flag in scenarios where data coherency is required results in undefined behavior. |
12820 | * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
12821 | * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in |
12822 | * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return |
12823 | * CUDA_ERROR_NOT_SUPPORTED. |
12824 | * |
12825 | * If the semaphore object is any one of the following types: |
12826 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, |
12827 | * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
12828 | * then the keyed mutex will be acquired when it is released with the key |
12829 | * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key |
12830 | * or until the timeout specified by |
12831 | * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs |
12832 | * has lapsed. The timeout interval can either be a finite value |
12833 | * specified in milliseconds or an infinite value. In case an infinite |
12834 | * value is specified the timeout never elapses. The Windows INFINITE |
12835 | * macro must be used to specify infinite timeout. |
12836 | * |
12837 | * \param extSemArray - External semaphores to be waited on |
12838 | * \param paramsArray - Array of semaphore parameters |
12839 | * \param numExtSems - Number of semaphores to wait on |
12840 | * \param stream - Stream to enqueue the wait operations in |
12841 | * |
12842 | * \return |
12843 | * ::CUDA_SUCCESS, |
12844 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12845 | * ::CUDA_ERROR_INVALID_HANDLE, |
12846 | * ::CUDA_ERROR_NOT_SUPPORTED, |
12847 | * ::CUDA_ERROR_TIMEOUT |
12848 | * \notefnerr |
12849 | * |
12850 | * \sa ::cuImportExternalSemaphore, |
12851 | * ::cuDestroyExternalSemaphore, |
12852 | * ::cuSignalExternalSemaphoresAsync |
12853 | */ |
12854 | CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
12855 | |
12856 | /** |
12857 | * \brief Destroys an external semaphore |
12858 | * |
12859 | * Destroys an external semaphore object and releases any references |
12860 | * to the underlying resource. Any outstanding signals or waits must |
12861 | * have completed before the semaphore is destroyed. |
12862 | * |
12863 | * \param extSem - External semaphore to be destroyed (a handle returned by ::cuImportExternalSemaphore) |
12864 | * |
12865 | * \return |
12866 | * ::CUDA_SUCCESS, |
12867 | * ::CUDA_ERROR_NOT_INITIALIZED, |
12868 | * ::CUDA_ERROR_INVALID_HANDLE |
12869 | * \notefnerr |
12870 | * |
12871 | * \sa ::cuImportExternalSemaphore, |
12872 | * ::cuSignalExternalSemaphoresAsync, |
12873 | * ::cuWaitExternalSemaphoresAsync |
12874 | */ |
12875 | CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); |
12876 | |
12877 | /** @} */ /* END CUDA_EXTRES_INTEROP */ |
12878 | |
12879 | /** |
12880 | * \defgroup CUDA_MEMOP Stream memory operations |
12881 | * |
12882 | * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API |
12883 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
12884 | * |
12885 | * This section describes the stream memory operations of the low-level CUDA |
12886 | * driver application programming interface. |
12887 | * |
12888 | * The whole set of operations is disabled by default. Users are required |
12889 | * to explicitly enable them, e.g. on Linux by passing the kernel module |
12890 | * parameter shown below: |
12891 | * modprobe nvidia NVreg_EnableStreamMemOPs=1 |
12892 | * There is currently no way to enable these operations on other operating |
12893 | * systems. |
12894 | * |
12895 | * Users can programmatically query whether the device supports these |
12896 | * operations with ::cuDeviceGetAttribute() and |
12897 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
12898 | * |
12899 | * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with |
12900 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. |
12901 | * |
12902 | * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() |
12903 | * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and |
12904 | * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with |
12905 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
12906 | * |
12907 | * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and |
12908 | * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform |
12909 | * hardware features and can be queried with ::cuDeviceGetAttribute() and |
12910 | * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. |
12911 | * |
12912 | * Note that all memory pointers passed as parameters to these operations |
12913 | * are device pointers. Where necessary a device pointer should be |
12914 | * obtained, for example with ::cuMemHostGetDevicePointer(). |
12915 | * |
12916 | * None of the operations accepts pointers to managed memory buffers |
12917 | * (::cuMemAllocManaged). |
12918 | * |
12919 | * @{ |
12920 | */ |
12921 | |
12922 | /** |
12923 | * \brief Wait on a memory location |
12924 | * |
12925 | * Enqueues a synchronization of the stream on the given memory location. Work |
12926 | * ordered after the operation will block until the given condition on the |
12927 | * memory is satisfied. By default, the condition is to wait for |
12928 | * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. |
12929 | * Other condition types can be specified via \p flags. |
12930 | * |
12931 | * If the memory was registered via ::cuMemHostRegister(), the device pointer |
12932 | * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
12933 | * be used with managed memory (::cuMemAllocManaged). |
12934 | * |
12935 | * Support for this can be queried with ::cuDeviceGetAttribute() and |
12936 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
12937 | * |
12938 | * Support for ::CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and |
12939 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. |
12940 | * |
12941 | * \param stream The stream to synchronize on the memory location. |
12942 | * \param addr The memory location to wait on. |
12943 | * \param value The value to compare with the memory location. |
12944 | * \param flags See ::CUstreamWaitValue_flags. |
12945 | * |
12946 | * \return |
12947 | * ::CUDA_SUCCESS, |
12948 | * ::CUDA_ERROR_INVALID_VALUE, |
12949 | * ::CUDA_ERROR_NOT_SUPPORTED |
12950 | * \notefnerr |
12951 | * |
12952 | * \sa ::cuStreamWaitValue64, |
12953 | * ::cuStreamWriteValue32, |
12954 | * ::cuStreamWriteValue64, |
12955 | * ::cuStreamBatchMemOp, |
12956 | * ::cuMemHostRegister, |
12957 | * ::cuStreamWaitEvent |
12958 | */ |
12959 | CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
12960 | |
12961 | /** |
12962 | * \brief Wait on a memory location |
12963 | * |
12964 | * Enqueues a synchronization of the stream on the given memory location. Work |
12965 | * ordered after the operation will block until the given condition on the |
12966 | * memory is satisfied. By default, the condition is to wait for |
12967 | * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. |
12968 | * Other condition types can be specified via \p flags. |
12969 | * |
12970 | * If the memory was registered via ::cuMemHostRegister(), the device pointer |
12971 | * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot be used with managed memory (::cuMemAllocManaged). |
12972 | * |
12973 | * Support for this can be queried with ::cuDeviceGetAttribute() and |
12974 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
12975 | * |
12976 | * \param stream The stream to synchronize on the memory location. |
12977 | * \param addr The memory location to wait on. |
12978 | * \param value The value to compare with the memory location. |
12979 | * \param flags See ::CUstreamWaitValue_flags. |
12980 | * |
12981 | * \return |
12982 | * ::CUDA_SUCCESS, |
12983 | * ::CUDA_ERROR_INVALID_VALUE, |
12984 | * ::CUDA_ERROR_NOT_SUPPORTED |
12985 | * \notefnerr |
12986 | * |
12987 | * \sa ::cuStreamWaitValue32, |
12988 | * ::cuStreamWriteValue32, |
12989 | * ::cuStreamWriteValue64, |
12990 | * ::cuStreamBatchMemOp, |
12991 | * ::cuMemHostRegister, |
12992 | * ::cuStreamWaitEvent |
12993 | */ |
12994 | CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
12995 | |
12996 | /** |
12997 | * \brief Write a value to memory |
12998 | * |
12999 | * Writes \p value to the memory at \p addr. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER |
13000 | * flag is passed, the write is preceded by a system-wide memory fence, |
13001 | * equivalent to a __threadfence_system() but scoped to the stream |
13002 | * rather than a CUDA thread. |
13003 | * |
13004 | * If the memory was registered via ::cuMemHostRegister(), the device pointer |
13005 | * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
13006 | * be used with managed memory (::cuMemAllocManaged). |
13007 | * |
13008 | * Support for this can be queried with ::cuDeviceGetAttribute() and |
13009 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
13010 | * |
13011 | * \param stream The stream to do the write in. |
13012 | * \param addr The device address to write to. |
13013 | * \param value The value to write. |
13014 | * \param flags See ::CUstreamWriteValue_flags. |
13015 | * |
13016 | * \return |
13017 | * ::CUDA_SUCCESS, |
13018 | * ::CUDA_ERROR_INVALID_VALUE, |
13019 | * ::CUDA_ERROR_NOT_SUPPORTED |
13020 | * \notefnerr |
13021 | * |
13022 | * \sa ::cuStreamWriteValue64, |
13023 | * ::cuStreamWaitValue32, |
13024 | * ::cuStreamWaitValue64, |
13025 | * ::cuStreamBatchMemOp, |
13026 | * ::cuMemHostRegister, |
13027 | * ::cuEventRecord |
13028 | */ |
13029 | CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
13030 | |
13031 | /** |
13032 | * \brief Write a value to memory |
13033 | * |
13034 | * Writes \p value to the memory at \p addr. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER |
13035 | * flag is passed, the write is preceded by a system-wide memory fence, |
13036 | * equivalent to a __threadfence_system() but scoped to the stream |
13037 | * rather than a CUDA thread. |
13038 | * |
13039 | * If the memory was registered via ::cuMemHostRegister(), the device pointer |
13040 | * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot be used with managed memory (::cuMemAllocManaged). |
13041 | * |
13042 | * Support for this can be queried with ::cuDeviceGetAttribute() and |
13043 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
13044 | * |
13045 | * \param stream The stream to do the write in. |
13046 | * \param addr The device address to write to. |
13047 | * \param value The value to write. |
13048 | * \param flags See ::CUstreamWriteValue_flags. |
13049 | * |
13050 | * \return |
13051 | * ::CUDA_SUCCESS, |
13052 | * ::CUDA_ERROR_INVALID_VALUE, |
13053 | * ::CUDA_ERROR_NOT_SUPPORTED |
13054 | * \notefnerr |
13055 | * |
13056 | * \sa ::cuStreamWriteValue32, |
13057 | * ::cuStreamWaitValue32, |
13058 | * ::cuStreamWaitValue64, |
13059 | * ::cuStreamBatchMemOp, |
13060 | * ::cuMemHostRegister, |
13061 | * ::cuEventRecord |
13062 | */ |
13063 | CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
13064 | |
13065 | /** |
13066 | * \brief Batch operations to synchronize the stream via memory operations |
13067 | * |
13068 | * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). |
13069 | * Batching operations may avoid some performance overhead in both the API call |
13070 | * and the device execution versus adding them to the stream in separate API |
13071 | * calls. The operations are enqueued in the order they appear in the array. |
13072 | * |
13073 | * See ::CUstreamBatchMemOpType for the full set of supported operations, and |
13074 | * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), |
13075 | * and ::cuStreamWriteValue64() for details of specific operations. |
13076 | * |
13077 | * Basic support for this can be queried with ::cuDeviceGetAttribute() and |
13078 | * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details |
13079 | * on querying support for specific operations. |
13080 | * |
13081 | * \param stream The stream to enqueue the operations in. |
13082 | * \param count The number of operations in the array. Must be less than 256. |
13083 | * \param paramArray Array of \p count operations, giving the type and parameters of each. |
13084 | * \param flags Reserved for future expansion; must be 0. |
13085 | * |
13086 | * \return |
13087 | * ::CUDA_SUCCESS, |
13088 | * ::CUDA_ERROR_INVALID_VALUE, |
13089 | * ::CUDA_ERROR_NOT_SUPPORTED |
13090 | * \notefnerr |
13091 | * |
13092 | * \sa ::cuStreamWaitValue32, |
13093 | * ::cuStreamWaitValue64, |
13094 | * ::cuStreamWriteValue32, |
13095 | * ::cuStreamWriteValue64, |
13096 | * ::cuMemHostRegister |
13097 | */ |
13098 | CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); |
13099 | |
13100 | /** @} */ /* END CUDA_MEMOP */ |
13101 | |
13102 | /** |
13103 | * \defgroup CUDA_EXEC Execution Control |
13104 | * |
13105 | * ___MANBRIEF___ execution control functions of the low-level CUDA driver API |
13106 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
13107 | * |
13108 | * This section describes the execution control functions of the low-level CUDA |
13109 | * driver application programming interface. |
13110 | * |
13111 | * @{ |
13112 | */ |
13113 | |
13114 | /** |
13115 | * \brief Returns information about a function |
13116 | * |
13117 | * Returns in \p *pi the integer value of the attribute \p attrib on the kernel |
13118 | * given by \p hfunc. The supported attributes are: |
13119 | * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads |
13120 | * per block, beyond which a launch of the function would fail. This number |
13121 | * depends on both the function and the device on which the function is |
13122 | * currently loaded. |
13123 | * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of |
13124 | * statically-allocated shared memory per block required by this function. |
13125 | * This does not include dynamically-allocated shared memory requested by |
13126 | * the user at runtime. |
13127 | * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated |
13128 | * constant memory required by this function. |
13129 | * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory |
13130 | * used by each thread of this function. |
13131 | * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread |
13132 | * of this function. |
13133 | * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for |
13134 | * which the function was compiled. This value is the major PTX version * 10 |
13135 | * + the minor PTX version, so a PTX version 1.3 function would return the |
13136 | * value 13. Note that this may return the undefined value of 0 for cubins |
13137 | * compiled prior to CUDA 3.0. |
13138 | * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for |
13139 | * which the function was compiled. This value is the major binary |
13140 | * version * 10 + the minor binary version, so a binary version 1.3 function |
13141 | * would return the value 13. Note that this will return a value of 10 for |
13142 | * legacy cubins that do not have a properly-encoded binary architecture |
13143 | * version. |
13144 | * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has |
13145 | * been compiled with the user-specified option "-Xptxas --dlcm=ca" set. |
13146 | * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of |
13147 | * dynamically-allocated shared memory. |
13148 | * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 |
13149 | * cache split ratio in percent of total shared memory. |
13150 | * |
13151 | * \param pi - Returned attribute value |
13152 | * \param attrib - Attribute requested |
13153 | * \param hfunc - Function to query attribute of |
13154 | * |
13155 | * \return |
13156 | * ::CUDA_SUCCESS, |
13157 | * ::CUDA_ERROR_DEINITIALIZED, |
13158 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13159 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13160 | * ::CUDA_ERROR_INVALID_HANDLE, |
13161 | * ::CUDA_ERROR_INVALID_VALUE |
13162 | * \notefnerr |
13163 | * |
13164 | * \sa ::cuCtxGetCacheConfig, |
13165 | * ::cuCtxSetCacheConfig, |
13166 | * ::cuFuncSetCacheConfig, |
13167 | * ::cuLaunchKernel, |
13168 | * ::cudaFuncGetAttributes, |
13169 | * ::cudaFuncSetAttribute |
13170 | */ |
13171 | CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); |
13172 | |
13173 | /** |
13174 | * \brief Sets information about a function |
13175 | * |
13176 | * This call sets the value of a specified attribute \p attrib on the kernel given |
13177 | * by \p hfunc to an integer value specified by \p value. |
13178 | * This function returns ::CUDA_SUCCESS if the new value of the attribute could be |
13179 | * successfully set. If the set fails, this call will return an error. |
13180 | * Not all attributes can have values set. Attempting to set a value on a read-only |
13181 | * attribute will result in an error (::CUDA_ERROR_INVALID_VALUE). |
13182 | * |
13183 | * Supported attributes for the cuFuncSetAttribute call are: |
13184 | * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of |
13185 | * dynamically-allocated shared memory. The value should contain the requested |
13186 | * maximum size of dynamically-allocated shared memory. The sum of this value and |
13187 | * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the |
13188 | * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. |
13189 | * The maximal size of requestable dynamic shared memory may differ by GPU |
13190 | * architecture. |
13191 | * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 |
13192 | * cache and shared memory use the same hardware resources, this sets the shared memory |
13193 | * carveout preference, in percent of the total shared memory. |
13194 | * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. |
13195 | * This is only a hint, and the driver can choose a different ratio if required to execute the function. |
13196 | * |
13197 | * \param hfunc - Function to set attribute of |
13198 | * \param attrib - Attribute to set |
13199 | * \param value - The value to set |
13200 | * |
13201 | * \return |
13202 | * ::CUDA_SUCCESS, |
13203 | * ::CUDA_ERROR_DEINITIALIZED, |
13204 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13205 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13206 | * ::CUDA_ERROR_INVALID_HANDLE, |
13207 | * ::CUDA_ERROR_INVALID_VALUE |
13208 | * \notefnerr |
13209 | * |
13210 | * \sa ::cuCtxGetCacheConfig, |
13211 | * ::cuCtxSetCacheConfig, |
13212 | * ::cuFuncSetCacheConfig, |
13213 | * ::cuLaunchKernel, |
13214 | * ::cudaFuncGetAttributes, |
13215 | * ::cudaFuncSetAttribute |
13216 | */ |
13217 | CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); |
13218 | |
13219 | /** |
13220 | * \brief Sets the preferred cache configuration for a device function |
13221 | * |
13222 | * On devices where the L1 cache and shared memory use the same hardware |
13223 | * resources, this sets through \p config the preferred cache configuration for |
13224 | * the device function \p hfunc. This is only a preference. The driver will use |
13225 | * the requested configuration if possible, but it is free to choose a different |
13226 | * configuration if required to execute \p hfunc. Any context-wide preference |
13227 | * set via ::cuCtxSetCacheConfig() will be overridden by this per-function |
13228 | * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In |
13229 | * that case, the current context-wide setting will be used. |
13230 | * |
13231 | * This setting does nothing on devices where the sizes of the L1 cache and |
13232 | * shared memory are fixed. |
13233 | * |
13234 | * Launching a kernel with a different preference than the most recent |
13235 | * preference setting may insert a device-side synchronization point. |
13236 | * |
13237 | * |
13238 | * The supported cache configurations are: |
13239 | * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
13240 | * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
13241 | * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
13242 | * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal-sized L1 cache and shared memory |
13243 | * |
13244 | * \param hfunc - Kernel to configure cache for |
13245 | * \param config - Requested cache configuration |
13246 | * |
13247 | * \return |
13248 | * ::CUDA_SUCCESS, |
13249 | * ::CUDA_ERROR_INVALID_VALUE, |
13250 | * ::CUDA_ERROR_DEINITIALIZED, |
13251 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13252 | * ::CUDA_ERROR_INVALID_CONTEXT |
13253 | * \notefnerr |
13254 | * |
13255 | * \sa ::cuCtxGetCacheConfig, |
13256 | * ::cuCtxSetCacheConfig, |
13257 | * ::cuFuncGetAttribute, |
13258 | * ::cuLaunchKernel, |
13259 | * ::cudaFuncSetCacheConfig |
13260 | */ |
13261 | CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); |
13262 | |
13263 | /** |
13264 | * \brief Sets the shared memory configuration for a device function. |
13265 | * |
13266 | * On devices with configurable shared memory banks, this function will |
13267 | * force all subsequent launches of the specified device function to have |
13268 | * the given shared memory bank size configuration. On any given launch of the |
13269 | * function, the shared memory configuration of the device will be temporarily |
13270 | * changed if needed to suit the function's preferred configuration. Changes in |
13271 | * shared memory configuration between subsequent launches of functions |
13272 | * may introduce a device-side synchronization point. |
13273 | * |
13274 | * Any per-function setting of shared memory bank size set via |
13275 | * ::cuFuncSetSharedMemConfig will override the context wide setting set with |
13276 | * ::cuCtxSetSharedMemConfig. |
13277 | * |
13278 | * Changing the shared memory bank size will not increase shared memory usage |
13279 | * or affect occupancy of kernels, but may have major effects on performance. |
13280 | * Larger bank sizes will allow for greater potential bandwidth to shared memory, |
13281 | * but will change what kinds of accesses to shared memory will result in bank |
13282 | * conflicts. |
13283 | * |
13284 | * This function will do nothing on devices with fixed shared memory bank size. |
13285 | * |
13286 | * The supported bank configurations are: |
13287 | * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory |
13288 | * configuration when launching this function. |
13289 | * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to |
13290 | * be natively four bytes when launching this function. |
13291 | * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to |
13292 | * be natively eight bytes when launching this function. |
13293 | * |
13294 | * \param hfunc - kernel to be given a shared memory config |
13295 | * \param config - requested shared memory configuration |
13296 | * |
13297 | * \return |
13298 | * ::CUDA_SUCCESS, |
13299 | * ::CUDA_ERROR_INVALID_VALUE, |
13300 | * ::CUDA_ERROR_DEINITIALIZED, |
13301 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13302 | * ::CUDA_ERROR_INVALID_CONTEXT |
13303 | * \notefnerr |
13304 | * |
13305 | * \sa ::cuCtxGetCacheConfig, |
13306 | * ::cuCtxSetCacheConfig, |
13307 | * ::cuCtxGetSharedMemConfig, |
13308 | * ::cuCtxSetSharedMemConfig, |
13309 | * ::cuFuncGetAttribute, |
13310 | * ::cuLaunchKernel, |
13311 | * ::cudaFuncSetSharedMemConfig |
13312 | */ |
13313 | CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); |
13314 | |
13315 | /** |
13316 | * \brief Returns a module handle |
13317 | * |
13318 | * Returns in \p *hmod the handle of the module that function \p hfunc |
13319 | * is located in. The lifetime of the module corresponds to the lifetime of |
13320 | * the context it was loaded in or until the module is explicitly unloaded. |
13321 | * |
13322 | * The CUDA runtime manages its own modules loaded into the primary context. |
13323 | * If the handle returned by this API refers to a module loaded by the CUDA runtime, |
13324 | * calling ::cuModuleUnload() on that module will result in undefined behavior. |
13325 | * |
13326 | * \param hmod - Returned module handle |
13327 | * \param hfunc - Function to retrieve module for |
13328 | * |
13329 | * \return |
13330 | * ::CUDA_SUCCESS, |
13331 | * ::CUDA_ERROR_DEINITIALIZED, |
13332 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13333 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13334 | * ::CUDA_ERROR_INVALID_VALUE, |
13335 | * ::CUDA_ERROR_NOT_FOUND |
13336 | * \notefnerr |
13337 | * |
13338 | */ |
13339 | CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); |
13340 | |
13341 | /** |
13342 | * \brief Launches a CUDA function |
13343 | * |
13344 | * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ |
13345 | * grid of blocks. Each block contains \p blockDimX x \p blockDimY x |
13346 | * \p blockDimZ threads. |
13347 | * |
13348 | * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
13349 | * available to each thread block. |
13350 | * |
13351 | * Kernel parameters to \p f can be specified in one of two ways: |
13352 | * |
13353 | * 1) Kernel parameters can be specified via \p kernelParams. If \p f |
13354 | * has N parameters, then \p kernelParams needs to be an array of N |
13355 | * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] |
13356 | * must point to a region of memory from which the actual kernel |
13357 | * parameter will be copied. The number of kernel parameters and their |
13358 | * offsets and sizes do not need to be specified as that information is |
13359 | * retrieved directly from the kernel's image. |
13360 | * |
13361 | * 2) Kernel parameters can also be packaged by the application into |
13362 | * a single buffer that is passed in via the \p extra parameter. |
13363 | * This places the burden on the application of knowing each kernel |
13364 | * parameter's size and alignment/padding within the buffer. Here is |
13365 | * an example of using the \p extra parameter in this manner: |
13366 | * \code |
13367 | size_t argBufferSize; |
13368 | char argBuffer[256]; |
13369 | |
13370 | // populate argBuffer and argBufferSize |
13371 | |
13372 | void *config[] = { |
13373 | CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, |
13374 | CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, |
13375 | CU_LAUNCH_PARAM_END |
13376 | }; |
13377 | status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); |
13378 | * \endcode |
13379 | * |
13380 | * The \p extra parameter exists to allow ::cuLaunchKernel to take |
13381 | * additional less commonly used arguments. \p extra specifies a list of |
13382 | * names of extra settings and their corresponding values. Each extra |
13383 | * setting name is immediately followed by the corresponding value. The |
13384 | * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. |
13385 | * |
13386 | * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra |
13387 | * array; |
13388 | * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next |
13389 | * value in \p extra will be a pointer to a buffer containing all |
13390 | * the kernel parameters for launching kernel \p f; |
13391 | * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next |
13392 | * value in \p extra will be a pointer to a size_t containing the |
13393 | * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; |
13394 | * |
13395 | * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel |
13396 | * parameters are specified with both \p kernelParams and \p extra |
13397 | * (i.e. both \p kernelParams and \p extra are non-NULL). |
13398 | * |
13399 | * Calling ::cuLaunchKernel() invalidates the persistent function state |
13400 | * set through the following deprecated APIs: |
13401 | * ::cuFuncSetBlockShape(), |
13402 | * ::cuFuncSetSharedSize(), |
13403 | * ::cuParamSetSize(), |
13404 | * ::cuParamSeti(), |
13405 | * ::cuParamSetf(), |
13406 | * ::cuParamSetv(). |
13407 | * |
13408 | * Note that to use ::cuLaunchKernel(), the kernel \p f must either have |
13409 | * been compiled with toolchain version 3.2 or later so that it will |
13410 | * contain kernel parameter information, or have no kernel parameters. |
13411 | * If neither of these conditions is met, then ::cuLaunchKernel() will |
13412 | * return ::CUDA_ERROR_INVALID_IMAGE. |
13413 | * |
13414 | * \param f - Kernel to launch |
13415 | * \param gridDimX - Width of grid in blocks |
13416 | * \param gridDimY - Height of grid in blocks |
13417 | * \param gridDimZ - Depth of grid in blocks |
13418 | * \param blockDimX - X dimension of each thread block |
13419 | * \param blockDimY - Y dimension of each thread block |
13420 | * \param blockDimZ - Z dimension of each thread block |
13421 | * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes |
13422 | * \param hStream - Stream identifier |
13423 | * \param kernelParams - Array of pointers to kernel parameters |
13424 | * \param extra - Extra options |
13425 | * |
13426 | * \return |
13427 | * ::CUDA_SUCCESS, |
13428 | * ::CUDA_ERROR_DEINITIALIZED, |
13429 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13430 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13431 | * ::CUDA_ERROR_INVALID_HANDLE, |
13432 | * ::CUDA_ERROR_INVALID_IMAGE, |
13433 | * ::CUDA_ERROR_INVALID_VALUE, |
13434 | * ::CUDA_ERROR_LAUNCH_FAILED, |
13435 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
13436 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
13437 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
13438 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
13439 | * \note_null_stream |
13440 | * \notefnerr |
13441 | * |
13442 | * \sa ::cuCtxGetCacheConfig, |
13443 | * ::cuCtxSetCacheConfig, |
13444 | * ::cuFuncSetCacheConfig, |
13445 | * ::cuFuncGetAttribute, |
13446 | * ::cudaLaunchKernel |
13447 | */ |
13448 | CUresult CUDAAPI cuLaunchKernel(CUfunction f, |
13449 | unsigned int gridDimX, |
13450 | unsigned int gridDimY, |
13451 | unsigned int gridDimZ, |
13452 | unsigned int blockDimX, |
13453 | unsigned int blockDimY, |
13454 | unsigned int blockDimZ, |
13455 | unsigned int sharedMemBytes, |
13456 | CUstream hStream, |
13457 | void **kernelParams, |
13458 | void **extra); |
13459 | |
13460 | /** |
13461 | * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute |
13462 | * |
13463 | * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ |
13464 | * grid of blocks. Each block contains \p blockDimX x \p blockDimY x |
13465 | * \p blockDimZ threads. |
13466 | * |
13467 | * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
13468 | * available to each thread block. |
13469 | * |
13470 | * The device on which this kernel is invoked must have a non-zero value for |
13471 | * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. |
13472 | * |
13473 | * The total number of blocks launched cannot exceed the maximum number of blocks per |
13474 | * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or |
13475 | * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors |
13476 | * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. |
13477 | * |
13478 | * The kernel cannot make use of CUDA dynamic parallelism. |
13479 | * |
13480 | * Kernel parameters must be specified via \p kernelParams. If \p f |
13481 | * has N parameters, then \p kernelParams needs to be an array of N |
13482 | * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] |
13483 | * must point to a region of memory from which the actual kernel |
13484 | * parameter will be copied. The number of kernel parameters and their |
13485 | * offsets and sizes do not need to be specified as that information is |
13486 | * retrieved directly from the kernel's image. |
13487 | * |
13488 | * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is |
13489 | * the same as function state set through ::cuLaunchKernel API. |
13490 | * |
13491 | * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous |
13492 | * block shape, shared size and parameter info associated with \p f |
13493 | * is overwritten. |
13494 | * |
13495 | * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have |
13496 | * been compiled with toolchain version 3.2 or later so that it will |
13497 | * contain kernel parameter information, or have no kernel parameters. |
13498 | * If neither of these conditions is met, then ::cuLaunchCooperativeKernel() will |
13499 | * return ::CUDA_ERROR_INVALID_IMAGE. |
13500 | * |
13501 | * \param f - Kernel to launch |
13502 | * \param gridDimX - Width of grid in blocks |
13503 | * \param gridDimY - Height of grid in blocks |
13504 | * \param gridDimZ - Depth of grid in blocks |
13505 | * \param blockDimX - X dimension of each thread block |
13506 | * \param blockDimY - Y dimension of each thread block |
13507 | * \param blockDimZ - Z dimension of each thread block |
13508 | * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes |
13509 | * \param hStream - Stream identifier |
13510 | * \param kernelParams - Array of pointers to kernel parameters |
13511 | * |
13512 | * \return |
13513 | * ::CUDA_SUCCESS, |
13514 | * ::CUDA_ERROR_DEINITIALIZED, |
13515 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13516 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13517 | * ::CUDA_ERROR_INVALID_HANDLE, |
13518 | * ::CUDA_ERROR_INVALID_IMAGE, |
13519 | * ::CUDA_ERROR_INVALID_VALUE, |
13520 | * ::CUDA_ERROR_LAUNCH_FAILED, |
13521 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
13522 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
13523 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
13524 | * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, |
13525 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
13526 | * \note_null_stream |
13527 | * \notefnerr |
13528 | * |
13529 | * \sa ::cuCtxGetCacheConfig, |
13530 | * ::cuCtxSetCacheConfig, |
13531 | * ::cuFuncSetCacheConfig, |
13532 | * ::cuFuncGetAttribute, |
13533 | * ::cuLaunchCooperativeKernelMultiDevice, |
13534 | * ::cudaLaunchCooperativeKernel |
13535 | */ |
13536 | CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, |
13537 | unsigned int gridDimX, |
13538 | unsigned int gridDimY, |
13539 | unsigned int gridDimZ, |
13540 | unsigned int blockDimX, |
13541 | unsigned int blockDimY, |
13542 | unsigned int blockDimZ, |
13543 | unsigned int sharedMemBytes, |
13544 | CUstream hStream, |
13545 | void **kernelParams); |
13546 | |
13547 | /** |
13548 | * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute |
13549 | * |
13550 | * \deprecated This function is deprecated as of CUDA 11.3. |
13551 | * |
13552 | * Invokes kernels as specified in the \p launchParamsList array where each element |
13553 | * of the array specifies all the parameters required to perform a single kernel launch. |
13554 | * These kernels can cooperate and synchronize as they execute. The size of the array is |
13555 | * specified by \p numDevices. |
13556 | * |
13557 | * No two kernels can be launched on the same device. All the devices targeted by this |
13558 | * multi-device launch must be identical. All devices must have a non-zero value for the |
13559 | * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. |
13560 | * |
13561 | * All kernels launched must be identical with respect to the compiled code. Note that |
13562 | * any __device__, __constant__ or __managed__ variables present in the module that owns |
13563 | * the kernel launched on each device, are independently instantiated on every device. |
13564 | * It is the application's responsibility to ensure these variables are initialized and |
13565 | * used appropriately. |
13566 | * |
13567 | * The size of the grids as specified in blocks, the size of the blocks themselves |
13568 | * and the amount of shared memory used by each thread block must also match across |
13569 | * all launched kernels. |
13570 | * |
13571 | * The streams used to launch these kernels must have been created via either ::cuStreamCreate |
13572 | * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD |
13573 | * cannot be used. |
13574 | * |
13575 | * The total number of blocks launched per kernel cannot exceed the maximum number of blocks |
13576 | * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or |
13577 | * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors |
13578 | * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the |
13579 | * total number of blocks launched per device has to match across all devices, the maximum |
13580 | * number of blocks that can be launched per device will be limited by the device with the |
13581 | * least number of multiprocessors. |
13582 | * |
13583 | * The kernels cannot make use of CUDA dynamic parallelism. |
13584 | * |
13585 | * The ::CUDA_LAUNCH_PARAMS structure is defined as: |
13586 | * \code |
13587 | typedef struct CUDA_LAUNCH_PARAMS_st |
13588 | { |
13589 | CUfunction function; |
13590 | unsigned int gridDimX; |
13591 | unsigned int gridDimY; |
13592 | unsigned int gridDimZ; |
13593 | unsigned int blockDimX; |
13594 | unsigned int blockDimY; |
13595 | unsigned int blockDimZ; |
13596 | unsigned int sharedMemBytes; |
13597 | CUstream hStream; |
13598 | void **kernelParams; |
13599 | } CUDA_LAUNCH_PARAMS; |
13600 | * \endcode |
13601 | * where: |
13602 | * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must |
13603 | * be identical with respect to the compiled code. |
13604 | * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across |
13605 | * all kernels launched. |
13606 | * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across |
13607 | * all kernels launched. |
13608 | * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across |
13609 | * all kernels launched. |
13610 | * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across |
13611 | * all kernels launched. |
13612 | * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across |
13613 | * all kernels launched. |
13614 | * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across |
13615 | * all kernels launched. |
13616 | * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. |
13617 | * This must match across all kernels launched. |
13618 | * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot |
13619 | * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated |
13620 | * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. |
13621 | * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If |
13622 | * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams |
13623 | * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through |
13624 | * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual |
13625 | * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes |
13626 | * do not need to be specified as that information is retrieved directly from the kernel's image. |
13627 | * |
13628 | * By default, the kernel won't begin execution on any GPU until all prior work in all the specified |
13629 | * streams has completed. This behavior can be overridden by specifying the flag |
13630 | * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel |
13631 | * will only wait for prior work in the stream corresponding to that GPU to complete before it begins |
13632 | * execution. |
13633 | * |
13634 | * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin |
13635 | * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying |
13636 | * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, |
13637 | * any subsequent work pushed in any of the specified streams will only wait for the kernel launched |
13638 | * on the GPU corresponding to that stream to complete before it begins execution. |
13639 | * |
13640 | * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is |
13641 | * the same as function state set through ::cuLaunchKernel API when called individually for each |
13642 | * element in \p launchParamsList. |
13643 | * |
13644 | * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous |
13645 | * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function |
13646 | * in \p launchParamsList is overwritten. |
13647 | * |
13648 | * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have |
13649 | * been compiled with toolchain version 3.2 or later so that they will |
13650 | * contain kernel parameter information, or have no kernel parameters. |
13651 | * If neither of these conditions is met, then ::cuLaunchCooperativeKernelMultiDevice() will |
13652 | * return ::CUDA_ERROR_INVALID_IMAGE. |
13653 | * |
13654 | * \param launchParamsList - List of launch parameters, one per device |
13655 | * \param numDevices - Size of the \p launchParamsList array |
13656 | * \param flags - Flags to control launch behavior |
13657 | * |
13658 | * \return |
13659 | * ::CUDA_SUCCESS, |
13660 | * ::CUDA_ERROR_DEINITIALIZED, |
13661 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13662 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13663 | * ::CUDA_ERROR_INVALID_HANDLE, |
13664 | * ::CUDA_ERROR_INVALID_IMAGE, |
13665 | * ::CUDA_ERROR_INVALID_VALUE, |
13666 | * ::CUDA_ERROR_LAUNCH_FAILED, |
13667 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
13668 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
13669 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
13670 | * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, |
13671 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
13672 | * \note_null_stream |
13673 | * \notefnerr |
13674 | * |
13675 | * \sa ::cuCtxGetCacheConfig, |
13676 | * ::cuCtxSetCacheConfig, |
13677 | * ::cuFuncSetCacheConfig, |
13678 | * ::cuFuncGetAttribute, |
13679 | * ::cuLaunchCooperativeKernel, |
13680 | * ::cudaLaunchCooperativeKernelMultiDevice |
13681 | */ |
13682 | __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); |
13683 | |
13684 | /** |
13685 | * \brief Enqueues a host function call in a stream |
13686 | * |
13687 | * Enqueues a host function to run in a stream. The function will be called |
13688 | * after currently enqueued work and will block work added after it. |
13689 | * |
13690 | * The host function must not make any CUDA API calls. Attempting to use a |
13691 | * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. |
13692 | * The host function must not perform any synchronization that may depend on |
13693 | * outstanding CUDA work not mandated to run earlier. Host functions without a |
13694 | * mandated order (such as in independent streams) execute in undefined order |
13695 | * and may be serialized. |
13696 | * |
13697 | * For the purposes of Unified Memory, execution makes a number of guarantees: |
13698 | * <ul> |
13699 | * <li>The stream is considered idle for the duration of the function's |
13700 | * execution. Thus, for example, the function may always use memory attached |
13701 | * to the stream it was enqueued in.</li> |
13702 | * <li>The start of execution of the function has the same effect as |
13703 | * synchronizing an event recorded in the same stream immediately prior to |
13704 | * the function. It thus synchronizes streams which have been "joined" |
13705 | * prior to the function.</li> |
13706 | * <li>Adding device work to any stream does not have the effect of making |
13707 | * the stream active until all preceding host functions and stream callbacks |
13708 | * have executed. Thus, for |
13709 | * example, a function might use global attached memory even if work has |
13710 | * been added to another stream, if the work has been ordered behind the |
13711 | * function call with an event.</li> |
13712 | * <li>Completion of the function does not cause a stream to become |
13713 | * active except as described above. The stream will remain idle |
13714 | * if no device work follows the function, and will remain idle across |
13715 | * consecutive host functions or stream callbacks without device work in |
13716 | * between. Thus, for example, |
13717 | * stream synchronization can be done by signaling from a host function at the |
13718 | * end of the stream.</li> |
13719 | * </ul> |
13720 | * |
13721 | * Note that, in contrast to ::cuStreamAddCallback, the function will not be |
13722 | * called in the event of an error in the CUDA context. |
13723 | * |
13724 | * \param hStream - Stream to enqueue function call in |
13725 | * \param fn - The function to call once preceding stream operations are complete |
13726 | * \param userData - User-specified data to be passed to the function |
13727 | * |
13728 | * \return |
13729 | * ::CUDA_SUCCESS, |
13730 | * ::CUDA_ERROR_DEINITIALIZED, |
13731 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13732 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13733 | * ::CUDA_ERROR_INVALID_HANDLE, |
13734 | * ::CUDA_ERROR_NOT_SUPPORTED |
13735 | * \note_null_stream |
13736 | * \notefnerr |
13737 | * |
13738 | * \sa ::cuStreamCreate, |
13739 | * ::cuStreamQuery, |
13740 | * ::cuStreamSynchronize, |
13741 | * ::cuStreamWaitEvent, |
13742 | * ::cuStreamDestroy, |
13743 | * ::cuMemAllocManaged, |
13744 | * ::cuStreamAttachMemAsync, |
13745 | * ::cuStreamAddCallback |
13746 | */ |
13747 | CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); |
13748 | |
13749 | /** @} */ /* END CUDA_EXEC */ |
13750 | |
13751 | /** |
13752 | * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] |
13753 | * |
13754 | * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA |
13755 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
13756 | * |
13757 | * This section describes the deprecated execution control functions of the |
13758 | * low-level CUDA driver application programming interface. |
13759 | * |
13760 | * @{ |
13761 | */ |
13762 | |
13763 | /** |
13764 | * \brief Sets the block-dimensions for the function |
13765 | * |
13766 | * \deprecated |
13767 | * |
13768 | * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are |
13769 | * created when the kernel given by \p hfunc is launched. |
13770 | * |
13771 | * \param hfunc - Kernel to specify dimensions of |
13772 | * \param x - X dimension |
13773 | * \param y - Y dimension |
13774 | * \param z - Z dimension |
13775 | * |
13776 | * \return |
13777 | * ::CUDA_SUCCESS, |
13778 | * ::CUDA_ERROR_DEINITIALIZED, |
13779 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13780 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13781 | * ::CUDA_ERROR_INVALID_HANDLE, |
13782 | * ::CUDA_ERROR_INVALID_VALUE |
13783 | * \notefnerr |
13784 | * |
13785 | * \sa ::cuFuncSetSharedSize, |
13786 | * ::cuFuncSetCacheConfig, |
13787 | * ::cuFuncGetAttribute, |
13788 | * ::cuParamSetSize, |
13789 | * ::cuParamSeti, |
13790 | * ::cuParamSetf, |
13791 | * ::cuParamSetv, |
13792 | * ::cuLaunch, |
13793 | * ::cuLaunchGrid, |
13794 | * ::cuLaunchGridAsync, |
13795 | * ::cuLaunchKernel |
13796 | */ |
13797 | __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); |
13798 | |
13799 | /** |
13800 | * \brief Sets the dynamic shared-memory size for the function |
13801 | * |
13802 | * \deprecated |
13803 | * |
13804 | * Sets through \p bytes the amount of dynamic shared memory that will be |
13805 | * available to each thread block when the kernel given by \p hfunc is launched. |
13806 | * |
13807 | * \param hfunc - Kernel to specify dynamic shared-memory size for |
13808 | * \param bytes - Dynamic shared-memory size per thread block in bytes |
13809 | * |
13810 | * \return |
13811 | * ::CUDA_SUCCESS, |
13812 | * ::CUDA_ERROR_DEINITIALIZED, |
13813 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13814 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13815 | * ::CUDA_ERROR_INVALID_HANDLE, |
13816 | * ::CUDA_ERROR_INVALID_VALUE |
13817 | * \notefnerr |
13818 | * |
13819 | * \sa ::cuFuncSetBlockShape, |
13820 | * ::cuFuncSetCacheConfig, |
13821 | * ::cuFuncGetAttribute, |
13822 | * ::cuParamSetSize, |
13823 | * ::cuParamSeti, |
13824 | * ::cuParamSetf, |
13825 | * ::cuParamSetv, |
13826 | * ::cuLaunch, |
13827 | * ::cuLaunchGrid, |
13828 | * ::cuLaunchGridAsync, |
13829 | * ::cuLaunchKernel |
13830 | */ |
13831 | __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); |
13832 | |
13833 | /** |
13834 | * \brief Sets the parameter size for the function |
13835 | * |
13836 | * \deprecated |
13837 | * |
13838 | * Sets through \p numbytes the total size in bytes needed by the function |
13839 | * parameters of the kernel corresponding to \p hfunc. |
13840 | * |
13841 | * \param hfunc - Kernel to set parameter size for |
13842 | * \param numbytes - Size of parameter list in bytes |
13843 | * |
13844 | * \return |
13845 | * ::CUDA_SUCCESS, |
13846 | * ::CUDA_ERROR_DEINITIALIZED, |
13847 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13848 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13849 | * ::CUDA_ERROR_INVALID_VALUE |
13850 | * \notefnerr |
13851 | * |
13852 | * \sa ::cuFuncSetBlockShape, |
13853 | * ::cuFuncSetSharedSize, |
13854 | * ::cuFuncGetAttribute, |
13855 | * ::cuParamSetf, |
13856 | * ::cuParamSeti, |
13857 | * ::cuParamSetv, |
13858 | * ::cuLaunch, |
13859 | * ::cuLaunchGrid, |
13860 | * ::cuLaunchGridAsync, |
13861 | * ::cuLaunchKernel |
13862 | */ |
13863 | __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); |
13864 | |
13865 | /** |
13866 | * \brief Adds an integer parameter to the function's argument list |
13867 | * |
13868 | * \deprecated |
13869 | * |
13870 | * Sets an integer parameter that will be specified the next time the |
13871 | * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. |
13872 | * |
13873 | * \param hfunc - Kernel to add parameter to |
13874 | * \param offset - Offset to add parameter to argument list |
13875 | * \param value - Value of parameter |
13876 | * |
13877 | * \return |
13878 | * ::CUDA_SUCCESS, |
13879 | * ::CUDA_ERROR_DEINITIALIZED, |
13880 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13881 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13882 | * ::CUDA_ERROR_INVALID_VALUE |
13883 | * \notefnerr |
13884 | * |
13885 | * \sa ::cuFuncSetBlockShape, |
13886 | * ::cuFuncSetSharedSize, |
13887 | * ::cuFuncGetAttribute, |
13888 | * ::cuParamSetSize, |
13889 | * ::cuParamSetf, |
13890 | * ::cuParamSetv, |
13891 | * ::cuLaunch, |
13892 | * ::cuLaunchGrid, |
13893 | * ::cuLaunchGridAsync, |
13894 | * ::cuLaunchKernel |
13895 | */ |
13896 | __CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); |
13897 | |
13898 | /** |
13899 | * \brief Adds a floating-point parameter to the function's argument list |
13900 | * |
13901 | * \deprecated |
13902 | * |
13903 | * Sets a floating-point parameter that will be specified the next time the |
13904 | * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. |
13905 | * |
13906 | * \param hfunc - Kernel to add parameter to |
13907 | * \param offset - Offset to add parameter to argument list |
13908 | * \param value - Value of parameter |
13909 | * |
13910 | * \return |
13911 | * ::CUDA_SUCCESS, |
13912 | * ::CUDA_ERROR_DEINITIALIZED, |
13913 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13914 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13915 | * ::CUDA_ERROR_INVALID_VALUE |
13916 | * \notefnerr |
13917 | * |
13918 | * \sa ::cuFuncSetBlockShape, |
13919 | * ::cuFuncSetSharedSize, |
13920 | * ::cuFuncGetAttribute, |
13921 | * ::cuParamSetSize, |
13922 | * ::cuParamSeti, |
13923 | * ::cuParamSetv, |
13924 | * ::cuLaunch, |
13925 | * ::cuLaunchGrid, |
13926 | * ::cuLaunchGridAsync, |
13927 | * ::cuLaunchKernel |
13928 | */ |
13929 | __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); |
13930 | |
13931 | /** |
13932 | * \brief Adds arbitrary data to the function's argument list |
13933 | * |
13934 | * \deprecated |
13935 | * |
13936 | * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr |
13937 | * into the parameter space of the kernel corresponding to \p hfunc. \p offset |
13938 | * is a byte offset. |
13939 | * |
13940 | * \param hfunc - Kernel to add data to |
13941 | * \param offset - Offset to add data to argument list |
13942 | * \param ptr - Pointer to arbitrary data |
13943 | * \param numbytes - Size of data to copy in bytes |
13944 | * |
13945 | * \return |
13946 | * ::CUDA_SUCCESS, |
13947 | * ::CUDA_ERROR_DEINITIALIZED, |
13948 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13949 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13950 | * ::CUDA_ERROR_INVALID_VALUE |
13951 | * \notefnerr |
13952 | * |
13953 | * \sa ::cuFuncSetBlockShape, |
13954 | * ::cuFuncSetSharedSize, |
13955 | * ::cuFuncGetAttribute, |
13956 | * ::cuParamSetSize, |
13957 | * ::cuParamSetf, |
13958 | * ::cuParamSeti, |
13959 | * ::cuLaunch, |
13960 | * ::cuLaunchGrid, |
13961 | * ::cuLaunchGridAsync, |
13962 | * ::cuLaunchKernel |
13963 | */ |
13964 | __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); |
13965 | |
13966 | /** |
13967 | * \brief Launches a CUDA function |
13968 | * |
13969 | * \deprecated |
13970 | * |
13971 | * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block |
13972 | * contains the number of threads specified by a previous call to |
13973 | * ::cuFuncSetBlockShape(). |
13974 | * |
13975 | * The block shape, dynamic shared memory size, and parameter information |
13976 | * must be set using |
13977 | * ::cuFuncSetBlockShape(), |
13978 | * ::cuFuncSetSharedSize(), |
13979 | * ::cuParamSetSize(), |
13980 | * ::cuParamSeti(), |
13981 | * ::cuParamSetf(), and |
13982 | * ::cuParamSetv() |
13983 | * prior to calling this function. |
13984 | * |
13985 | * Launching a function via ::cuLaunchKernel() invalidates the function's |
13986 | * block shape, dynamic shared memory size, and parameter information. After |
13987 | * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
13988 | * calling this function. Failure to do so results in undefined behavior. |
13989 | * |
13990 | * \param f - Kernel to launch |
13991 | * |
13992 | * \return |
13993 | * ::CUDA_SUCCESS, |
13994 | * ::CUDA_ERROR_DEINITIALIZED, |
13995 | * ::CUDA_ERROR_NOT_INITIALIZED, |
13996 | * ::CUDA_ERROR_INVALID_CONTEXT, |
13997 | * ::CUDA_ERROR_INVALID_VALUE, |
13998 | * ::CUDA_ERROR_LAUNCH_FAILED, |
13999 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
14000 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
14001 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
14002 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
14003 | * \notefnerr |
14004 | * |
14005 | * \sa ::cuFuncSetBlockShape, |
14006 | * ::cuFuncSetSharedSize, |
14007 | * ::cuFuncGetAttribute, |
14008 | * ::cuParamSetSize, |
14009 | * ::cuParamSetf, |
14010 | * ::cuParamSeti, |
14011 | * ::cuParamSetv, |
14012 | * ::cuLaunchGrid, |
14013 | * ::cuLaunchGridAsync, |
14014 | * ::cuLaunchKernel |
14015 | */ |
14016 | __CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); |
14017 | |
14018 | /** |
14019 | * \brief Launches a CUDA function |
14020 | * |
14021 | * \deprecated |
14022 | * |
14023 | * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of |
14024 | * blocks. Each block contains the number of threads specified by a previous |
14025 | * call to ::cuFuncSetBlockShape(). |
14026 | * |
14027 | * The block shape, dynamic shared memory size, and parameter information |
14028 | * must be set using |
14029 | * ::cuFuncSetBlockShape(), |
14030 | * ::cuFuncSetSharedSize(), |
14031 | * ::cuParamSetSize(), |
14032 | * ::cuParamSeti(), |
14033 | * ::cuParamSetf(), and |
14034 | * ::cuParamSetv() |
14035 | * prior to calling this function. |
14036 | * |
14037 | * Launching a function via ::cuLaunchKernel() invalidates the function's |
14038 | * block shape, dynamic shared memory size, and parameter information. After |
14039 | * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
14040 | * calling this function. Failure to do so results in undefined behavior. |
14041 | * |
14042 | * \param f - Kernel to launch |
14043 | * \param grid_width - Width of grid in blocks |
14044 | * \param grid_height - Height of grid in blocks |
14045 | * |
14046 | * \return |
14047 | * ::CUDA_SUCCESS, |
14048 | * ::CUDA_ERROR_DEINITIALIZED, |
14049 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14050 | * ::CUDA_ERROR_INVALID_CONTEXT, |
14051 | * ::CUDA_ERROR_INVALID_VALUE, |
14052 | * ::CUDA_ERROR_LAUNCH_FAILED, |
14053 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
14054 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
14055 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
14056 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
14057 | * \notefnerr |
14058 | * |
14059 | * \sa ::cuFuncSetBlockShape, |
14060 | * ::cuFuncSetSharedSize, |
14061 | * ::cuFuncGetAttribute, |
14062 | * ::cuParamSetSize, |
14063 | * ::cuParamSetf, |
14064 | * ::cuParamSeti, |
14065 | * ::cuParamSetv, |
14066 | * ::cuLaunch, |
14067 | * ::cuLaunchGridAsync, |
14068 | * ::cuLaunchKernel |
14069 | */ |
14070 | __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); |
14071 | |
14072 | /** |
14073 | * \brief Launches a CUDA function |
14074 | * |
14075 | * \deprecated |
14076 | * |
14077 | * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of |
14078 | * blocks. Each block contains the number of threads specified by a previous |
14079 | * call to ::cuFuncSetBlockShape(). |
14080 | * |
14081 | * The block shape, dynamic shared memory size, and parameter information |
14082 | * must be set using |
14083 | * ::cuFuncSetBlockShape(), |
14084 | * ::cuFuncSetSharedSize(), |
14085 | * ::cuParamSetSize(), |
14086 | * ::cuParamSeti(), |
14087 | * ::cuParamSetf(), and |
14088 | * ::cuParamSetv() |
14089 | * prior to calling this function. |
14090 | * |
14091 | * Launching a function via ::cuLaunchKernel() invalidates the function's |
14092 | * block shape, dynamic shared memory size, and parameter information. After |
14093 | * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
14094 | * calling this function. Failure to do so results in undefined behavior. |
14095 | * |
14096 | * \param f - Kernel to launch |
14097 | * \param grid_width - Width of grid in blocks |
14098 | * \param grid_height - Height of grid in blocks |
14099 | * \param hStream - Stream identifier |
14100 | * |
14101 | * \return |
14102 | * ::CUDA_SUCCESS, |
14103 | * ::CUDA_ERROR_DEINITIALIZED, |
14104 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14105 | * ::CUDA_ERROR_INVALID_CONTEXT, |
14106 | * ::CUDA_ERROR_INVALID_HANDLE, |
14107 | * ::CUDA_ERROR_INVALID_VALUE, |
14108 | * ::CUDA_ERROR_LAUNCH_FAILED, |
14109 | * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
14110 | * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
14111 | * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
14112 | * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
14113 | * |
14114 | * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), |
14115 | * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by |
14116 | * growing the per-thread stack as needed per launch and not shrinking it afterwards. |
14117 | * |
14118 | * \note_null_stream |
14119 | * \notefnerr |
14120 | * |
14121 | * \sa ::cuFuncSetBlockShape, |
14122 | * ::cuFuncSetSharedSize, |
14123 | * ::cuFuncGetAttribute, |
14124 | * ::cuParamSetSize, |
14125 | * ::cuParamSetf, |
14126 | * ::cuParamSeti, |
14127 | * ::cuParamSetv, |
14128 | * ::cuLaunch, |
14129 | * ::cuLaunchGrid, |
14130 | * ::cuLaunchKernel |
14131 | */ |
14132 | __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); |
14133 | |
14134 | |
14135 | /** |
14136 | * \brief Adds a texture-reference to the function's argument list |
14137 | * |
14138 | * \deprecated |
14139 | * |
14140 | * Makes the CUDA array or linear memory bound to the texture reference |
14141 | * \p hTexRef available to a device program as a texture. In this version of |
14142 | * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and |
14143 | * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. |
14144 | * |
14145 | * \param hfunc - Kernel to add texture-reference to |
14146 | * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) |
14147 | * \param hTexRef - Texture-reference to add to argument list |
14148 | * |
14149 | * \return |
14150 | * ::CUDA_SUCCESS, |
14151 | * ::CUDA_ERROR_DEINITIALIZED, |
14152 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14153 | * ::CUDA_ERROR_INVALID_CONTEXT, |
14154 | * ::CUDA_ERROR_INVALID_VALUE |
14155 | * \notefnerr |
14156 | */ |
14157 | __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); |
14158 | /** @} */ /* END CUDA_EXEC_DEPRECATED */ |
14159 | |
14160 | /** |
14161 | * \defgroup CUDA_GRAPH Graph Management |
14162 | * |
14163 | * ___MANBRIEF___ graph management functions of the low-level CUDA driver API |
14164 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
14165 | * |
14166 | * This section describes the graph management functions of the low-level CUDA |
14167 | * driver application programming interface. |
14168 | * |
14169 | * @{ |
14170 | */ |
14171 | |
14172 | /** |
14173 | * \brief Creates a graph |
14174 | * |
14175 | * Creates an empty graph, which is returned via \p phGraph. |
14176 | * |
14177 | * \param phGraph - Returns newly created graph |
14178 | * \param flags - Graph creation flags, must be 0 |
14179 | * |
14180 | * \return |
14181 | * ::CUDA_SUCCESS, |
14182 | * ::CUDA_ERROR_DEINITIALIZED, |
14183 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14184 | * ::CUDA_ERROR_INVALID_VALUE, |
14185 | * ::CUDA_ERROR_OUT_OF_MEMORY |
14186 | * \note_graph_thread_safety |
14187 | * \notefnerr |
14188 | * |
14189 | * \sa |
14190 | * ::cuGraphAddChildGraphNode, |
14191 | * ::cuGraphAddEmptyNode, |
14192 | * ::cuGraphAddKernelNode, |
14193 | * ::cuGraphAddHostNode, |
14194 | * ::cuGraphAddMemcpyNode, |
14195 | * ::cuGraphAddMemsetNode, |
14196 | * ::cuGraphInstantiate, |
14197 | * ::cuGraphDestroy, |
14198 | * ::cuGraphGetNodes, |
14199 | * ::cuGraphGetRootNodes, |
14200 | * ::cuGraphGetEdges, |
14201 | * ::cuGraphClone |
14202 | */ |
14203 | CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); |
14204 | |
14205 | /** |
14206 | * \brief Creates a kernel execution node and adds it to a graph |
14207 | * |
14208 | * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies |
14209 | * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
14210 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14211 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14212 | * A handle to the new node will be returned in \p phGraphNode. |
14213 | * |
14214 | * The CUDA_KERNEL_NODE_PARAMS structure is defined as: |
14215 | * |
14216 | * \code |
14217 | * typedef struct CUDA_KERNEL_NODE_PARAMS_st { |
14218 | * CUfunction func; |
14219 | * unsigned int gridDimX; |
14220 | * unsigned int gridDimY; |
14221 | * unsigned int gridDimZ; |
14222 | * unsigned int blockDimX; |
14223 | * unsigned int blockDimY; |
14224 | * unsigned int blockDimZ; |
14225 | * unsigned int sharedMemBytes; |
14226 | * void **kernelParams; |
14227 | * void **extra; |
14228 | * } CUDA_KERNEL_NODE_PARAMS; |
14229 | * \endcode |
14230 | * |
14231 | * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x |
14232 | * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains |
14233 | * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. |
14234 | * |
14235 | * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
14236 | * available to each thread block. |
14237 | * |
14238 | * Kernel parameters to \p func can be specified in one of two ways: |
14239 | * |
14240 | * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N |
14241 | * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, |
14242 | * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual |
14243 | * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need |
14244 | * to be specified as that information is retrieved directly from the kernel's image. |
14245 | * |
14246 | * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single |
14247 | * buffer that is passed in via \p extra. This places the burden on the application of knowing each |
14248 | * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists |
14249 | * to allow this function to take additional less commonly used arguments. \p extra specifies |
14250 | * a list of names of extra settings and their corresponding values. Each extra setting name is |
14251 | * immediately followed by the corresponding value. The list must be terminated with either NULL or |
14252 | * CU_LAUNCH_PARAM_END. |
14253 | * |
14254 | * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra |
14255 | * array; |
14256 | * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next |
14257 | * value in \p extra will be a pointer to a buffer |
14258 | * containing all the kernel parameters for launching kernel |
14259 | * \p func; |
14260 | * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next |
14261 | * value in \p extra will be a pointer to a size_t |
14262 | * containing the size of the buffer specified with |
14263 | * ::CU_LAUNCH_PARAM_BUFFER_POINTER; |
14264 | * |
14265 | * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both |
14266 | * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). |
14267 | * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. |
14268 | * |
14269 | * The \p kernelParams or \p extra array, as well as the argument values it points to, |
14270 | * are copied during this call. |
14271 | * |
14272 | * \note Kernels launched using graphs must not use texture and surface references. Reading or |
14273 | * writing through any texture or surface reference is undefined behavior. |
14274 | * This restriction does not apply to texture and surface objects. |
14275 | * |
14276 | * \param phGraphNode - Returns newly created node |
14277 | * \param hGraph - Graph to which to add the node |
14278 | * \param dependencies - Dependencies of the node |
14279 | * \param numDependencies - Number of dependencies |
14280 | * \param nodeParams - Parameters for the GPU execution node |
14281 | * |
14282 | * \return |
14283 | * ::CUDA_SUCCESS, |
14284 | * ::CUDA_ERROR_DEINITIALIZED, |
14285 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14286 | * ::CUDA_ERROR_INVALID_VALUE |
14287 | * \note_graph_thread_safety |
14288 | * \notefnerr |
14289 | * |
14290 | * \sa |
14291 | * ::cuLaunchKernel, |
14292 | * ::cuLaunchCooperativeKernel, |
14293 | * ::cuGraphKernelNodeGetParams, |
14294 | * ::cuGraphKernelNodeSetParams, |
14295 | * ::cuGraphCreate, |
14296 | * ::cuGraphDestroyNode, |
14297 | * ::cuGraphAddChildGraphNode, |
14298 | * ::cuGraphAddEmptyNode, |
14299 | * ::cuGraphAddHostNode, |
14300 | * ::cuGraphAddMemcpyNode, |
14301 | * ::cuGraphAddMemsetNode |
14302 | */ |
14303 | CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
14304 | |
14305 | /** |
14306 | * \brief Returns a kernel node's parameters |
14307 | * |
14308 | * Returns the parameters of kernel node \p hNode in \p nodeParams. |
14309 | * The \p kernelParams or \p extra array returned in \p nodeParams, |
14310 | * as well as the argument values it points to, are owned by the node. |
14311 | * This memory remains valid until the node is destroyed or its |
14312 | * parameters are modified, and should not be modified |
14313 | * directly. Use ::cuGraphKernelNodeSetParams to update the |
14314 | * parameters of this node. |
14315 | * |
14316 | * The params will contain either \p kernelParams or \p extra, |
14317 | * according to which of these was most recently set on the node. |
14318 | * |
14319 | * \param hNode - Node to get the parameters for |
14320 | * \param nodeParams - Pointer to return the parameters |
14321 | * |
14322 | * \return |
14323 | * ::CUDA_SUCCESS, |
14324 | * ::CUDA_ERROR_DEINITIALIZED, |
14325 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14326 | * ::CUDA_ERROR_INVALID_VALUE |
14327 | * \note_graph_thread_safety |
14328 | * \notefnerr |
14329 | * |
14330 | * \sa |
14331 | * ::cuLaunchKernel, |
14332 | * ::cuGraphAddKernelNode, |
14333 | * ::cuGraphKernelNodeSetParams |
14334 | */ |
14335 | CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); |
14336 | |
14337 | /** |
14338 | * \brief Sets a kernel node's parameters |
14339 | * |
14340 | * Sets the parameters of kernel node \p hNode to \p nodeParams. |
14341 | * |
14342 | * \param hNode - Node to set the parameters for |
14343 | * \param nodeParams - Parameters to copy |
14344 | * |
14345 | * \return |
14346 | * ::CUDA_SUCCESS, |
14347 | * ::CUDA_ERROR_INVALID_VALUE, |
14348 | * ::CUDA_ERROR_INVALID_HANDLE, |
14349 | * ::CUDA_ERROR_OUT_OF_MEMORY |
14350 | * \note_graph_thread_safety |
14351 | * \notefnerr |
14352 | * |
14353 | * \sa |
14354 | * ::cuLaunchKernel, |
14355 | * ::cuGraphAddKernelNode, |
14356 | * ::cuGraphKernelNodeGetParams |
14357 | */ |
14358 | CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
14359 | |
14360 | /** |
14361 | * \brief Creates a memcpy node and adds it to a graph |
14362 | * |
14363 | * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies |
14364 | * dependencies specified via \p dependencies. |
14365 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14366 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14367 | * A handle to the new node will be returned in \p phGraphNode. |
14368 | * |
14369 | * When the graph is launched, the node will perform the memcpy described by \p copyParams. |
14370 | * See ::cuMemcpy3D() for a description of the structure and its restrictions. |
14371 | * |
14372 | * Memcpy nodes have some additional restrictions with regard to managed memory, if the |
14373 | * system contains at least one device which has a zero value for the device attribute |
14374 | * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer |
14375 | * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed |
14376 | * for those operand(s). The managed memory will be treated as residing on either the |
14377 | * host or the device, depending on which memory type is specified. |
14378 | * |
14379 | * \param phGraphNode - Returns newly created node |
14380 | * \param hGraph - Graph to which to add the node |
14381 | * \param dependencies - Dependencies of the node |
14382 | * \param numDependencies - Number of dependencies |
14383 | * \param copyParams - Parameters for the memory copy |
14384 | * \param ctx - Context on which to run the node |
14385 | * |
14386 | * \return |
14387 | * ::CUDA_SUCCESS, |
14388 | * ::CUDA_ERROR_DEINITIALIZED, |
14389 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14390 | * ::CUDA_ERROR_INVALID_VALUE |
14391 | * \note_graph_thread_safety |
14392 | * \notefnerr |
14393 | * |
14394 | * \sa |
14395 | * ::cuMemcpy3D, |
14396 | * ::cuGraphMemcpyNodeGetParams, |
14397 | * ::cuGraphMemcpyNodeSetParams, |
14398 | * ::cuGraphCreate, |
14399 | * ::cuGraphDestroyNode, |
14400 | * ::cuGraphAddChildGraphNode, |
14401 | * ::cuGraphAddEmptyNode, |
14402 | * ::cuGraphAddKernelNode, |
14403 | * ::cuGraphAddHostNode, |
14404 | * ::cuGraphAddMemsetNode |
14405 | */ |
14406 | CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); |
14407 | |
14408 | /** |
14409 | * \brief Returns a memcpy node's parameters |
14410 | * |
14411 | * Returns the parameters of memcpy node \p hNode in \p nodeParams. |
14412 | * |
14413 | * \param hNode - Node to get the parameters for |
14414 | * \param nodeParams - Pointer to return the parameters |
14415 | * |
14416 | * \return |
14417 | * ::CUDA_SUCCESS, |
14418 | * ::CUDA_ERROR_DEINITIALIZED, |
14419 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14420 | * ::CUDA_ERROR_INVALID_VALUE |
14421 | * \note_graph_thread_safety |
14422 | * \notefnerr |
14423 | * |
14424 | * \sa |
14425 | * ::cuMemcpy3D, |
14426 | * ::cuGraphAddMemcpyNode, |
14427 | * ::cuGraphMemcpyNodeSetParams |
14428 | */ |
14429 | CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); |
14430 | |
14431 | /** |
14432 | * \brief Sets a memcpy node's parameters |
14433 | * |
14434 | * Sets the parameters of memcpy node \p hNode to \p nodeParams. |
14435 | * |
14436 | * \param hNode - Node to set the parameters for |
14437 | * \param nodeParams - Parameters to copy |
14438 | * |
14439 | * \return |
14440 | * ::CUDA_SUCCESS, |
14441 | * ::CUDA_ERROR_DEINITIALIZED, |
14442 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14443 | * ::CUDA_ERROR_INVALID_VALUE |
14444 | * \note_graph_thread_safety |
14445 | * \notefnerr |
14446 | * |
14447 | * \sa |
14448 | * ::cuMemcpy3D, |
14449 | * ::cuGraphAddMemcpyNode, |
14450 | * ::cuGraphMemcpyNodeGetParams |
14451 | */ |
14452 | CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); |
14453 | |
14454 | /** |
14455 | * \brief Creates a memset node and adds it to a graph |
14456 | * |
14457 | * Creates a new memset node and adds it to \p hGraph with \p numDependencies |
14458 | * dependencies specified via \p dependencies. |
14459 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14460 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14461 | * A handle to the new node will be returned in \p phGraphNode. |
14462 | * |
14463 | * The element size must be 1, 2, or 4 bytes. |
14464 | * When the graph is launched, the node will perform the memset described by \p memsetParams. |
14465 | * |
14466 | * \param phGraphNode - Returns newly created node |
14467 | * \param hGraph - Graph to which to add the node |
14468 | * \param dependencies - Dependencies of the node |
14469 | * \param numDependencies - Number of dependencies |
14470 | * \param memsetParams - Parameters for the memory set |
14471 | * \param ctx - Context on which to run the node |
14472 | * |
14473 | * \return |
14474 | * ::CUDA_SUCCESS, |
14475 | * ::CUDA_ERROR_DEINITIALIZED, |
14476 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14477 | * ::CUDA_ERROR_INVALID_VALUE, |
14478 | * ::CUDA_ERROR_INVALID_CONTEXT |
14479 | * \note_graph_thread_safety |
14480 | * \notefnerr |
14481 | * |
14482 | * \sa |
14483 | * ::cuMemsetD2D32, |
14484 | * ::cuGraphMemsetNodeGetParams, |
14485 | * ::cuGraphMemsetNodeSetParams, |
14486 | * ::cuGraphCreate, |
14487 | * ::cuGraphDestroyNode, |
14488 | * ::cuGraphAddChildGraphNode, |
14489 | * ::cuGraphAddEmptyNode, |
14490 | * ::cuGraphAddKernelNode, |
14491 | * ::cuGraphAddHostNode, |
14492 | * ::cuGraphAddMemcpyNode |
14493 | */ |
14494 | CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); |
14495 | |
14496 | /** |
14497 | * \brief Returns a memset node's parameters |
14498 | * |
14499 | * Returns the parameters of memset node \p hNode in \p nodeParams. |
14500 | * |
14501 | * \param hNode - Node to get the parameters for |
14502 | * \param nodeParams - Pointer to return the parameters |
14503 | * |
14504 | * \return |
14505 | * ::CUDA_SUCCESS, |
14506 | * ::CUDA_ERROR_DEINITIALIZED, |
14507 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14508 | * ::CUDA_ERROR_INVALID_VALUE |
14509 | * \note_graph_thread_safety |
14510 | * \notefnerr |
14511 | * |
14512 | * \sa |
14513 | * ::cuMemsetD2D32, |
14514 | * ::cuGraphAddMemsetNode, |
14515 | * ::cuGraphMemsetNodeSetParams |
14516 | */ |
14517 | CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); |
14518 | |
14519 | /** |
14520 | * \brief Sets a memset node's parameters |
14521 | * |
14522 | * Sets the parameters of memset node \p hNode to \p nodeParams. |
14523 | * |
14524 | * \param hNode - Node to set the parameters for |
14525 | * \param nodeParams - Parameters to copy |
14526 | * |
14527 | * \return |
14528 | * ::CUDA_SUCCESS, |
14529 | * ::CUDA_ERROR_DEINITIALIZED, |
14530 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14531 | * ::CUDA_ERROR_INVALID_VALUE |
14532 | * \note_graph_thread_safety |
14533 | * \notefnerr |
14534 | * |
14535 | * \sa |
14536 | * ::cuMemsetD2D32, |
14537 | * ::cuGraphAddMemsetNode, |
14538 | * ::cuGraphMemsetNodeGetParams |
14539 | */ |
14540 | CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); |
14541 | |
14542 | /** |
14543 | * \brief Creates a host execution node and adds it to a graph |
14544 | * |
14545 | * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies |
14546 | * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
14547 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14548 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14549 | * A handle to the new node will be returned in \p phGraphNode. |
14550 | * |
14551 | * When the graph is launched, the node will invoke the specified CPU function. |
14552 | * Host nodes are not supported under MPS with pre-Volta GPUs. |
14553 | * |
14554 | * \param phGraphNode - Returns newly created node |
14555 | * \param hGraph - Graph to which to add the node |
14556 | * \param dependencies - Dependencies of the node |
14557 | * \param numDependencies - Number of dependencies |
14558 | * \param nodeParams - Parameters for the host node |
14559 | * |
14560 | * \return |
14561 | * ::CUDA_SUCCESS, |
14562 | * ::CUDA_ERROR_DEINITIALIZED, |
14563 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14564 | * ::CUDA_ERROR_NOT_SUPPORTED, |
14565 | * ::CUDA_ERROR_INVALID_VALUE |
14566 | * \note_graph_thread_safety |
14567 | * \notefnerr |
14568 | * |
14569 | * \sa |
14570 | * ::cuLaunchHostFunc, |
14571 | * ::cuGraphHostNodeGetParams, |
14572 | * ::cuGraphHostNodeSetParams, |
14573 | * ::cuGraphCreate, |
14574 | * ::cuGraphDestroyNode, |
14575 | * ::cuGraphAddChildGraphNode, |
14576 | * ::cuGraphAddEmptyNode, |
14577 | * ::cuGraphAddKernelNode, |
14578 | * ::cuGraphAddMemcpyNode, |
14579 | * ::cuGraphAddMemsetNode |
14580 | */ |
14581 | CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); |
14582 | |
14583 | /** |
14584 | * \brief Returns a host node's parameters |
14585 | * |
14586 | * Returns the parameters of host node \p hNode in \p nodeParams. |
14587 | * |
14588 | * \param hNode - Node to get the parameters for |
14589 | * \param nodeParams - Pointer to return the parameters |
14590 | * |
14591 | * \return |
14592 | * ::CUDA_SUCCESS, |
14593 | * ::CUDA_ERROR_DEINITIALIZED, |
14594 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14595 | * ::CUDA_ERROR_INVALID_VALUE |
14596 | * \note_graph_thread_safety |
14597 | * \notefnerr |
14598 | * |
14599 | * \sa |
14600 | * ::cuLaunchHostFunc, |
14601 | * ::cuGraphAddHostNode, |
14602 | * ::cuGraphHostNodeSetParams |
14603 | */ |
14604 | CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); |
14605 | |
14606 | /** |
14607 | * \brief Sets a host node's parameters |
14608 | * |
14609 | * Sets the parameters of host node \p hNode to \p nodeParams. |
14610 | * |
14611 | * \param hNode - Node to set the parameters for |
14612 | * \param nodeParams - Parameters to copy |
14613 | * |
14614 | * \return |
14615 | * ::CUDA_SUCCESS, |
14616 | * ::CUDA_ERROR_DEINITIALIZED, |
14617 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14618 | * ::CUDA_ERROR_INVALID_VALUE |
14619 | * \note_graph_thread_safety |
14620 | * \notefnerr |
14621 | * |
14622 | * \sa |
14623 | * ::cuLaunchHostFunc, |
14624 | * ::cuGraphAddHostNode, |
14625 | * ::cuGraphHostNodeGetParams |
14626 | */ |
14627 | CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); |
14628 | |
14629 | /** |
14630 | * \brief Creates a child graph node and adds it to a graph |
14631 | * |
14632 | * Creates a new node which executes an embedded graph, and adds it to \p hGraph with |
14633 | * \p numDependencies dependencies specified via \p dependencies. |
14634 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14635 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14636 | * A handle to the new node will be returned in \p phGraphNode. |
14637 | * |
14638 | * If \p childGraph contains allocation or free nodes, this call will return an error. |
14639 | * |
14640 | * The node executes an embedded child graph. The child graph is cloned in this call. |
14641 | * |
14642 | * \param phGraphNode - Returns newly created node |
14643 | * \param hGraph - Graph to which to add the node |
14644 | * \param dependencies - Dependencies of the node |
14645 | * \param numDependencies - Number of dependencies |
14646 | * \param childGraph - The graph to clone into this node |
14647 | * |
14648 | * \return |
14649 | * ::CUDA_SUCCESS, |
14650 | * ::CUDA_ERROR_DEINITIALIZED, |
14651 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14652 | * ::CUDA_ERROR_INVALID_VALUE |
14653 | * \note_graph_thread_safety |
14654 | * \notefnerr |
14655 | * |
14656 | * \sa |
14657 | * ::cuGraphChildGraphNodeGetGraph, |
14658 | * ::cuGraphCreate, |
14659 | * ::cuGraphDestroyNode, |
14660 | * ::cuGraphAddEmptyNode, |
14661 | * ::cuGraphAddKernelNode, |
14662 | * ::cuGraphAddHostNode, |
14663 | * ::cuGraphAddMemcpyNode, |
14664 | * ::cuGraphAddMemsetNode, |
14665 | * ::cuGraphClone |
14666 | */ |
14667 | CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); |
14668 | |
14669 | /** |
14670 | * \brief Gets a handle to the embedded graph of a child graph node |
14671 | * |
14672 | * Gets a handle to the embedded graph in a child graph node. This call |
14673 | * does not clone the graph. Changes to the graph will be reflected in |
14674 | * the node, and the node retains ownership of the graph. |
14675 | * |
14676 | * Allocation and free nodes cannot be added to the returned graph. |
14677 | * Attempting to do so will return an error. |
14678 | * |
14679 | * \param hNode - Node to get the embedded graph for |
14680 | * \param phGraph - Location to store a handle to the graph |
14681 | * |
14682 | * \return |
14683 | * ::CUDA_SUCCESS, |
14684 | * ::CUDA_ERROR_DEINITIALIZED, |
14685 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14686 | * ::CUDA_ERROR_INVALID_VALUE |
14687 | * \note_graph_thread_safety |
14688 | * \notefnerr |
14689 | * |
14690 | * \sa |
14691 | * ::cuGraphAddChildGraphNode, |
14692 | * ::cuGraphNodeFindInClone |
14693 | */ |
14694 | CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); |
14695 | |
14696 | /** |
14697 | * \brief Creates an empty node and adds it to a graph |
14698 | * |
14699 | * Creates a new node which performs no operation, and adds it to \p hGraph with |
14700 | * \p numDependencies dependencies specified via \p dependencies. |
14701 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14702 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14703 | * A handle to the new node will be returned in \p phGraphNode. |
14704 | * |
14705 | * An empty node performs no operation during execution, but can be used for |
14706 | * transitive ordering. For example, a phased execution graph with 2 groups of n |
14707 | * nodes with a barrier between them can be represented using an empty node and |
14708 | * 2*n dependency edges, rather than no empty node and n^2 dependency edges. |
14709 | * |
14710 | * \param phGraphNode - Returns newly created node |
14711 | * \param hGraph - Graph to which to add the node |
14712 | * \param dependencies - Dependencies of the node |
14713 | * \param numDependencies - Number of dependencies |
14714 | * |
14715 | * \return |
14716 | * ::CUDA_SUCCESS, |
14717 | * ::CUDA_ERROR_DEINITIALIZED, |
14718 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14719 | * ::CUDA_ERROR_INVALID_VALUE |
14720 | * \note_graph_thread_safety |
14721 | * \notefnerr |
14722 | * |
14723 | * \sa |
14724 | * ::cuGraphCreate, |
14725 | * ::cuGraphDestroyNode, |
14726 | * ::cuGraphAddChildGraphNode, |
14727 | * ::cuGraphAddKernelNode, |
14728 | * ::cuGraphAddHostNode, |
14729 | * ::cuGraphAddMemcpyNode, |
14730 | * ::cuGraphAddMemsetNode |
14731 | */ |
14732 | CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); |
14733 | |
14734 | /** |
14735 | * \brief Creates an event record node and adds it to a graph |
14736 | * |
14737 | * Creates a new event record node and adds it to \p hGraph with \p numDependencies |
14738 | * dependencies specified via \p dependencies and event specified in \p event. |
14739 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14740 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14741 | * A handle to the new node will be returned in \p phGraphNode. |
14742 | * |
14743 | * Each launch of the graph will record \p event to capture execution of the |
14744 | * node's dependencies. |
14745 | * |
14746 | * \param phGraphNode - Returns newly created node |
14747 | * \param hGraph - Graph to which to add the node |
14748 | * \param dependencies - Dependencies of the node |
14749 | * \param numDependencies - Number of dependencies |
14750 | * \param event - Event for the node |
14751 | * |
14752 | * \return |
14753 | * ::CUDA_SUCCESS, |
14754 | * ::CUDA_ERROR_DEINITIALIZED, |
14755 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14756 | * ::CUDA_ERROR_NOT_SUPPORTED, |
14757 | * ::CUDA_ERROR_INVALID_VALUE |
14758 | * \note_graph_thread_safety |
14759 | * \notefnerr |
14760 | * |
14761 | * \sa |
14762 | * ::cuGraphAddEventWaitNode, |
14763 | * ::cuEventRecordWithFlags, |
14764 | * ::cuStreamWaitEvent, |
14765 | * ::cuGraphCreate, |
14766 | * ::cuGraphDestroyNode, |
14767 | * ::cuGraphAddChildGraphNode, |
14768 | * ::cuGraphAddEmptyNode, |
14769 | * ::cuGraphAddKernelNode, |
14770 | * ::cuGraphAddMemcpyNode, |
14771 | * ::cuGraphAddMemsetNode |
14772 | */ |
14773 | CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); |
14774 | |
14775 | /** |
14776 | * \brief Returns the event associated with an event record node |
14777 | * |
14778 | * Returns the event of event record node \p hNode in \p event_out. |
14779 | * |
14780 | * \param hNode - Node to get the event for |
14781 | * \param event_out - Pointer to return the event |
14782 | * |
14783 | * \return |
14784 | * ::CUDA_SUCCESS, |
14785 | * ::CUDA_ERROR_DEINITIALIZED, |
14786 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14787 | * ::CUDA_ERROR_INVALID_VALUE |
14788 | * \note_graph_thread_safety |
14789 | * \notefnerr |
14790 | * |
14791 | * \sa |
14792 | * ::cuGraphAddEventRecordNode, |
14793 | * ::cuGraphEventRecordNodeSetEvent, |
14794 | * ::cuGraphEventWaitNodeGetEvent, |
14795 | * ::cuEventRecordWithFlags, |
14796 | * ::cuStreamWaitEvent |
14797 | */ |
14798 | CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); |
14799 | |
14800 | /** |
14801 | * \brief Sets an event record node's event |
14802 | * |
14803 | * Sets the event of event record node \p hNode to \p event. |
14804 | * |
14805 | * \param hNode - Node to set the event for |
14806 | * \param event - Event to use |
14807 | * |
14808 | * \return |
14809 | * ::CUDA_SUCCESS, |
14810 | * ::CUDA_ERROR_INVALID_VALUE, |
14811 | * ::CUDA_ERROR_INVALID_HANDLE, |
14812 | * ::CUDA_ERROR_OUT_OF_MEMORY |
14813 | * \note_graph_thread_safety |
14814 | * \notefnerr |
14815 | * |
14816 | * \sa |
14817 | * ::cuGraphAddEventRecordNode, |
14818 | * ::cuGraphEventRecordNodeGetEvent, |
14819 | * ::cuGraphEventWaitNodeSetEvent, |
14820 | * ::cuEventRecordWithFlags, |
14821 | * ::cuStreamWaitEvent |
14822 | */ |
14823 | CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); |
14824 | |
14825 | /** |
14826 | * \brief Creates an event wait node and adds it to a graph |
14827 | * |
14828 | * Creates a new event wait node and adds it to \p hGraph with \p numDependencies |
14829 | * dependencies specified via \p dependencies and event specified in \p event. |
14830 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
14831 | * at the root of the graph. \p dependencies may not have any duplicate entries. |
14832 | * A handle to the new node will be returned in \p phGraphNode. |
14833 | * |
14834 | * The graph node will wait for all work captured in \p event. See ::cuEventRecord() |
14835 | * for details on what is captured by an event. \p event may be from a different context |
14836 | * or device than the launch stream. |
14837 | * |
14838 | * \param phGraphNode - Returns newly created node |
14839 | * \param hGraph - Graph to which to add the node |
14840 | * \param dependencies - Dependencies of the node |
14841 | * \param numDependencies - Number of dependencies |
14842 | * \param event - Event for the node |
14843 | * |
14844 | * \return |
14845 | * ::CUDA_SUCCESS, |
14846 | * ::CUDA_ERROR_DEINITIALIZED, |
14847 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14848 | * ::CUDA_ERROR_NOT_SUPPORTED, |
14849 | * ::CUDA_ERROR_INVALID_VALUE |
14850 | * \note_graph_thread_safety |
14851 | * \notefnerr |
14852 | * |
14853 | * \sa |
14854 | * ::cuGraphAddEventRecordNode, |
14855 | * ::cuEventRecordWithFlags, |
14856 | * ::cuStreamWaitEvent, |
14857 | * ::cuGraphCreate, |
14858 | * ::cuGraphDestroyNode, |
14859 | * ::cuGraphAddChildGraphNode, |
14860 | * ::cuGraphAddEmptyNode, |
14861 | * ::cuGraphAddKernelNode, |
14862 | * ::cuGraphAddMemcpyNode, |
14863 | * ::cuGraphAddMemsetNode |
14864 | */ |
14865 | CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); |
14866 | |
14867 | /** |
14868 | * \brief Returns the event associated with an event wait node |
14869 | * |
14870 | * Returns the event of event wait node \p hNode in \p event_out. |
14871 | * |
14872 | * \param hNode - Node to get the event for |
14873 | * \param event_out - Pointer to return the event |
14874 | * |
14875 | * \return |
14876 | * ::CUDA_SUCCESS, |
14877 | * ::CUDA_ERROR_DEINITIALIZED, |
14878 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14879 | * ::CUDA_ERROR_INVALID_VALUE |
14880 | * \note_graph_thread_safety |
14881 | * \notefnerr |
14882 | * |
14883 | * \sa |
14884 | * ::cuGraphAddEventWaitNode, |
14885 | * ::cuGraphEventWaitNodeSetEvent, |
14886 | * ::cuGraphEventRecordNodeGetEvent, |
14887 | * ::cuEventRecordWithFlags, |
14888 | * ::cuStreamWaitEvent |
14889 | */ |
14890 | CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); |
14891 | |
14892 | /** |
14893 | * \brief Sets an event wait node's event |
14894 | * |
14895 | * Sets the event of event wait node \p hNode to \p event. |
14896 | * |
14897 | * \param hNode - Node to set the event for |
14898 | * \param event - Event to use |
14899 | * |
14900 | * \return |
14901 | * ::CUDA_SUCCESS, |
14902 | * ::CUDA_ERROR_INVALID_VALUE, |
14903 | * ::CUDA_ERROR_INVALID_HANDLE, |
14904 | * ::CUDA_ERROR_OUT_OF_MEMORY |
14905 | * \note_graph_thread_safety |
14906 | * \notefnerr |
14907 | * |
14908 | * \sa |
14909 | * ::cuGraphAddEventWaitNode, |
14910 | * ::cuGraphEventWaitNodeGetEvent, |
14911 | * ::cuGraphEventRecordNodeSetEvent, |
14912 | * ::cuEventRecordWithFlags, |
14913 | * ::cuStreamWaitEvent |
14914 | */ |
14915 | CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); |
14916 | |
14917 | /** |
14918 | * \brief Creates an external semaphore signal node and adds it to a graph |
14919 | * |
14920 | * Creates a new external semaphore signal node and adds it to \p hGraph with \p |
14921 | * numDependencies dependencies specified via \p dependencies and arguments specified |
14922 | * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the |
14923 | * node will be placed at the root of the graph. \p dependencies may not have any |
14924 | * duplicate entries. A handle to the new node will be returned in \p phGraphNode. |
14925 | * |
14926 | * Performs a signal operation on a set of externally allocated semaphore objects |
14927 | * when the node is launched. The operation(s) will occur after all of the node's |
14928 | * dependencies have completed. |
14929 | * |
14930 | * \param phGraphNode - Returns newly created node |
14931 | * \param hGraph - Graph to which to add the node |
14932 | * \param dependencies - Dependencies of the node |
14933 | * \param numDependencies - Number of dependencies |
14934 | * \param nodeParams - Parameters for the node |
14935 | * |
14936 | * \return |
14937 | * ::CUDA_SUCCESS, |
14938 | * ::CUDA_ERROR_DEINITIALIZED, |
14939 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14940 | * ::CUDA_ERROR_NOT_SUPPORTED, |
14941 | * ::CUDA_ERROR_INVALID_VALUE |
14942 | * \note_graph_thread_safety |
14943 | * \notefnerr |
14944 | * |
14945 | * \sa |
14946 | * ::cuGraphExternalSemaphoresSignalNodeGetParams, |
14947 | * ::cuGraphExternalSemaphoresSignalNodeSetParams, |
14948 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
14949 | * ::cuGraphAddExternalSemaphoresWaitNode, |
14950 | * ::cuImportExternalSemaphore, |
14951 | * ::cuSignalExternalSemaphoresAsync, |
14952 | * ::cuWaitExternalSemaphoresAsync, |
14953 | * ::cuGraphCreate, |
14954 | * ::cuGraphDestroyNode, |
14955 | * ::cuGraphAddEventRecordNode, |
14956 | * ::cuGraphAddEventWaitNode, |
14957 | * ::cuGraphAddChildGraphNode, |
14958 | * ::cuGraphAddEmptyNode, |
14959 | * ::cuGraphAddKernelNode, |
14960 | * ::cuGraphAddMemcpyNode, |
14961 | * ::cuGraphAddMemsetNode |
14962 | */ |
14963 | CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); |
14964 | |
14965 | /** |
14966 | * \brief Returns an external semaphore signal node's parameters |
14967 | * |
14968 | * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. |
14969 | * The \p extSemArray and \p paramsArray returned in \p params_out, |
14970 | * are owned by the node. This memory remains valid until the node is destroyed or its |
14971 | * parameters are modified, and should not be modified |
14972 | * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the |
14973 | * parameters of this node. |
14974 | * |
14975 | * \param hNode - Node to get the parameters for |
14976 | * \param params_out - Pointer to return the parameters |
14977 | * |
14978 | * \return |
14979 | * ::CUDA_SUCCESS, |
14980 | * ::CUDA_ERROR_DEINITIALIZED, |
14981 | * ::CUDA_ERROR_NOT_INITIALIZED, |
14982 | * ::CUDA_ERROR_INVALID_VALUE |
14983 | * \note_graph_thread_safety |
14984 | * \notefnerr |
14985 | * |
14986 | * \sa |
14987 | * ::cuLaunchKernel, |
14988 | * ::cuGraphAddExternalSemaphoresSignalNode, |
14989 | * ::cuGraphExternalSemaphoresSignalNodeSetParams, |
14990 | * ::cuGraphAddExternalSemaphoresWaitNode, |
14991 | * ::cuSignalExternalSemaphoresAsync, |
14992 | * ::cuWaitExternalSemaphoresAsync |
14993 | */ |
14994 | CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); |
14995 | |
14996 | /** |
14997 | * \brief Sets an external semaphore signal node's parameters |
14998 | * |
14999 | * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. |
15000 | * |
15001 | * \param hNode - Node to set the parameters for |
15002 | * \param nodeParams - Parameters to copy |
15003 | * |
15004 | * \return |
15005 | * ::CUDA_SUCCESS, |
15006 | * ::CUDA_ERROR_INVALID_VALUE, |
15007 | * ::CUDA_ERROR_INVALID_HANDLE, |
15008 | * ::CUDA_ERROR_OUT_OF_MEMORY |
15009 | * \note_graph_thread_safety |
15010 | * \notefnerr |
15011 | * |
15012 | * \sa |
15013 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15014 | * ::cuGraphExternalSemaphoresSignalNodeGetParams, |
15015 | * ::cuGraphAddExternalSemaphoresWaitNode, |
15016 | * ::cuSignalExternalSemaphoresAsync, |
15017 | * ::cuWaitExternalSemaphoresAsync |
15018 | */ |
15019 | CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); |
15020 | |
15021 | /** |
15022 | * \brief Creates an external semaphore wait node and adds it to a graph |
15023 | * |
15024 | * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies |
15025 | * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
15026 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
15027 | * at the root of the graph. \p dependencies may not have any duplicate entries. A handle |
15028 | * to the new node will be returned in \p phGraphNode. |
15029 | * |
15030 | * Performs a wait operation on a set of externally allocated semaphore objects |
15031 | * when the node is launched. The node's dependents will not be launched until |
15032 | * the wait operation has completed. |
15033 | * |
15034 | * \param phGraphNode - Returns newly created node |
15035 | * \param hGraph - Graph to which to add the node |
15036 | * \param dependencies - Dependencies of the node |
15037 | * \param numDependencies - Number of dependencies |
15038 | * \param nodeParams - Parameters for the node |
15039 | * |
15040 | * \return |
15041 | * ::CUDA_SUCCESS, |
15042 | * ::CUDA_ERROR_DEINITIALIZED, |
15043 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15044 | * ::CUDA_ERROR_NOT_SUPPORTED, |
15045 | * ::CUDA_ERROR_INVALID_VALUE |
15046 | * \note_graph_thread_safety |
15047 | * \notefnerr |
15048 | * |
15049 | * \sa |
15050 | * ::cuGraphExternalSemaphoresWaitNodeGetParams, |
15051 | * ::cuGraphExternalSemaphoresWaitNodeSetParams, |
15052 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15053 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15054 | * ::cuImportExternalSemaphore, |
15055 | * ::cuSignalExternalSemaphoresAsync, |
15056 | * ::cuWaitExternalSemaphoresAsync, |
15057 | * ::cuGraphCreate, |
15058 | * ::cuGraphDestroyNode, |
15059 | * ::cuGraphAddEventRecordNode, |
15060 | * ::cuGraphAddEventWaitNode, |
15061 | * ::cuGraphAddChildGraphNode, |
15062 | * ::cuGraphAddEmptyNode, |
15063 | * ::cuGraphAddKernelNode, |
15064 | * ::cuGraphAddMemcpyNode, |
15065 | * ::cuGraphAddMemsetNode |
15066 | */ |
15067 | CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); |
15068 | |
15069 | /** |
15070 | * \brief Returns an external semaphore wait node's parameters |
15071 | * |
15072 | * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. |
15073 | * The \p extSemArray and \p paramsArray returned in \p params_out, |
15074 | * are owned by the node. This memory remains valid until the node is destroyed or its |
15075 | * parameters are modified, and should not be modified |
15076 | * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the |
15077 | * parameters of this node. |
15078 | * |
15079 | * \param hNode - Node to get the parameters for |
15080 | * \param params_out - Pointer to return the parameters |
15081 | * |
15082 | * \return |
15083 | * ::CUDA_SUCCESS, |
15084 | * ::CUDA_ERROR_DEINITIALIZED, |
15085 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15086 | * ::CUDA_ERROR_INVALID_VALUE |
15087 | * \note_graph_thread_safety |
15088 | * \notefnerr |
15089 | * |
15090 | * \sa |
15091 | * ::cuLaunchKernel, |
15092 | * ::cuGraphAddExternalSemaphoresWaitNode, |
15093 | * ::cuGraphExternalSemaphoresWaitNodeSetParams, |
15094 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15095 | * ::cuSignalExternalSemaphoresAsync, |
15096 | * ::cuWaitExternalSemaphoresAsync |
15097 | */ |
15098 | CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); |
15099 | |
15100 | /** |
15101 | * \brief Sets an external semaphore wait node's parameters |
15102 | * |
15103 | * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. |
15104 | * |
15105 | * \param hNode - Node to set the parameters for |
15106 | * \param nodeParams - Parameters to copy |
15107 | * |
15108 | * \return |
15109 | * ::CUDA_SUCCESS, |
15110 | * ::CUDA_ERROR_INVALID_VALUE, |
15111 | * ::CUDA_ERROR_INVALID_HANDLE, |
15112 | * ::CUDA_ERROR_OUT_OF_MEMORY |
15113 | * \note_graph_thread_safety |
15114 | * \notefnerr |
15115 | * |
15116 | * \sa |
15117 | * ::cuGraphAddExternalSemaphoresWaitNode, |
15118 | * ::cuGraphExternalSemaphoresWaitNodeGetParams, |
15119 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15120 | * ::cuSignalExternalSemaphoresAsync, |
15121 | * ::cuWaitExternalSemaphoresAsync |
15122 | */ |
15123 | CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); |
15124 | |
15125 | /** |
15126 | * \brief Creates an allocation node and adds it to a graph |
15127 | * |
15128 | * Creates a new allocation node and adds it to \p hGraph with \p numDependencies |
15129 | * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
15130 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
15131 | * at the root of the graph. \p dependencies may not have any duplicate entries. A handle |
15132 | * to the new node will be returned in \p phGraphNode. |
15133 | * |
15134 | * \param phGraphNode - Returns newly created node |
15135 | * \param hGraph - Graph to which to add the node |
15136 | * \param dependencies - Dependencies of the node |
15137 | * \param numDependencies - Number of dependencies |
15138 | * \param nodeParams - Parameters for the node |
15139 | * |
15140 | * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in |
15141 | * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. |
15142 | * |
15143 | * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, |
15144 | * the allocation can be accessed by nodes ordered after the allocation node but before the free node. |
15145 | * These allocations cannot be freed outside the owning graph, and they can only be freed once in the |
15146 | * owning graph. |
15147 | * |
15148 | * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the |
15149 | * graph which are ordered after the allocation node, but also by stream operations ordered after the |
15150 | * graph's execution but before the allocation is freed. |
15151 | * |
15152 | * Allocations which are not freed in the same graph can be freed by: |
15153 | * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; |
15154 | * - launching a graph with a free node for that allocation; or |
15155 | * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes |
15156 | * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. |
15157 | * |
15158 | * It is not possible to free an allocation in both the owning graph and another graph. If the allocation |
15159 | * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed |
15160 | * in another graph, a free node can no longer be added to the owning graph. |
15161 | * |
15162 | * The following restrictions apply to graphs which contain allocation and/or memory free nodes: |
15163 | * - Nodes and edges of the graph cannot be deleted. |
15164 | * - The graph cannot be used in a child node. |
15165 | * - Only one instantiation of the graph may exist at any point in time. |
15166 | * - The graph cannot be cloned. |
15167 | * |
15168 | * \return |
15169 | * ::CUDA_SUCCESS, |
15170 | * ::CUDA_ERROR_DEINITIALIZED, |
15171 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15172 | * ::CUDA_ERROR_NOT_SUPPORTED, |
15173 | * ::CUDA_ERROR_INVALID_VALUE |
15174 | * \note_graph_thread_safety |
15175 | * \notefnerr |
15176 | * |
15177 | * \sa |
15178 | * ::cuGraphAddMemFreeNode, |
15179 | * ::cuGraphMemAllocNodeGetParams, |
15180 | * ::cuDeviceGraphMemTrim, |
15181 | * ::cuDeviceGetGraphMemAttribute, |
15182 | * ::cuDeviceSetGraphMemAttribute, |
15183 | * ::cuMemAllocAsync, |
15184 | * ::cuMemFreeAsync, |
15185 | * ::cuGraphCreate, |
15186 | * ::cuGraphDestroyNode, |
15187 | * ::cuGraphAddChildGraphNode, |
15188 | * ::cuGraphAddEmptyNode, |
15189 | * ::cuGraphAddEventRecordNode, |
15190 | * ::cuGraphAddEventWaitNode, |
15191 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15192 | * ::cuGraphAddExternalSemaphoresWaitNode, |
15193 | * ::cuGraphAddKernelNode, |
15194 | * ::cuGraphAddMemcpyNode, |
15195 | * ::cuGraphAddMemsetNode |
15196 | */ |
15197 | CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); |
15198 | |
15199 | /** |
15200 | * \brief Returns a memory alloc node's parameters |
15201 | * |
15202 | * Returns the parameters of a memory alloc node \p hNode in \p params_out. |
15203 | * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the |
15204 | * node. This memory remains valid until the node is destroyed. The returned |
15205 | * parameters must not be modified. |
15206 | * |
15207 | * \param hNode - Node to get the parameters for |
15208 | * \param params_out - Pointer to return the parameters |
15209 | * |
15210 | * \return |
15211 | * ::CUDA_SUCCESS, |
15212 | * ::CUDA_ERROR_DEINITIALIZED, |
15213 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15214 | * ::CUDA_ERROR_INVALID_VALUE |
15215 | * \note_graph_thread_safety |
15216 | * \notefnerr |
15217 | * |
15218 | * \sa |
15219 | * ::cuGraphAddMemAllocNode, |
15220 | * ::cuGraphMemFreeNodeGetParams |
15221 | */ |
15222 | CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); |
15223 | |
15224 | /** |
15225 | * \brief Creates a memory free node and adds it to a graph |
15226 | * |
15227 | * Creates a new memory free node and adds it to \p hGraph with \p numDependencies |
15228 | * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
15229 | * It is possible for \p numDependencies to be 0, in which case the node will be placed |
15230 | * at the root of the graph. \p dependencies may not have any duplicate entries. A handle |
15231 | * to the new node will be returned in \p phGraphNode. |
15232 | * |
15233 | * \param phGraphNode - Returns newly created node |
15234 | * \param hGraph - Graph to which to add the node |
15235 | * \param dependencies - Dependencies of the node |
15236 | * \param numDependencies - Number of dependencies |
15237 | * \param dptr - Address of memory to free |
15238 | * |
15239 | * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: |
15240 | * - an allocation twice in the same graph. |
15241 | * - an address that was not returned by an allocation node. |
15242 | * - an invalid address. |
15243 | * |
15244 | * The following restrictions apply to graphs which contain allocation and/or memory free nodes: |
15245 | * - Nodes and edges of the graph cannot be deleted. |
15246 | * - The graph cannot be used in a child node. |
15247 | * - Only one instantiation of the graph may exist at any point in time. |
15248 | * - The graph cannot be cloned. |
15249 | * |
15250 | * \return |
15251 | * ::CUDA_SUCCESS, |
15252 | * ::CUDA_ERROR_DEINITIALIZED, |
15253 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15254 | * ::CUDA_ERROR_NOT_SUPPORTED, |
15255 | * ::CUDA_ERROR_INVALID_VALUE |
15256 | * \note_graph_thread_safety |
15257 | * \notefnerr |
15258 | * |
15259 | * \sa |
15260 | * ::cuGraphAddMemAllocNode, |
15261 | * ::cuGraphMemFreeNodeGetParams, |
15262 | * ::cuDeviceGraphMemTrim, |
15263 | * ::cuDeviceGetGraphMemAttribute, |
15264 | * ::cuDeviceSetGraphMemAttribute, |
15265 | * ::cuMemAllocAsync, |
15266 | * ::cuMemFreeAsync, |
15267 | * ::cuGraphCreate, |
15268 | * ::cuGraphDestroyNode, |
15269 | * ::cuGraphAddChildGraphNode, |
15270 | * ::cuGraphAddEmptyNode, |
15271 | * ::cuGraphAddEventRecordNode, |
15272 | * ::cuGraphAddEventWaitNode, |
15273 | * ::cuGraphAddExternalSemaphoresSignalNode, |
15274 | * ::cuGraphAddExternalSemaphoresWaitNode, |
15275 | * ::cuGraphAddKernelNode, |
15276 | * ::cuGraphAddMemcpyNode, |
15277 | * ::cuGraphAddMemsetNode |
15278 | */ |
15279 | CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); |
15280 | |
15281 | /** |
15282 | * \brief Returns a memory free node's parameters |
15283 | * |
15284 | * Returns the address of a memory free node \p hNode in \p dptr_out. |
15285 | * |
15286 | * \param hNode - Node to get the parameters for |
15287 | * \param dptr_out - Pointer to return the device address |
15288 | * |
15289 | * \return |
15290 | * ::CUDA_SUCCESS, |
15291 | * ::CUDA_ERROR_DEINITIALIZED, |
15292 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15293 | * ::CUDA_ERROR_INVALID_VALUE |
15294 | * \note_graph_thread_safety |
15295 | * \notefnerr |
15296 | * |
15297 | * \sa |
15298 | * ::cuGraphAddMemFreeNode, |
15299 | * ::cuGraphMemAllocNodeGetParams |
15300 | */ |
15301 | CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); |
15302 | |
15303 | /** |
15304 | * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. |
15305 | * |
15306 | * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are |
15307 | * freed back to the operating system. |
15308 | * |
15309 | * \param device - The device for which cached memory should be freed. |
15310 | * |
15311 | * \return |
15312 | * ::CUDA_SUCCESS, |
15313 | * ::CUDA_ERROR_INVALID_DEVICE |
15314 | * |
15315 | * \sa |
15316 | * ::cuGraphAddMemAllocNode, |
15317 | * ::cuGraphAddMemFreeNode |
15318 | */ |
15319 | CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); |
15320 | |
15321 | /** |
15322 | * \brief Query asynchronous allocation attributes related to graphs |
15323 | * |
15324 | * Valid attributes are: |
15325 | * |
15326 | * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs |
15327 | * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the |
15328 | * last time it was reset. High watermark can only be reset to zero. |
15329 | * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by |
15330 | * the CUDA graphs asynchronous allocator. |
15331 | * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by |
15332 | * the CUDA graphs asynchronous allocator. |
15333 | * |
15334 | * \param device - Specifies the scope of the query |
15335 | * \param attr - attribute to get |
15336 | * \param value - retrieved value |
15337 | * |
15338 | * \return |
15339 | * ::CUDA_SUCCESS, |
15340 | * ::CUDA_ERROR_INVALID_DEVICE |
15341 | * |
15342 | * \sa |
15343 | * ::cuGraphAddMemAllocNode, |
15344 | * ::cuGraphAddMemFreeNode |
15345 | */ |
15346 | CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); |
15347 | |
15348 | /** |
15349 | * \brief Set asynchronous allocation attributes related to graphs |
15350 | * |
15351 | * Valid attributes are: |
15352 | * |
15353 | * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the |
15354 | * last time it was reset. High watermark can only be reset to zero. |
15355 | * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by |
15356 | * the CUDA graphs asynchronous allocator. |
15357 | * |
15358 | * \param device - Specifies the scope of the query |
15359 | * \param attr - attribute to set |
15360 | * \param value - pointer to value to set |
15361 | * |
15362 | * \return |
15363 | * ::CUDA_SUCCESS, |
15364 | * ::CUDA_ERROR_INVALID_DEVICE |
15365 | * |
15366 | * \sa |
15367 | * ::cuGraphAddMemAllocNode, |
15368 | * ::cuGraphAddMemFreeNode |
15369 | */ |
15370 | CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); |
15371 | |
15372 | /** |
15373 | * \brief Clones a graph |
15374 | * |
15375 | * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. |
15376 | * All parameters are copied into the cloned graph. The original graph may be modified |
15377 | * after this call without affecting the clone. |
15378 | * |
15379 | * Child graph nodes in the original graph are recursively copied into the clone. |
15380 | * |
15381 | * \param phGraphClone - Returns newly created cloned graph |
15382 | * \param originalGraph - Graph to clone |
15383 | * |
15384 | * \return |
15385 | * ::CUDA_SUCCESS, |
15386 | * ::CUDA_ERROR_INVALID_VALUE, |
15387 | * ::CUDA_ERROR_OUT_OF_MEMORY |
15388 | * \note_graph_thread_safety |
15389 | * \notefnerr |
15390 | * |
15391 | * \sa |
15392 | * ::cuGraphCreate, |
15393 | * ::cuGraphNodeFindInClone |
15394 | */ |
15395 | CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); |
15396 | |
15397 | /** |
15398 | * \brief Finds a cloned version of a node |
15399 | * |
15400 | * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode |
15401 | * in the original graph. |
15402 | * |
15403 | * \p hClonedGraph must have been cloned from the original graph via ::cuGraphClone. |
15404 | * \p hOriginalNode must have been in the original graph at the time of the call to |
15405 | * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have |
15406 | * been removed. The cloned node is then returned via \p phNode. |
15407 | * |
15408 | * \param phNode - Returns handle to the cloned node |
15409 | * \param hOriginalNode - Handle to the original node |
15410 | * \param hClonedGraph - Cloned graph to query |
15411 | * |
15412 | * \return |
15413 | * ::CUDA_SUCCESS, |
15414 | * ::CUDA_ERROR_INVALID_VALUE |
15415 | * \note_graph_thread_safety |
15416 | * \notefnerr |
15417 | * |
15418 | * \sa |
15419 | * ::cuGraphClone |
15420 | */ |
15421 | CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); |
15422 | |
15423 | /** |
15424 | * \brief Returns a node's type |
15425 | * |
15426 | * Returns the node type of \p hNode in \p type. |
15427 | * |
15428 | * \param hNode - Node to query |
15429 | * \param type - Pointer to return the node type |
15430 | * |
15431 | * \return |
15432 | * ::CUDA_SUCCESS, |
15433 | * ::CUDA_ERROR_DEINITIALIZED, |
15434 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15435 | * ::CUDA_ERROR_INVALID_VALUE |
15436 | * \note_graph_thread_safety |
15437 | * \notefnerr |
15438 | * |
15439 | * \sa |
15440 | * ::cuGraphGetNodes, |
15441 | * ::cuGraphGetRootNodes, |
15442 | * ::cuGraphChildGraphNodeGetGraph, |
15443 | * ::cuGraphKernelNodeGetParams, |
15444 | * ::cuGraphKernelNodeSetParams, |
15445 | * ::cuGraphHostNodeGetParams, |
15446 | * ::cuGraphHostNodeSetParams, |
15447 | * ::cuGraphMemcpyNodeGetParams, |
15448 | * ::cuGraphMemcpyNodeSetParams, |
15449 | * ::cuGraphMemsetNodeGetParams, |
15450 | * ::cuGraphMemsetNodeSetParams |
15451 | */ |
15452 | CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); |
15453 | |
15454 | /** |
15455 | * \brief Returns a graph's nodes |
15456 | * |
15457 | * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this |
15458 | * function will return the number of nodes in \p numNodes. Otherwise, |
15459 | * \p numNodes entries will be filled in. If \p numNodes is higher than the actual |
15460 | * number of nodes, the remaining entries in \p nodes will be set to NULL, and the |
15461 | * number of nodes actually obtained will be returned in \p numNodes. |
15462 | * |
15463 | * \param hGraph - Graph to query |
15464 | * \param nodes - Pointer to return the nodes |
15465 | * \param numNodes - See description |
15466 | * |
15467 | * \return |
15468 | * ::CUDA_SUCCESS, |
15469 | * ::CUDA_ERROR_DEINITIALIZED, |
15470 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15471 | * ::CUDA_ERROR_INVALID_VALUE |
15472 | * \note_graph_thread_safety |
15473 | * \notefnerr |
15474 | * |
15475 | * \sa |
15476 | * ::cuGraphCreate, |
15477 | * ::cuGraphGetRootNodes, |
15478 | * ::cuGraphGetEdges, |
15479 | * ::cuGraphNodeGetType, |
15480 | * ::cuGraphNodeGetDependencies, |
15481 | * ::cuGraphNodeGetDependentNodes |
15482 | */ |
15483 | CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); |
15484 | |
15485 | /** |
15486 | * \brief Returns a graph's root nodes |
15487 | * |
15488 | * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this |
15489 | * function will return the number of root nodes in \p numRootNodes. Otherwise, |
15490 | * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual |
15491 | * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the |
15492 | * number of nodes actually obtained will be returned in \p numRootNodes. |
15493 | * |
15494 | * \param hGraph - Graph to query |
15495 | * \param rootNodes - Pointer to return the root nodes |
15496 | * \param numRootNodes - See description |
15497 | * |
15498 | * \return |
15499 | * ::CUDA_SUCCESS, |
15500 | * ::CUDA_ERROR_DEINITIALIZED, |
15501 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15502 | * ::CUDA_ERROR_INVALID_VALUE |
15503 | * \note_graph_thread_safety |
15504 | * \notefnerr |
15505 | * |
15506 | * \sa |
15507 | * ::cuGraphCreate, |
15508 | * ::cuGraphGetNodes, |
15509 | * ::cuGraphGetEdges, |
15510 | * ::cuGraphNodeGetType, |
15511 | * ::cuGraphNodeGetDependencies, |
15512 | * ::cuGraphNodeGetDependentNodes |
15513 | */ |
15514 | CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); |
15515 | |
15516 | /** |
15517 | * \brief Returns a graph's dependency edges |
15518 | * |
15519 | * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding |
15520 | * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the |
15521 | * node in \p from[i]. \p from and \p to may both be NULL, in which |
15522 | * case this function only returns the number of edges in \p numEdges. Otherwise, |
15523 | * \p numEdges entries will be filled in. If \p numEdges is higher than the actual |
15524 | * number of edges, the remaining entries in \p from and \p to will be set to NULL, and |
15525 | * the number of edges actually returned will be written to \p numEdges. |
15526 | * |
15527 | * \param hGraph - Graph to get the edges from |
15528 | * \param from - Location to return edge endpoints |
15529 | * \param to - Location to return edge endpoints |
15530 | * \param numEdges - See description |
15531 | * |
15532 | * \return |
15533 | * ::CUDA_SUCCESS, |
15534 | * ::CUDA_ERROR_DEINITIALIZED, |
15535 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15536 | * ::CUDA_ERROR_INVALID_VALUE |
15537 | * \note_graph_thread_safety |
15538 | * \notefnerr |
15539 | * |
15540 | * \sa |
15541 | * ::cuGraphGetNodes, |
15542 | * ::cuGraphGetRootNodes, |
15543 | * ::cuGraphAddDependencies, |
15544 | * ::cuGraphRemoveDependencies, |
15545 | * ::cuGraphNodeGetDependencies, |
15546 | * ::cuGraphNodeGetDependentNodes |
15547 | */ |
15548 | CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); |
15549 | |
15550 | /** |
15551 | * \brief Returns a node's dependencies |
15552 | * |
15553 | * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this |
15554 | * function will return the number of dependencies in \p numDependencies. Otherwise, |
15555 | * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual |
15556 | * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the |
15557 | * number of nodes actually obtained will be returned in \p numDependencies. |
15558 | * |
15559 | * \param hNode - Node to query |
15560 | * \param dependencies - Pointer to return the dependencies |
15561 | * \param numDependencies - See description |
15562 | * |
15563 | * \return |
15564 | * ::CUDA_SUCCESS, |
15565 | * ::CUDA_ERROR_DEINITIALIZED, |
15566 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15567 | * ::CUDA_ERROR_INVALID_VALUE |
15568 | * \note_graph_thread_safety |
15569 | * \notefnerr |
15570 | * |
15571 | * \sa |
15572 | * ::cuGraphNodeGetDependentNodes, |
15573 | * ::cuGraphGetNodes, |
15574 | * ::cuGraphGetRootNodes, |
15575 | * ::cuGraphGetEdges, |
15576 | * ::cuGraphAddDependencies, |
15577 | * ::cuGraphRemoveDependencies |
15578 | */ |
15579 | CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); |
15580 | |
15581 | /** |
15582 | * \brief Returns a node's dependent nodes |
15583 | * |
15584 | * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which |
15585 | * case this function will return the number of dependent nodes in \p numDependentNodes. |
15586 | * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is |
15587 | * higher than the actual number of dependent nodes, the remaining entries in |
15588 | * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will |
15589 | * be returned in \p numDependentNodes. |
15590 | * |
15591 | * \param hNode - Node to query |
15592 | * \param dependentNodes - Pointer to return the dependent nodes |
15593 | * \param numDependentNodes - See description |
15594 | * |
15595 | * \return |
15596 | * ::CUDA_SUCCESS, |
15597 | * ::CUDA_ERROR_DEINITIALIZED, |
15598 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15599 | * ::CUDA_ERROR_INVALID_VALUE |
15600 | * \note_graph_thread_safety |
15601 | * \notefnerr |
15602 | * |
15603 | * \sa |
15604 | * ::cuGraphNodeGetDependencies, |
15605 | * ::cuGraphGetNodes, |
15606 | * ::cuGraphGetRootNodes, |
15607 | * ::cuGraphGetEdges, |
15608 | * ::cuGraphAddDependencies, |
15609 | * ::cuGraphRemoveDependencies |
15610 | */ |
15611 | CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); |
15612 | |
15613 | /** |
15614 | * \brief Adds dependency edges to a graph |
15615 | * |
15616 | * The number of dependencies to be added is defined by \p numDependencies. |
15617 | * Elements in \p from and \p to at corresponding indices define a dependency. |
15618 | * Each node in \p from and \p to must belong to \p hGraph. |
15619 | * |
15620 | * If \p numDependencies is 0, elements in \p from and \p to will be ignored. |
15621 | * Specifying an existing dependency will return an error. |
15622 | * |
15623 | * \param hGraph - Graph to which dependencies are added |
15624 | * \param from - Array of nodes that provide the dependencies |
15625 | * \param to - Array of dependent nodes |
15626 | * \param numDependencies - Number of dependencies to be added |
15627 | * |
15628 | * \return |
15629 | * ::CUDA_SUCCESS, |
15630 | * ::CUDA_ERROR_INVALID_VALUE |
15631 | * \note_graph_thread_safety |
15632 | * \notefnerr |
15633 | * |
15634 | * \sa |
15635 | * ::cuGraphRemoveDependencies, |
15636 | * ::cuGraphGetEdges, |
15637 | * ::cuGraphNodeGetDependencies, |
15638 | * ::cuGraphNodeGetDependentNodes |
15639 | */ |
15640 | CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); |
15641 | |
15642 | /** |
15643 | * \brief Removes dependency edges from a graph |
15644 | * |
15645 | * The number of dependencies to be removed is defined by \p numDependencies. |
15646 | * Elements in \p from and \p to at corresponding indices define a dependency. |
15647 | * Each node in \p from and \p to must belong to \p hGraph. |
15648 | * |
15649 | * If \p numDependencies is 0, elements in \p from and \p to will be ignored. |
15650 | * Specifying a non-existing dependency will return an error. |
15651 | * |
15652 | * Dependencies cannot be removed from graphs which contain allocation or free nodes. |
15653 | * Any attempt to do so will return an error. |
15654 | * |
15655 | * \param hGraph - Graph from which to remove dependencies |
15656 | * \param from - Array of nodes that provide the dependencies |
15657 | * \param to - Array of dependent nodes |
15658 | * \param numDependencies - Number of dependencies to be removed |
15659 | * |
15660 | * \return |
15661 | * ::CUDA_SUCCESS, |
15662 | * ::CUDA_ERROR_INVALID_VALUE |
15663 | * \note_graph_thread_safety |
15664 | * \notefnerr |
15665 | * |
15666 | * \sa |
15667 | * ::cuGraphAddDependencies, |
15668 | * ::cuGraphGetEdges, |
15669 | * ::cuGraphNodeGetDependencies, |
15670 | * ::cuGraphNodeGetDependentNodes |
15671 | */ |
15672 | CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); |
15673 | |
15674 | /** |
15675 | * \brief Remove a node from the graph |
15676 | * |
15677 | * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes |
15678 | * on \p hNode and vice versa. |
15679 | * |
15680 | * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. |
15681 | * Any attempt to do so will return an error. |
15682 | * |
15683 | * \param hNode - Node to remove |
15684 | * |
15685 | * \return |
15686 | * ::CUDA_SUCCESS, |
15687 | * ::CUDA_ERROR_INVALID_VALUE |
15688 | * \note_graph_thread_safety |
15689 | * \notefnerr |
15690 | * |
15691 | * \sa |
15692 | * ::cuGraphAddChildGraphNode, |
15693 | * ::cuGraphAddEmptyNode, |
15694 | * ::cuGraphAddKernelNode, |
15695 | * ::cuGraphAddHostNode, |
15696 | * ::cuGraphAddMemcpyNode, |
15697 | * ::cuGraphAddMemsetNode |
15698 | */ |
15699 | CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); |
15700 | |
15701 | /** |
15702 | * \brief Creates an executable graph from a graph |
15703 | * |
15704 | * Instantiates \p hGraph as an executable graph. The graph is validated for any |
15705 | * structural constraints or intra-node constraints which were not previously |
15706 | * validated. If instantiation is successful, a handle to the instantiated graph |
15707 | * is returned in \p phGraphExec. |
15708 | * |
15709 | * If there are any errors, diagnostic information may be returned in \p phErrorNode and |
15710 | * \p logBuffer. This is the primary way to inspect instantiation errors. The output |
15711 | * will be null terminated unless the diagnostics overflow |
15712 | * the buffer. In this case, they will be truncated, and the last byte can be |
15713 | * inspected to determine if truncation occurred. |
15714 | * |
15715 | * \param phGraphExec - Returns instantiated graph |
15716 | * \param hGraph - Graph to instantiate |
15717 | * \param phErrorNode - In case of an instantiation error, this may be modified to |
15718 | * indicate a node contributing to the error |
15719 | * \param logBuffer - A character buffer to store diagnostic messages |
15720 | * \param bufferSize - Size of the log buffer in bytes |
15721 | * |
15722 | * \return |
15723 | * ::CUDA_SUCCESS, |
15724 | * ::CUDA_ERROR_DEINITIALIZED, |
15725 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15726 | * ::CUDA_ERROR_INVALID_VALUE |
15727 | * \note_graph_thread_safety |
15728 | * \notefnerr |
15729 | * |
15730 | * \sa |
15731 | * ::cuGraphInstantiateWithFlags, |
15732 | * ::cuGraphCreate, |
15733 | * ::cuGraphUpload, |
15734 | * ::cuGraphLaunch, |
15735 | * ::cuGraphExecDestroy |
15736 | */ |
15737 | CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); |
15738 | |
15739 | /** |
15740 | * \brief Creates an executable graph from a graph |
15741 | * |
15742 | * Instantiates \p hGraph as an executable graph. The graph is validated for any |
15743 | * structural constraints or intra-node constraints which were not previously |
15744 | * validated. If instantiation is successful, a handle to the instantiated graph |
15745 | * is returned in \p phGraphExec. |
15746 | * |
15747 | * The \p flags parameter controls the behavior of instantiation and subsequent |
15748 | * graph launches. Valid flags are: |
15749 | * |
15750 | * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a |
15751 | * graph containing memory allocation nodes to automatically free any |
15752 | * unfreed memory allocations before the graph is relaunched. |
15753 | * |
15754 | * If \p hGraph contains any allocation or free nodes, there can be at most one |
15755 | * executable graph in existence for that graph at a time. |
15756 | * |
15757 | * An attempt to instantiate a second executable graph before destroying the first |
15758 | * with ::cuGraphExecDestroy will result in an error. |
15759 | * |
15760 | * \param phGraphExec - Returns instantiated graph |
15761 | * \param hGraph - Graph to instantiate |
15762 | * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. |
15763 | * |
15764 | * \return |
15765 | * ::CUDA_SUCCESS, |
15766 | * ::CUDA_ERROR_DEINITIALIZED, |
15767 | * ::CUDA_ERROR_NOT_INITIALIZED, |
15768 | * ::CUDA_ERROR_INVALID_VALUE |
15769 | * \note_graph_thread_safety |
15770 | * \notefnerr |
15771 | * |
15772 | * \sa |
15773 | * ::cuGraphInstantiate, |
15774 | * ::cuGraphCreate, |
15775 | * ::cuGraphUpload, |
15776 | * ::cuGraphLaunch, |
15777 | * ::cuGraphExecDestroy |
15778 | */ |
15779 | CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); |
15780 | |
15781 | /** |
15782 | * \brief Sets the parameters for a kernel node in the given graphExec |
15783 | * |
15784 | * Sets the parameters of a kernel node in an executable graph \p hGraphExec. |
15785 | * The node is identified by the corresponding node \p hNode in the |
15786 | * non-executable graph, from which the executable graph was instantiated. |
15787 | * |
15788 | * \p hNode must not have been removed from the original graph. The \p func field |
15789 | * of \p nodeParams cannot be modified and must match the original value. |
15790 | * All other values can be modified. |
15791 | * |
15792 | * The modifications only affect future launches of \p hGraphExec. Already |
15793 | * enqueued or running launches of \p hGraphExec are not affected by this call. |
15794 | * \p hNode is also not modified by this call. |
15795 | * |
15796 | * \param hGraphExec - The executable graph in which to set the specified node |
15797 | * \param hNode - kernel node from the graph from which graphExec was instantiated |
15798 | * \param nodeParams - Updated Parameters to set |
15799 | * |
15800 | * \return |
15801 | * ::CUDA_SUCCESS, |
15802 | * ::CUDA_ERROR_INVALID_VALUE |
15803 | * \note_graph_thread_safety |
15804 | * \notefnerr |
15805 | * |
15806 | * \sa |
15807 | * ::cuGraphAddKernelNode, |
15808 | * ::cuGraphKernelNodeSetParams, |
15809 | * ::cuGraphExecMemcpyNodeSetParams, |
15810 | * ::cuGraphExecMemsetNodeSetParams, |
15811 | * ::cuGraphExecHostNodeSetParams, |
15812 | * ::cuGraphExecChildGraphNodeSetParams, |
15813 | * ::cuGraphExecEventRecordNodeSetEvent, |
15814 | * ::cuGraphExecEventWaitNodeSetEvent, |
15815 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
15816 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15817 | * ::cuGraphExecUpdate, |
15818 | * ::cuGraphInstantiate |
15819 | */ |
15820 | CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
15821 | |
15822 | /** |
15823 | * \brief Sets the parameters for a memcpy node in the given graphExec. |
15824 | * |
15825 | * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
15826 | * contained \p copyParams at instantiation. hNode must remain in the graph which was |
15827 | * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. |
15828 | * |
15829 | * The source and destination memory in \p copyParams must be allocated from the same |
15830 | * contexts as the original source and destination memory. Both the instantiation-time |
15831 | * memory operands and the memory operands in \p copyParams must be 1-dimensional. |
15832 | * Zero-length operations are not supported. |
15833 | * |
15834 | * The modifications only affect future launches of \p hGraphExec. Already enqueued |
15835 | * or running launches of \p hGraphExec are not affected by this call. hNode is also |
15836 | * not modified by this call. |
15837 | * |
15838 | * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or |
15839 | * either the original or new memory operands are multidimensional. |
15840 | * |
15841 | * \param hGraphExec - The executable graph in which to set the specified node |
15842 | * \param hNode - Memcpy node from the graph which was used to instantiate graphExec |
15843 | * \param copyParams - The updated parameters to set |
15844 | * \param ctx - Context on which to run the node |
15845 | * |
15846 | * \return |
15847 | * ::CUDA_SUCCESS, |
15848 | * ::CUDA_ERROR_INVALID_VALUE |
15849 | * \note_graph_thread_safety |
15850 | * \notefnerr |
15851 | * |
15852 | * \sa |
15853 | * ::cuGraphAddMemcpyNode, |
15854 | * ::cuGraphMemcpyNodeSetParams, |
15855 | * ::cuGraphExecKernelNodeSetParams, |
15856 | * ::cuGraphExecMemsetNodeSetParams, |
15857 | * ::cuGraphExecHostNodeSetParams, |
15858 | * ::cuGraphExecChildGraphNodeSetParams, |
15859 | * ::cuGraphExecEventRecordNodeSetEvent, |
15860 | * ::cuGraphExecEventWaitNodeSetEvent, |
15861 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
15862 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15863 | * ::cuGraphExecUpdate, |
15864 | * ::cuGraphInstantiate |
15865 | */ |
15866 | CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); |
15867 | |
15868 | /** |
15869 | * \brief Sets the parameters for a memset node in the given graphExec. |
15870 | * |
15871 | * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
15872 | * contained \p memsetParams at instantiation. hNode must remain in the graph which was |
15873 | * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. |
15874 | * |
15875 | * The destination memory in \p memsetParams must be allocated from the same |
15876 | * contexts as the original destination memory. Both the instantiation-time |
15877 | * memory operand and the memory operand in \p memsetParams must be 1-dimensional. |
15878 | * Zero-length operations are not supported. |
15879 | * |
15880 | * The modifications only affect future launches of \p hGraphExec. Already enqueued |
15881 | * or running launches of \p hGraphExec are not affected by this call. hNode is also |
15882 | * not modified by this call. |
15883 | * |
15884 | * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or |
15885 | * either the original or new memory operand is multidimensional. |
15886 | * |
15887 | * \param hGraphExec - The executable graph in which to set the specified node |
15888 | * \param hNode - Memset node from the graph which was used to instantiate graphExec |
15889 | * \param memsetParams - The updated parameters to set |
15890 | * \param ctx - Context on which to run the node |
15891 | * |
15892 | * \return |
15893 | * ::CUDA_SUCCESS, |
15894 | * ::CUDA_ERROR_INVALID_VALUE |
15895 | * \note_graph_thread_safety |
15896 | * \notefnerr |
15897 | * |
15898 | * \sa |
15899 | * ::cuGraphAddMemsetNode, |
15900 | * ::cuGraphMemsetNodeSetParams, |
15901 | * ::cuGraphExecKernelNodeSetParams, |
15902 | * ::cuGraphExecMemcpyNodeSetParams, |
15903 | * ::cuGraphExecHostNodeSetParams, |
15904 | * ::cuGraphExecChildGraphNodeSetParams, |
15905 | * ::cuGraphExecEventRecordNodeSetEvent, |
15906 | * ::cuGraphExecEventWaitNodeSetEvent, |
15907 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
15908 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15909 | * ::cuGraphExecUpdate, |
15910 | * ::cuGraphInstantiate |
15911 | */ |
15912 | CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); |
15913 | |
15914 | /** |
15915 | * \brief Sets the parameters for a host node in the given graphExec. |
15916 | * |
15917 | * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
15918 | * contained \p nodeParams at instantiation. hNode must remain in the graph which was |
15919 | * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. |
15920 | * |
15921 | * The modifications only affect future launches of \p hGraphExec. Already enqueued |
15922 | * or running launches of \p hGraphExec are not affected by this call. hNode is also |
15923 | * not modified by this call. |
15924 | * |
15925 | * \param hGraphExec - The executable graph in which to set the specified node |
15926 | * \param hNode - Host node from the graph which was used to instantiate graphExec |
15927 | * \param nodeParams - The updated parameters to set |
15928 | * |
15929 | * \return |
15930 | * ::CUDA_SUCCESS, |
15931 | * ::CUDA_ERROR_INVALID_VALUE |
15932 | * \note_graph_thread_safety |
15933 | * \notefnerr |
15934 | * |
15935 | * \sa |
15936 | * ::cuGraphAddHostNode, |
15937 | * ::cuGraphHostNodeSetParams, |
15938 | * ::cuGraphExecKernelNodeSetParams, |
15939 | * ::cuGraphExecMemcpyNodeSetParams, |
15940 | * ::cuGraphExecMemsetNodeSetParams, |
15941 | * ::cuGraphExecChildGraphNodeSetParams, |
15942 | * ::cuGraphExecEventRecordNodeSetEvent, |
15943 | * ::cuGraphExecEventWaitNodeSetEvent, |
15944 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
15945 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15946 | * ::cuGraphExecUpdate, |
15947 | * ::cuGraphInstantiate |
15948 | */ |
15949 | CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); |
15950 | |
15951 | /** |
15952 | * \brief Updates node parameters in the child graph node in the given graphExec. |
15953 | * |
15954 | * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained |
15955 | * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. |
15956 | * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. |
15957 | * Changed edges to and from \p hNode are ignored. |
15958 | * |
15959 | * The modifications only affect future launches of \p hGraphExec. Already enqueued |
15960 | * or running launches of \p hGraphExec are not affected by this call. \p hNode is also |
15961 | * not modified by this call. |
15962 | * |
15963 | * The topology of \p childGraph, as well as the node insertion order, must match that |
15964 | * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions |
15965 | * on what can be updated in an instantiated graph. The update is recursive, so child graph |
15966 | * nodes contained within the top level child graph will also be updated. |
15967 | * |
15968 | * \param hGraphExec - The executable graph in which to set the specified node |
15969 | * \param hNode - Child graph node from the graph which was used to instantiate graphExec |
15970 | * \param childGraph - The graph supplying the updated parameters |
15971 | * |
15972 | * \return |
15973 | * ::CUDA_SUCCESS, |
15974 | * ::CUDA_ERROR_INVALID_VALUE |
15975 | * \note_graph_thread_safety |
15976 | * \notefnerr |
15977 | * |
15978 | * \sa |
15979 | * ::cuGraphAddChildGraphNode, |
15980 | * ::cuGraphChildGraphNodeGetGraph, |
15981 | * ::cuGraphExecKernelNodeSetParams, |
15982 | * ::cuGraphExecMemcpyNodeSetParams, |
15983 | * ::cuGraphExecMemsetNodeSetParams, |
15984 | * ::cuGraphExecHostNodeSetParams, |
15985 | * ::cuGraphExecEventRecordNodeSetEvent, |
15986 | * ::cuGraphExecEventWaitNodeSetEvent, |
15987 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
15988 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
15989 | * ::cuGraphExecUpdate, |
15990 | * ::cuGraphInstantiate |
15991 | */ |
15992 | CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); |
15993 | |
15994 | /** |
15995 | * \brief Sets the event for an event record node in the given graphExec |
15996 | * |
15997 | * Sets the event of an event record node in an executable graph \p hGraphExec. |
15998 | * The node is identified by the corresponding node \p hNode in the |
15999 | * non-executable graph, from which the executable graph was instantiated. |
16000 | * |
16001 | * The modifications only affect future launches of \p hGraphExec. Already |
16002 | * enqueued or running launches of \p hGraphExec are not affected by this call. |
16003 | * \p hNode is also not modified by this call. |
16004 | * |
16005 | * \param hGraphExec - The executable graph in which to set the specified node |
16006 | * \param hNode - event record node from the graph from which graphExec was instantiated |
16007 | * \param event - Updated event to use |
16008 | * |
16009 | * \return |
16010 | * ::CUDA_SUCCESS, |
16011 | * ::CUDA_ERROR_INVALID_VALUE |
16012 | * \note_graph_thread_safety |
16013 | * \notefnerr |
16014 | * |
16015 | * \sa |
16016 | * ::cuGraphAddEventRecordNode, |
16017 | * ::cuGraphEventRecordNodeGetEvent, |
16018 | * ::cuGraphEventWaitNodeSetEvent, |
16019 | * ::cuEventRecordWithFlags, |
16020 | * ::cuStreamWaitEvent, |
16021 | * ::cuGraphExecKernelNodeSetParams, |
16022 | * ::cuGraphExecMemcpyNodeSetParams, |
16023 | * ::cuGraphExecMemsetNodeSetParams, |
16024 | * ::cuGraphExecHostNodeSetParams, |
16025 | * ::cuGraphExecChildGraphNodeSetParams, |
16026 | * ::cuGraphExecEventWaitNodeSetEvent, |
16027 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
16028 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
16029 | * ::cuGraphExecUpdate, |
16030 | * ::cuGraphInstantiate |
16031 | */ |
16032 | CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); |
16033 | |
16034 | /** |
16035 | * \brief Sets the event for an event wait node in the given graphExec |
16036 | * |
16037 | * Sets the event of an event wait node in an executable graph \p hGraphExec. |
16038 | * The node is identified by the corresponding node \p hNode in the |
16039 | * non-executable graph, from which the executable graph was instantiated. |
16040 | * |
16041 | * The modifications only affect future launches of \p hGraphExec. Already |
16042 | * enqueued or running launches of \p hGraphExec are not affected by this call. |
16043 | * \p hNode is also not modified by this call. |
16044 | * |
16045 | * \param hGraphExec - The executable graph in which to set the specified node |
16046 | * \param hNode - event wait node from the graph from which graphExec was instantiated |
16047 | * \param event - Updated event to use |
16048 | * |
16049 | * \return |
16050 | * ::CUDA_SUCCESS, |
16051 | * ::CUDA_ERROR_INVALID_VALUE |
16052 | * \note_graph_thread_safety |
16053 | * \notefnerr |
16054 | * |
16055 | * \sa |
16056 | * ::cuGraphAddEventWaitNode, |
16057 | * ::cuGraphEventWaitNodeGetEvent, |
16058 | * ::cuGraphEventRecordNodeSetEvent, |
16059 | * ::cuEventRecordWithFlags, |
16060 | * ::cuStreamWaitEvent, |
16061 | * ::cuGraphExecKernelNodeSetParams, |
16062 | * ::cuGraphExecMemcpyNodeSetParams, |
16063 | * ::cuGraphExecMemsetNodeSetParams, |
16064 | * ::cuGraphExecHostNodeSetParams, |
16065 | * ::cuGraphExecChildGraphNodeSetParams, |
16066 | * ::cuGraphExecEventRecordNodeSetEvent, |
16067 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
16068 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
16069 | * ::cuGraphExecUpdate, |
16070 | * ::cuGraphInstantiate |
16071 | */ |
16072 | CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); |
16073 | |
16074 | /** |
16075 | * \brief Sets the parameters for an external semaphore signal node in the given graphExec |
16076 | * |
16077 | * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. |
16078 | * The node is identified by the corresponding node \p hNode in the |
16079 | * non-executable graph, from which the executable graph was instantiated. |
16080 | * |
16081 | * \p hNode must not have been removed from the original graph. |
16082 | * |
16083 | * The modifications only affect future launches of \p hGraphExec. Already |
16084 | * enqueued or running launches of \p hGraphExec are not affected by this call. |
16085 | * \p hNode is also not modified by this call. |
16086 | * |
16087 | * Changing \p nodeParams->numExtSems is not supported. |
16088 | * |
16089 | * \param hGraphExec - The executable graph in which to set the specified node |
16090 | * \param hNode - semaphore signal node from the graph from which \p hGraphExec was instantiated |
16091 | * \param nodeParams - Updated parameters to set |
16092 | * |
16093 | * \return |
16094 | * ::CUDA_SUCCESS, |
16095 | * ::CUDA_ERROR_INVALID_VALUE |
16096 | * \note_graph_thread_safety |
16097 | * \notefnerr |
16098 | * |
16099 | * \sa |
16100 | * ::cuGraphAddExternalSemaphoresSignalNode, |
16101 | * ::cuImportExternalSemaphore, |
16102 | * ::cuSignalExternalSemaphoresAsync, |
16103 | * ::cuWaitExternalSemaphoresAsync, |
16104 | * ::cuGraphExecKernelNodeSetParams, |
16105 | * ::cuGraphExecMemcpyNodeSetParams, |
16106 | * ::cuGraphExecMemsetNodeSetParams, |
16107 | * ::cuGraphExecHostNodeSetParams, |
16108 | * ::cuGraphExecChildGraphNodeSetParams, |
16109 | * ::cuGraphExecEventRecordNodeSetEvent, |
16110 | * ::cuGraphExecEventWaitNodeSetEvent, |
16111 | * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, |
16112 | * ::cuGraphExecUpdate, |
16113 | * ::cuGraphInstantiate |
16114 | */ |
16115 | CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); |
16116 | |
16117 | /** |
16118 | * \brief Sets the parameters for an external semaphore wait node in the given graphExec |
16119 | * |
16120 | * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. |
16121 | * The node is identified by the corresponding node \p hNode in the |
16122 | * non-executable graph, from which the executable graph was instantiated. |
16123 | * |
16124 | * \p hNode must not have been removed from the original graph. |
16125 | * |
16126 | * The modifications only affect future launches of \p hGraphExec. Already |
16127 | * enqueued or running launches of \p hGraphExec are not affected by this call. |
16128 | * \p hNode is also not modified by this call. |
16129 | * |
16130 | * Changing \p nodeParams->numExtSems is not supported. |
16131 | * |
16132 | * \param hGraphExec - The executable graph in which to set the specified node |
16133 | * \param hNode - semaphore wait node from the graph from which \p hGraphExec was instantiated |
16134 | * \param nodeParams - Updated parameters to set |
16135 | * |
16136 | * \return |
16137 | * ::CUDA_SUCCESS, |
16138 | * ::CUDA_ERROR_INVALID_VALUE |
16139 | * \note_graph_thread_safety |
16140 | * \notefnerr |
16141 | * |
16142 | * \sa |
16143 | * ::cuGraphAddExternalSemaphoresWaitNode, |
16144 | * ::cuImportExternalSemaphore, |
16145 | * ::cuSignalExternalSemaphoresAsync, |
16146 | * ::cuWaitExternalSemaphoresAsync, |
16147 | * ::cuGraphExecKernelNodeSetParams, |
16148 | * ::cuGraphExecMemcpyNodeSetParams, |
16149 | * ::cuGraphExecMemsetNodeSetParams, |
16150 | * ::cuGraphExecHostNodeSetParams, |
16151 | * ::cuGraphExecChildGraphNodeSetParams, |
16152 | * ::cuGraphExecEventRecordNodeSetEvent, |
16153 | * ::cuGraphExecEventWaitNodeSetEvent, |
16154 | * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, |
16155 | * ::cuGraphExecUpdate, |
16156 | * ::cuGraphInstantiate |
16157 | */ |
16158 | CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); |
16159 | |
16160 | /** |
16161 | * \brief Uploads an executable graph in a stream |
16162 | * |
16163 | * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of |
16164 | * the same \p hGraphExec will be serialized. Each upload is ordered behind both any |
16165 | * previous work in \p hStream and any previous launches of \p hGraphExec. |
16166 | * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec. |
16167 | * |
16168 | * \param hGraphExec - Executable graph to upload |
16169 | * \param hStream - Stream in which to upload the graph |
16170 | * |
16171 | * \return |
16172 | * ::CUDA_SUCCESS, |
16173 | * ::CUDA_ERROR_DEINITIALIZED, |
16174 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16175 | * ::CUDA_ERROR_INVALID_VALUE |
16176 | * \note_graph_thread_safety |
16177 | * \notefnerr |
16178 | * |
16179 | * \sa |
16180 | * ::cuGraphInstantiate, |
16181 | * ::cuGraphLaunch, |
16182 | * ::cuGraphExecDestroy |
16183 | */ |
16184 | CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); |
16185 | |
16186 | /** |
16187 | * \brief Launches an executable graph in a stream |
16188 | * |
16189 | * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing |
16190 | * at a time. Each launch is ordered behind both any previous work in \p hStream |
16191 | * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be |
16192 | * instantiated multiple times into multiple executable graphs. |
16193 | * |
16194 | * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and |
16195 | * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, |
16196 | * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. |
16197 | * |
16198 | * \param hGraphExec - Executable graph to launch |
16199 | * \param hStream - Stream in which to launch the graph |
16200 | * |
16201 | * \return |
16202 | * ::CUDA_SUCCESS, |
16203 | * ::CUDA_ERROR_DEINITIALIZED, |
16204 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16205 | * ::CUDA_ERROR_INVALID_VALUE |
16206 | * \note_graph_thread_safety |
16207 | * \notefnerr |
16208 | * |
16209 | * \sa |
16210 | * ::cuGraphInstantiate, |
16211 | * ::cuGraphUpload, |
16212 | * ::cuGraphExecDestroy |
16213 | */ |
16214 | CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); |
16215 | |
16216 | /** |
16217 | * \brief Destroys an executable graph |
16218 | * |
16219 | * Destroys the executable graph specified by \p hGraphExec, as well |
16220 | * as all of its executable nodes. If the executable graph is |
16221 | * in-flight, it will not be terminated, but rather freed |
16222 | * asynchronously on completion. |
16223 | * |
16224 | * \param hGraphExec - Executable graph to destroy |
16225 | * |
16226 | * \return |
16227 | * ::CUDA_SUCCESS, |
16228 | * ::CUDA_ERROR_DEINITIALIZED, |
16229 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16230 | * ::CUDA_ERROR_INVALID_VALUE |
16231 | * \note_graph_thread_safety |
16232 | * \notefnerr |
16233 | * |
16234 | * \sa |
16235 | * ::cuGraphInstantiate, |
16236 | * ::cuGraphUpload, |
16237 | * ::cuGraphLaunch |
16238 | */ |
16239 | CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); |
16240 | |
16241 | /** |
16242 | * \brief Destroys a graph |
16243 | * |
16244 | * Destroys the graph specified by \p hGraph, as well as all of its nodes. |
16245 | * |
16246 | * \param hGraph - Graph to destroy |
16247 | * |
16248 | * \return |
16249 | * ::CUDA_SUCCESS, |
16250 | * ::CUDA_ERROR_DEINITIALIZED, |
16251 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16252 | * ::CUDA_ERROR_INVALID_VALUE |
16253 | * \note_graph_thread_safety |
16254 | * \notefnerr |
16255 | * |
16256 | * \sa |
16257 | * ::cuGraphCreate |
16258 | */ |
16259 | CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); |
16260 | |
16261 | /** |
16262 | * \brief Check whether an executable graph can be updated with a graph and perform the update if possible |
16263 | * |
16264 | * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the |
16265 | * node parameters in a topologically identical graph specified by \p hGraph. |
16266 | * |
16267 | * Limitations: |
16268 | * |
16269 | * - Kernel nodes: |
16270 | * - The owning context of the function cannot change. |
16271 | * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated |
16272 | * to a function which uses CDP. |
16273 | * - Memset and memcpy nodes: |
16274 | * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. |
16275 | * - The source/destination memory must be allocated from the same contexts as the original |
16276 | * source/destination memory. |
16277 | * - Only 1D memsets can be changed. |
16278 | * - Additional memcpy node restrictions: |
16279 | * - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE, |
16280 | * CU_MEMORYTYPE_ARRAY, etc.) is not supported. |
16281 | * - External semaphore wait nodes and signal nodes: |
16282 | * - Changing the number of semaphores is not supported. |
16283 | * |
16284 | * Note: The API may add further restrictions in future releases. The return code should always be checked. |
16285 | * |
16286 | * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under |
16287 | * the following conditions: |
16288 | * |
16289 | * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out |
16290 | * is NULL. |
16291 | * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out |
16292 | * is NULL. |
16293 | * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is |
16294 | * the pairless node from \p hGraph. |
16295 | * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph. |
16296 | * |
16297 | * cuGraphExecUpdate sets \p updateResult_out to: |
16298 | * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. |
16299 | * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed. |
16300 | * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case |
16301 | * \p hErrorNode_out is set to the node from \p hGraph. |
16302 | * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported |
16303 | * way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph. |
16304 | * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way |
16305 | * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. |
16306 | * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like |
16307 | * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph. |
16308 | * |
16309 | * If \p updateResult_out isn't set in one of the situations described above, the update check passes |
16310 | * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens |
16311 | * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise, |
16312 | * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS. |
16313 | * |
16314 | * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns |
16315 | * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included |
16316 | * changes which violated constraints specific to instantiated graph update. |
16317 | * |
16318 | * \param hGraphExec The instantiated graph to be updated |
16319 | * \param hGraph The graph containing the updated parameters |
16320 | * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any |
16321 | * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why |
16322 | * |
16323 | * \return |
16324 | * ::CUDA_SUCCESS, |
16325 | * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE |
16326 | * \note_graph_thread_safety |
16327 | * \notefnerr |
16328 | * |
16329 | * \sa |
16330 | * ::cuGraphInstantiate |
16331 | */ |
16332 | CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); |
16333 | |
16334 | /** |
16335 | * \brief Copies attributes from source node to destination node. |
16336 | * |
16337 | * Copies attributes from source node \p src to destination node \p dst. |
16338 | * Both nodes must have the same context. |
16339 | * |
16340 | * \param[out] dst Destination node |
16341 | * \param[in] src Source node |
16342 | * For a list of attributes, see ::CUkernelNodeAttrID |
16343 | * |
16344 | * \return |
16345 | * ::CUDA_SUCCESS, |
16346 | * ::CUDA_ERROR_INVALID_VALUE |
16347 | * \notefnerr |
16348 | * |
16349 | * \sa |
16350 | * ::CUaccessPolicyWindow |
16351 | */ |
16352 | CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); |
16353 | |
16354 | /** |
16355 | * \brief Queries node attribute. |
16356 | * |
16357 | * Queries attribute \p attr from node \p hNode and stores it in the corresponding |
16358 | * member of \p value_out. |
16359 | * |
16360 | * \param[in] hNode - Node to query the attribute from |
16361 | * \param[in] attr - Attribute to query |
16362 | * \param[out] value_out - Location in which to store the queried attribute value |
16363 | * |
16364 | * \return |
16365 | * ::CUDA_SUCCESS, |
16366 | * ::CUDA_ERROR_INVALID_VALUE, |
16367 | * ::CUDA_ERROR_INVALID_HANDLE |
16368 | * \notefnerr |
16369 | * |
16370 | * \sa |
16371 | * ::CUaccessPolicyWindow |
16372 | */ |
16373 | CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, |
16374 | CUkernelNodeAttrValue *value_out); |
16375 | |
16376 | /** |
16377 | * \brief Sets node attribute. |
16378 | * |
16379 | * Sets attribute \p attr on node \p hNode from the corresponding attribute of |
16380 | * \p value. |
16381 | * |
16382 | * \param[out] hNode - Node on which to set the attribute |
16383 | * \param[in] attr - Attribute to set |
16384 | * \param[in] value - Attribute value to set |
16385 | * |
16386 | * \return |
16387 | * ::CUDA_SUCCESS, |
16388 | * ::CUDA_ERROR_INVALID_VALUE, |
16389 | * ::CUDA_ERROR_INVALID_HANDLE |
16390 | * \notefnerr |
16391 | * |
16392 | * \sa |
16393 | * ::CUaccessPolicyWindow |
16394 | */ |
16395 | CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, |
16396 | const CUkernelNodeAttrValue *value); |
16397 | |
16398 | /** |
16399 | * \brief Write a DOT file describing graph structure |
16400 | * |
16401 | * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. |
16402 | * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. |
16403 | * \p flags can be specified to write more detailed information about each node type such as |
16404 | * parameter values, kernel attributes, node and function handles. |
16405 | * |
16406 | * \param hGraph - The graph to create a DOT file from |
16407 | * \param path - The path to write the DOT file to |
16408 | * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write |
16409 | * |
16410 | * \return |
16411 | * ::CUDA_SUCCESS, |
16412 | * ::CUDA_ERROR_INVALID_VALUE, |
16413 | * ::CUDA_ERROR_OPERATING_SYSTEM |
16414 | */ |
16415 | CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); |
16416 | |
16417 | /** |
16418 | * \brief Create a user object |
16419 | * |
16420 | * Create a user object with the specified destructor callback and initial reference count. The |
16421 | * initial references are owned by the caller. |
16422 | * |
16423 | * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they |
16424 | * are executed by a shared internal thread. Another thread may be signaled to perform such |
16425 | * actions, if it does not block forward progress of tasks scheduled through CUDA. |
16426 | * |
16427 | * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. |
16428 | * |
16429 | * \param object_out - Location to return the user object handle |
16430 | * \param ptr - The pointer to pass to the destroy function |
16431 | * \param destroy - Callback to free the user object when it is no longer in use |
16432 | * \param initialRefcount - The initial refcount to create the object with, typically 1. The |
16433 | * initial references are owned by the calling thread. |
16434 | * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, |
16435 | * which is the only defined flag. This indicates that the destroy |
16436 | * callback cannot be waited on by any CUDA API. Users requiring |
16437 | * synchronization of the callback should signal its completion |
16438 | * manually. |
16439 | * |
16440 | * \return |
16441 | * ::CUDA_SUCCESS, |
16442 | * ::CUDA_ERROR_INVALID_VALUE |
16443 | * |
16444 | * \sa |
16445 | * ::cuUserObjectRetain, |
16446 | * ::cuUserObjectRelease, |
16447 | * ::cuGraphRetainUserObject, |
16448 | * ::cuGraphReleaseUserObject, |
16449 | * ::cuGraphCreate |
16450 | */ |
16451 | CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, |
16452 | unsigned int initialRefcount, unsigned int flags); |
16453 | |
16454 | /** |
16455 | * \brief Retain a reference to a user object |
16456 | * |
16457 | * Retains new references to a user object. The new references are owned by the caller. |
16458 | * |
16459 | * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. |
16460 | * |
16461 | * \param object - The object to retain |
16462 | * \param count - The number of references to retain, typically 1. Must be nonzero |
16463 | * and not larger than INT_MAX. |
16464 | * |
16465 | * \return |
16466 | * ::CUDA_SUCCESS, |
16467 | * ::CUDA_ERROR_INVALID_VALUE |
16468 | * |
16469 | * \sa |
16470 | * ::cuUserObjectCreate, |
16471 | * ::cuUserObjectRelease, |
16472 | * ::cuGraphRetainUserObject, |
16473 | * ::cuGraphReleaseUserObject, |
16474 | * ::cuGraphCreate |
16475 | */ |
16476 | CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); |
16477 | |
16478 | /** |
16479 | * \brief Release a reference to a user object |
16480 | * |
16481 | * Releases user object references owned by the caller. The object's destructor is invoked if |
16482 | * the reference count reaches zero. |
16483 | * |
16484 | * It is undefined behavior to release references not owned by the caller, or to use a user |
16485 | * object handle after all references are released. |
16486 | * |
16487 | * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. |
16488 | * |
16489 | * \param object - The object to release |
16490 | * \param count - The number of references to release, typically 1. Must be nonzero |
16491 | * and not larger than INT_MAX. |
16492 | * |
16493 | * \return |
16494 | * ::CUDA_SUCCESS, |
16495 | * ::CUDA_ERROR_INVALID_VALUE |
16496 | * |
16497 | * \sa |
16498 | * ::cuUserObjectCreate, |
16499 | * ::cuUserObjectRetain, |
16500 | * ::cuGraphRetainUserObject, |
16501 | * ::cuGraphReleaseUserObject, |
16502 | * ::cuGraphCreate |
16503 | */ |
16504 | CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); |
16505 | |
16506 | /** |
16507 | * \brief Retain a reference to a user object from a graph |
16508 | * |
16509 | * Creates or moves user object references that will be owned by a CUDA graph. |
16510 | * |
16511 | * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. |
16512 | * |
16513 | * \param graph - The graph to associate the reference with |
16514 | * \param object - The user object to retain a reference for |
16515 | * \param count - The number of references to add to the graph, typically 1. Must be |
16516 | * nonzero and not larger than INT_MAX. |
16517 | * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references |
16518 | * from the calling thread, rather than creating new references. Pass 0 |
16519 | * to create new references. |
16520 | * |
16521 | * \return |
16522 | * ::CUDA_SUCCESS, |
16523 | * ::CUDA_ERROR_INVALID_VALUE |
16524 | * |
16525 | * \sa |
16526 | * ::cuUserObjectCreate, |
16527 | * ::cuUserObjectRetain, |
16528 | * ::cuUserObjectRelease, |
16529 | * ::cuGraphReleaseUserObject, |
16530 | * ::cuGraphCreate |
16531 | */ |
16532 | CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); |
16533 | |
16534 | /** |
16535 | * \brief Release a user object reference from a graph |
16536 | * |
16537 | * Releases user object references owned by a graph. |
16538 | * |
16539 | * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. |
16540 | * |
16541 | * \param graph - The graph that will release the reference |
16542 | * \param object - The user object to release a reference for |
16543 | * \param count - The number of references to release, typically 1. Must be nonzero |
16544 | * and not larger than INT_MAX. |
16545 | * |
16546 | * \return |
16547 | * ::CUDA_SUCCESS, |
16548 | * ::CUDA_ERROR_INVALID_VALUE |
16549 | * |
16550 | * \sa |
16551 | * ::cuUserObjectCreate, |
16552 | * ::cuUserObjectRetain, |
16553 | * ::cuUserObjectRelease, |
16554 | * ::cuGraphRetainUserObject, |
16555 | * ::cuGraphCreate |
16556 | */ |
16557 | CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); |
16558 | |
16559 | /** @} */ /* END CUDA_GRAPH */ |
16560 | |
16561 | /** |
16562 | * \defgroup CUDA_OCCUPANCY Occupancy |
16563 | * |
16564 | * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver |
16565 | * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
16566 | * |
16567 | * This section describes the occupancy calculation functions of the low-level CUDA |
16568 | * driver application programming interface. |
16569 | * |
16570 | * @{ |
16571 | */ |
16572 | |
16573 | /** |
16574 | * \brief Returns occupancy of a function |
16575 | * |
16576 | * Returns in \p *numBlocks the maximum number of active blocks per |
16577 | * streaming multiprocessor. |
16578 | * |
16579 | * \param numBlocks - Returned occupancy |
16580 | * \param func - Kernel for which occupancy is calculated |
16581 | * \param blockSize - Block size the kernel is intended to be launched with |
16582 | * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes |
16583 | * |
16584 | * \return |
16585 | * ::CUDA_SUCCESS, |
16586 | * ::CUDA_ERROR_DEINITIALIZED, |
16587 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16588 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16589 | * ::CUDA_ERROR_INVALID_VALUE, |
16590 | * ::CUDA_ERROR_UNKNOWN |
16591 | * \notefnerr |
16592 | * |
16593 | * \sa |
16594 | * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor |
16595 | */ |
16596 | CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); |
16597 | |
16598 | /** |
16599 | * \brief Returns occupancy of a function |
16600 | * |
16601 | * Returns in \p *numBlocks the maximum number of active blocks per |
16602 | * streaming multiprocessor. |
16603 | * |
16604 | * The \p flags parameter controls how special cases are handled. The |
16605 | * valid flags are: |
16606 | * |
16607 | * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as |
16608 | * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; |
16609 | * |
16610 | * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the |
16611 | * default behavior on platforms where global caching affects |
16612 | * occupancy. On such platforms, if caching is enabled, but |
16613 | * per-block SM resource usage would result in zero occupancy, the |
16614 | * occupancy calculator will calculate the occupancy as if caching |
16615 | * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes |
16616 | * the occupancy calculator return 0 in such cases. More information |
16617 | * can be found about this feature in the "Unified L1/Texture Cache" |
16618 | * section of the Maxwell tuning guide. |
16619 | * |
16620 | * \param numBlocks - Returned occupancy |
16621 | * \param func - Kernel for which occupancy is calculated |
16622 | * \param blockSize - Block size the kernel is intended to be launched with |
16623 | * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes |
16624 | * \param flags - Requested behavior for the occupancy calculator |
16625 | * |
16626 | * \return |
16627 | * ::CUDA_SUCCESS, |
16628 | * ::CUDA_ERROR_DEINITIALIZED, |
16629 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16630 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16631 | * ::CUDA_ERROR_INVALID_VALUE, |
16632 | * ::CUDA_ERROR_UNKNOWN |
16633 | * \notefnerr |
16634 | * |
16635 | * \sa |
16636 | * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags |
16637 | */ |
16638 | CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); |
16639 | |
16640 | /** |
16641 | * \brief Suggest a launch configuration with reasonable occupancy |
16642 | * |
16643 | * Returns in \p *blockSize a reasonable block size that can achieve |
16644 | * the maximum occupancy (or, the maximum number of active warps with |
16645 | * the fewest blocks per multiprocessor), and in \p *minGridSize the |
16646 | * minimum grid size to achieve the maximum occupancy. |
16647 | * |
16648 | * If \p blockSizeLimit is 0, the configurator will use the maximum |
16649 | * block size permitted by the device / function instead. |
16650 | * |
16651 | * If per-block dynamic shared memory allocation is not needed, the |
16652 | * user should leave both \p blockSizeToDynamicSMemSize and \p |
16653 | * dynamicSMemSize as 0. |
16654 | * |
16655 | * If per-block dynamic shared memory allocation is needed, then if |
16656 | * the dynamic shared memory size is constant regardless of block |
16657 | * size, the size should be passed through \p dynamicSMemSize, and \p |
16658 | * blockSizeToDynamicSMemSize should be NULL. |
16659 | * |
16660 | * Otherwise, if the per-block dynamic shared memory size varies with |
16661 | * different block sizes, the user needs to provide a unary function |
16662 | * through \p blockSizeToDynamicSMemSize that computes the dynamic |
16663 | * shared memory needed by \p func for any given block size. \p |
16664 | * dynamicSMemSize is ignored. An example signature is: |
16665 | * |
16666 | * \code |
16667 | * // Take block size, returns dynamic shared memory needed |
16668 | * size_t blockToSmem(int blockSize); |
16669 | * \endcode |
16670 | * |
16671 | * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy |
16672 | * \param blockSize - Returned maximum block size that can achieve the maximum occupancy |
16673 | * \param func - Kernel for which launch configuration is calculated |
16674 | * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size |
16675 | * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes |
16676 | * \param blockSizeLimit - The maximum block size \p func is designed to handle |
16677 | * |
16678 | * \return |
16679 | * ::CUDA_SUCCESS, |
16680 | * ::CUDA_ERROR_DEINITIALIZED, |
16681 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16682 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16683 | * ::CUDA_ERROR_INVALID_VALUE, |
16684 | * ::CUDA_ERROR_UNKNOWN |
16685 | * \notefnerr |
16686 | * |
16687 | * \sa |
16688 | * ::cudaOccupancyMaxPotentialBlockSize |
16689 | */ |
16690 | CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); |
16691 | |
16692 | /** |
16693 | * \brief Suggest a launch configuration with reasonable occupancy |
16694 | * |
16695 | * An extended version of ::cuOccupancyMaxPotentialBlockSize. In |
16696 | * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, |
16697 | * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p flags |
16698 | * parameter. |
16699 | * |
16700 | * The \p flags parameter controls how special cases are handled. The |
16701 | * valid flags are: |
16702 | * |
16703 | * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as |
16704 | * ::cuOccupancyMaxPotentialBlockSize; |
16705 | * |
16706 | * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the |
16707 | * default behavior on platforms where global caching affects |
16708 | * occupancy. On such platforms, the launch configurations that |
16709 | * produce maximal occupancy might not support global |
16710 | * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE |
16711 | * guarantees that the produced launch configuration is global |
16712 | * caching compatible at a potential cost of occupancy. More information |
16713 | * can be found about this feature in the "Unified L1/Texture Cache" |
16714 | * section of the Maxwell tuning guide. |
16715 | * |
16716 | * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy |
16717 | * \param blockSize - Returned maximum block size that can achieve the maximum occupancy |
16718 | * \param func - Kernel for which launch configuration is calculated |
16719 | * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size |
16720 | * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes |
16721 | * \param blockSizeLimit - The maximum block size \p func is designed to handle |
16722 | * \param flags - Options |
16723 | * |
16724 | * \return |
16725 | * ::CUDA_SUCCESS, |
16726 | * ::CUDA_ERROR_DEINITIALIZED, |
16727 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16728 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16729 | * ::CUDA_ERROR_INVALID_VALUE, |
16730 | * ::CUDA_ERROR_UNKNOWN |
16731 | * \notefnerr |
16732 | * |
16733 | * \sa |
16734 | * ::cudaOccupancyMaxPotentialBlockSizeWithFlags |
16735 | */ |
16736 | CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); |
16737 | |
16738 | /** |
16739 | * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM |
16740 | * |
16741 | * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. |
16742 | * |
16743 | * \param dynamicSmemSize - Returned maximum dynamic shared memory |
16744 | * \param func - Kernel function for which occupancy is calculated |
16745 | * \param numBlocks - Number of blocks to fit on SM |
16746 | * \param blockSize - Size of the blocks |
16747 | * |
16748 | * \return |
16749 | * ::CUDA_SUCCESS, |
16750 | * ::CUDA_ERROR_DEINITIALIZED, |
16751 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16752 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16753 | * ::CUDA_ERROR_INVALID_VALUE, |
16754 | * ::CUDA_ERROR_UNKNOWN |
16755 | * \notefnerr |
16756 | * |
16757 | * \sa ::cuOccupancyMaxActiveBlocksPerMultiprocessor |
16758 | */ |
16759 | CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); |
16760 | |
16761 | /** @} */ /* END CUDA_OCCUPANCY */ |
16762 | |
16763 | /** |
16764 | * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] |
16765 | * |
16766 | * ___MANBRIEF___ deprecated texture reference management functions of the |
16767 | * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
16768 | * |
16769 | * This section describes the deprecated texture reference management |
16770 | * functions of the low-level CUDA driver application programming interface. |
16771 | * |
16772 | * @{ |
16773 | */ |
16774 | |
16775 | /** |
16776 | * \brief Binds an array as a texture reference |
16777 | * |
16778 | * \deprecated |
16779 | * |
16780 | * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any |
16781 | * previous address or CUDA array state associated with the texture reference |
16782 | * is superseded by this function. \p Flags must be set to |
16783 | * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is |
16784 | * unbound. |
16785 | * |
16786 | * \param hTexRef - Texture reference to bind |
16787 | * \param hArray - Array to bind |
16788 | * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) |
16789 | * |
16790 | * \return |
16791 | * ::CUDA_SUCCESS, |
16792 | * ::CUDA_ERROR_DEINITIALIZED, |
16793 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16794 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16795 | * ::CUDA_ERROR_INVALID_VALUE |
16796 | * |
16797 | * \sa ::cuTexRefSetAddress, |
16798 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, |
16799 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
16800 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
16801 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
16802 | * ::cudaBindTextureToArray |
16803 | */ |
16804 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); |
16805 | |
16806 | /** |
16807 | * \brief Binds a mipmapped array to a texture reference |
16808 | * |
16809 | * \deprecated |
16810 | * |
16811 | * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. |
16812 | * Any previous address or CUDA array state associated with the texture reference |
16813 | * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. |
16814 | * Any CUDA array previously bound to \p hTexRef is unbound. |
16815 | * |
16816 | * \param hTexRef - Texture reference to bind |
16817 | * \param hMipmappedArray - Mipmapped array to bind |
16818 | * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) |
16819 | * |
16820 | * \return |
16821 | * ::CUDA_SUCCESS, |
16822 | * ::CUDA_ERROR_DEINITIALIZED, |
16823 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16824 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16825 | * ::CUDA_ERROR_INVALID_VALUE |
16826 | * |
16827 | * \sa ::cuTexRefSetAddress, |
16828 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, |
16829 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
16830 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
16831 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
16832 | * ::cudaBindTextureToMipmappedArray |
16833 | */ |
16834 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); |
16835 | |
16836 | /** |
16837 | * \brief Binds an address as a texture reference |
16838 | * |
16839 | * \deprecated |
16840 | * |
16841 | * Binds a linear address range to the texture reference \p hTexRef. Any |
16842 | * previous address or CUDA array state associated with the texture reference |
16843 | * is superseded by this function. Any memory previously bound to \p hTexRef |
16844 | * is unbound. |
16845 | * |
16846 | * Since the hardware enforces an alignment requirement on texture base |
16847 | * addresses, ::cuTexRefSetAddress() passes back a byte offset in |
16848 | * \p *ByteOffset that must be applied to texture fetches in order to read from |
16849 | * the desired memory. This offset must be divided by the texel size and |
16850 | * passed to kernels that read from the texture so it can be applied to the |
16851 | * ::tex1Dfetch() function. |
16852 | * |
16853 | * If the device memory pointer was returned from ::cuMemAlloc(), the offset |
16854 | * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. |
16855 | * |
16856 | * The total number of elements (or texels) in the linear address range |
16857 | * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. |
16858 | * The number of elements is computed as (\p bytes / bytesPerElement), |
16859 | * where bytesPerElement is determined from the data format and number of |
16860 | * components set using ::cuTexRefSetFormat(). |
16861 | * |
16862 | * \param ByteOffset - Returned byte offset |
16863 | * \param hTexRef - Texture reference to bind |
16864 | * \param dptr - Device pointer to bind |
16865 | * \param bytes - Size of memory to bind in bytes |
16866 | * |
16867 | * \return |
16868 | * ::CUDA_SUCCESS, |
16869 | * ::CUDA_ERROR_DEINITIALIZED, |
16870 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16871 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16872 | * ::CUDA_ERROR_INVALID_VALUE |
16873 | * |
16874 | * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
16875 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
16876 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
16877 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
16878 | * ::cudaBindTexture |
16879 | */ |
16880 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); |
16881 | |
16882 | /** |
16883 | * \brief Binds an address as a 2D texture reference |
16884 | * |
16885 | * \deprecated |
16886 | * |
16887 | * Binds a linear address range to the texture reference \p hTexRef. Any |
16888 | * previous address or CUDA array state associated with the texture reference |
16889 | * is superseded by this function. Any memory previously bound to \p hTexRef |
16890 | * is unbound. |
16891 | * |
16892 | * Using a ::tex2D() function inside a kernel requires a call to either |
16893 | * ::cuTexRefSetArray() to bind the corresponding texture reference to an |
16894 | * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear |
16895 | * memory. |
16896 | * |
16897 | * Function calls to ::cuTexRefSetFormat() cannot follow calls to |
16898 | * ::cuTexRefSetAddress2D() for the same texture reference. |
16899 | * |
16900 | * It is required that \p dptr be aligned to the appropriate hardware-specific |
16901 | * texture alignment. You can query this value using the device attribute |
16902 | * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is |
16903 | * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. |
16904 | * |
16905 | * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. |
16906 | * This value can be queried using the device attribute |
16907 | * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is |
16908 | * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. |
16909 | * |
16910 | * Width and Height, which are specified in elements (or texels), cannot exceed |
16911 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and |
16912 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. |
16913 | * \p Pitch, which is specified in bytes, cannot exceed |
16914 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. |
16915 | * |
16916 | * \param hTexRef - Texture reference to bind |
16917 | * \param desc - Descriptor of CUDA array |
16918 | * \param dptr - Device pointer to bind |
16919 | * \param Pitch - Line pitch in bytes |
16920 | * |
16921 | * \return |
16922 | * ::CUDA_SUCCESS, |
16923 | * ::CUDA_ERROR_DEINITIALIZED, |
16924 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16925 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16926 | * ::CUDA_ERROR_INVALID_VALUE |
16927 | * |
16928 | * \sa ::cuTexRefSetAddress, |
16929 | * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
16930 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
16931 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
16932 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
16933 | * ::cudaBindTexture2D |
16934 | */ |
16935 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); |
16936 | |
16937 | /** |
16938 | * \brief Sets the format for a texture reference |
16939 | * |
16940 | * \deprecated |
16941 | * |
16942 | * Specifies the format of the data to be read by the texture reference |
16943 | * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the |
16944 | * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: |
16945 | * They specify the format of each component and the number of components per |
16946 | * array element. |
16947 | * |
16948 | * \param hTexRef - Texture reference |
16949 | * \param fmt - Format to set |
16950 | * \param NumPackedComponents - Number of components per array element |
16951 | * |
16952 | * \return |
16953 | * ::CUDA_SUCCESS, |
16954 | * ::CUDA_ERROR_DEINITIALIZED, |
16955 | * ::CUDA_ERROR_NOT_INITIALIZED, |
16956 | * ::CUDA_ERROR_INVALID_CONTEXT, |
16957 | * ::CUDA_ERROR_INVALID_VALUE |
16958 | * |
16959 | * \sa ::cuTexRefSetAddress, |
16960 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
16961 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, |
16962 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
16963 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
16964 | * ::cudaCreateChannelDesc, |
16965 | * ::cudaBindTexture, |
16966 | * ::cudaBindTexture2D, |
16967 | * ::cudaBindTextureToArray, |
16968 | * ::cudaBindTextureToMipmappedArray |
16969 | */ |
16970 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); |
16971 | |
16972 | /** |
16973 | * \brief Sets the addressing mode for a texture reference |
16974 | * |
16975 | * \deprecated |
16976 | * |
16977 | * Specifies the addressing mode \p am for the given dimension \p dim of the |
16978 | * texture reference \p hTexRef. If \p dim is zero, the addressing mode is |
16979 | * applied to the first parameter of the functions used to fetch from the |
16980 | * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined |
16981 | * as: |
16982 | * \code |
16983 | typedef enum CUaddress_mode_enum { |
16984 | CU_TR_ADDRESS_MODE_WRAP = 0, |
16985 | CU_TR_ADDRESS_MODE_CLAMP = 1, |
16986 | CU_TR_ADDRESS_MODE_MIRROR = 2, |
16987 | CU_TR_ADDRESS_MODE_BORDER = 3 |
16988 | } CUaddress_mode; |
16989 | * \endcode |
16990 | * |
16991 | * Note that this call has no effect if \p hTexRef is bound to linear memory. |
16992 | * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only |
16993 | * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. |
16994 | * |
16995 | * \param hTexRef - Texture reference |
16996 | * \param dim - Dimension |
16997 | * \param am - Addressing mode to set |
16998 | * |
16999 | * \return |
17000 | * ::CUDA_SUCCESS, |
17001 | * ::CUDA_ERROR_DEINITIALIZED, |
17002 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17003 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17004 | * ::CUDA_ERROR_INVALID_VALUE |
17005 | * |
17006 | * \sa ::cuTexRefSetAddress, |
17007 | * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, |
17008 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17009 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17010 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17011 | * ::cudaBindTexture, |
17012 | * ::cudaBindTexture2D, |
17013 | * ::cudaBindTextureToArray, |
17014 | * ::cudaBindTextureToMipmappedArray |
17015 | */ |
17016 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); |
17017 | |
17018 | /** |
17019 | * \brief Sets the filtering mode for a texture reference |
17020 | * |
17021 | * \deprecated |
17022 | * |
17023 | * Specifies the filtering mode \p fm to be used when reading memory through |
17024 | * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: |
17025 | * |
17026 | * \code |
17027 | typedef enum CUfilter_mode_enum { |
17028 | CU_TR_FILTER_MODE_POINT = 0, |
17029 | CU_TR_FILTER_MODE_LINEAR = 1 |
17030 | } CUfilter_mode; |
17031 | * \endcode |
17032 | * |
17033 | * Note that this call has no effect if \p hTexRef is bound to linear memory. |
17034 | * |
17035 | * \param hTexRef - Texture reference |
17036 | * \param fm - Filtering mode to set |
17037 | * |
17038 | * \return |
17039 | * ::CUDA_SUCCESS, |
17040 | * ::CUDA_ERROR_DEINITIALIZED, |
17041 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17042 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17043 | * ::CUDA_ERROR_INVALID_VALUE |
17044 | * |
17045 | * \sa ::cuTexRefSetAddress, |
17046 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17047 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17048 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17049 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17050 | * ::cudaBindTextureToArray |
17051 | */ |
17052 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); |
17053 | |
17054 | /** |
17055 | * \brief Sets the mipmap filtering mode for a texture reference |
17056 | * |
17057 | * \deprecated |
17058 | * |
17059 | * Specifies the mipmap filtering mode \p fm to be used when reading memory through |
17060 | * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: |
17061 | * |
17062 | * \code |
17063 | typedef enum CUfilter_mode_enum { |
17064 | CU_TR_FILTER_MODE_POINT = 0, |
17065 | CU_TR_FILTER_MODE_LINEAR = 1 |
17066 | } CUfilter_mode; |
17067 | * \endcode |
17068 | * |
17069 | * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
17070 | * |
17071 | * \param hTexRef - Texture reference |
17072 | * \param fm - Filtering mode to set |
17073 | * |
17074 | * \return |
17075 | * ::CUDA_SUCCESS, |
17076 | * ::CUDA_ERROR_DEINITIALIZED, |
17077 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17078 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17079 | * ::CUDA_ERROR_INVALID_VALUE |
17080 | * |
17081 | * \sa ::cuTexRefSetAddress, |
17082 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17083 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17084 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17085 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17086 | * ::cudaBindTextureToMipmappedArray |
17087 | */ |
17088 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); |
17089 | |
17090 | /** |
17091 | * \brief Sets the mipmap level bias for a texture reference |
17092 | * |
17093 | * \deprecated |
17094 | * |
17095 | * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when |
17096 | * reading memory through the texture reference \p hTexRef. |
17097 | * |
17098 | * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
17099 | * |
17100 | * \param hTexRef - Texture reference |
17101 | * \param bias - Mipmap level bias |
17102 | * |
17103 | * \return |
17104 | * ::CUDA_SUCCESS, |
17105 | * ::CUDA_ERROR_DEINITIALIZED, |
17106 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17107 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17108 | * ::CUDA_ERROR_INVALID_VALUE |
17109 | * |
17110 | * \sa ::cuTexRefSetAddress, |
17111 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17112 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17113 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17114 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17115 | * ::cudaBindTextureToMipmappedArray |
17116 | */ |
17117 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); |
17118 | |
17119 | /** |
17120 | * \brief Sets the min/max mipmap level clamps for a texture reference |
17121 | * |
17122 | * \deprecated |
17123 | * |
17124 | * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp |
17125 | * respectively, to be used when reading memory through the texture reference |
17126 | * \p hTexRef. |
17127 | * |
17128 | * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
17129 | * |
17130 | * \param hTexRef - Texture reference |
17131 | * \param minMipmapLevelClamp - Mipmap min level clamp |
17132 | * \param maxMipmapLevelClamp - Mipmap max level clamp |
17133 | * |
17134 | * \return |
17135 | * ::CUDA_SUCCESS, |
17136 | * ::CUDA_ERROR_DEINITIALIZED, |
17137 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17138 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17139 | * ::CUDA_ERROR_INVALID_VALUE |
17140 | * |
17141 | * \sa ::cuTexRefSetAddress, |
17142 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17143 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17144 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17145 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17146 | * ::cudaBindTextureToMipmappedArray |
17147 | */ |
17148 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); |
17149 | |
17150 | /** |
17151 | * \brief Sets the maximum anisotropy for a texture reference |
17152 | * |
17153 | * \deprecated |
17154 | * |
17155 | * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through |
17156 | * the texture reference \p hTexRef. |
17157 | * |
17158 | * Note that this call has no effect if \p hTexRef is bound to linear memory. |
17159 | * |
17160 | * \param hTexRef - Texture reference |
17161 | * \param maxAniso - Maximum anisotropy |
17162 | * |
17163 | * \return |
17164 | * ::CUDA_SUCCESS, |
17165 | * ::CUDA_ERROR_DEINITIALIZED, |
17166 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17167 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17168 | * ::CUDA_ERROR_INVALID_VALUE |
17169 | * |
17170 | * \sa ::cuTexRefSetAddress, |
17171 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17172 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17173 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17174 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17175 | * ::cudaBindTextureToArray, |
17176 | * ::cudaBindTextureToMipmappedArray |
17177 | */ |
17178 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); |
17179 | |
17180 | /** |
17181 | * \brief Sets the border color for a texture reference |
17182 | * |
17183 | * \deprecated |
17184 | * |
17185 | * Specifies the value of the RGBA color via \p pBorderColor for the texture reference |
17186 | * \p hTexRef. The color value supports only float type and holds color components in |
17187 | * the following sequence: |
17188 | * pBorderColor[0] holds 'R' component |
17189 | * pBorderColor[1] holds 'G' component |
17190 | * pBorderColor[2] holds 'B' component |
17191 | * pBorderColor[3] holds 'A' component |
17192 | * |
17193 | * Note that the color values can be set only when the Address mode is set to |
17194 | * ::CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. |
17195 | * Applications using integer border color values have to "reinterpret_cast" their values to float. |
17196 | * |
17197 | * \param hTexRef - Texture reference |
17198 | * \param pBorderColor - RGBA color |
17199 | * |
17200 | * \return |
17201 | * ::CUDA_SUCCESS, |
17202 | * ::CUDA_ERROR_DEINITIALIZED, |
17203 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17204 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17205 | * ::CUDA_ERROR_INVALID_VALUE |
17206 | * |
17207 | * \sa ::cuTexRefSetAddressMode, |
17208 | * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, |
17209 | * ::cudaBindTexture, |
17210 | * ::cudaBindTexture2D, |
17211 | * ::cudaBindTextureToArray, |
17212 | * ::cudaBindTextureToMipmappedArray |
17213 | */ |
17214 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); |
17215 | |
17216 | /** |
17217 | * \brief Sets the flags for a texture reference |
17218 | * |
17219 | * \deprecated |
17220 | * |
17221 | * Specifies optional flags via \p Flags to specify the behavior of data |
17222 | * returned through the texture reference \p hTexRef. The valid flags are: |
17223 | * |
17224 | * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of |
17225 | * having the texture promote integer data to floating point data in the |
17226 | * range [0, 1]. Note that textures with 32-bit integer format |
17227 | * would not be promoted, regardless of whether or not this |
17228 | * flag is specified; |
17229 | * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the |
17230 | * default behavior of having the texture coordinates range |
17231 | * from [0, Dim) where Dim is the width or height of the CUDA |
17232 | * array. Instead, the texture coordinates [0, 1.0) reference |
17233 | * the entire breadth of the array dimension; |
17234 | * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear |
17235 | * filtering optimizations. Trilinear optimizations improve texture filtering |
17236 | * performance by allowing bilinear filtering on textures in scenarios where |
17237 | * it can closely approximate the expected results. |
17238 | * |
17239 | * \param hTexRef - Texture reference |
17240 | * \param Flags - Optional flags to set |
17241 | * |
17242 | * \return |
17243 | * ::CUDA_SUCCESS, |
17244 | * ::CUDA_ERROR_DEINITIALIZED, |
17245 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17246 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17247 | * ::CUDA_ERROR_INVALID_VALUE |
17248 | * |
17249 | * \sa ::cuTexRefSetAddress, |
17250 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17251 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, |
17252 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17253 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
17254 | * ::cudaBindTexture, |
17255 | * ::cudaBindTexture2D, |
17256 | * ::cudaBindTextureToArray, |
17257 | * ::cudaBindTextureToMipmappedArray |
17258 | */ |
17259 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); |
17260 | |
17261 | /** |
17262 | * \brief Gets the address associated with a texture reference |
17263 | * |
17264 | * \deprecated |
17265 | * |
17266 | * Returns in \p *pdptr the base address bound to the texture reference |
17267 | * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
17268 | * is not bound to any device memory range. |
17269 | * |
17270 | * \param pdptr - Returned device address |
17271 | * \param hTexRef - Texture reference |
17272 | * |
17273 | * \return |
17274 | * ::CUDA_SUCCESS, |
17275 | * ::CUDA_ERROR_DEINITIALIZED, |
17276 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17277 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17278 | * ::CUDA_ERROR_INVALID_VALUE |
17279 | * |
17280 | * \sa ::cuTexRefSetAddress, |
17281 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17282 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17283 | * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17284 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17285 | */ |
17286 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); |
17287 | |
17288 | /** |
17289 | * \brief Gets the array bound to a texture reference |
17290 | * |
17291 | * \deprecated |
17292 | * |
17293 | * Returns in \p *phArray the CUDA array bound to the texture reference |
17294 | * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
17295 | * is not bound to any CUDA array. |
17296 | * |
17297 | * \param phArray - Returned array |
17298 | * \param hTexRef - Texture reference |
17299 | * |
17300 | * \return |
17301 | * ::CUDA_SUCCESS, |
17302 | * ::CUDA_ERROR_DEINITIALIZED, |
17303 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17304 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17305 | * ::CUDA_ERROR_INVALID_VALUE |
17306 | * |
17307 | * \sa ::cuTexRefSetAddress, |
17308 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17309 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17310 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, |
17311 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17312 | */ |
17313 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); |
17314 | |
17315 | /** |
17316 | * \brief Gets the mipmapped array bound to a texture reference |
17317 | * |
17318 | * \deprecated |
17319 | * |
17320 | * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture |
17321 | * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
17322 | * is not bound to any CUDA mipmapped array. |
17323 | * |
17324 | * \param phMipmappedArray - Returned mipmapped array |
17325 | * \param hTexRef - Texture reference |
17326 | * |
17327 | * \return |
17328 | * ::CUDA_SUCCESS, |
17329 | * ::CUDA_ERROR_DEINITIALIZED, |
17330 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17331 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17332 | * ::CUDA_ERROR_INVALID_VALUE |
17333 | * |
17334 | * \sa ::cuTexRefSetAddress, |
17335 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17336 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17337 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, |
17338 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17339 | */ |
17340 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); |
17341 | |
17342 | /** |
17343 | * \brief Gets the addressing mode used by a texture reference |
17344 | * |
17345 | * \deprecated |
17346 | * |
17347 | * Returns in \p *pam the addressing mode corresponding to the |
17348 | * dimension \p dim of the texture reference \p hTexRef. Currently, the only |
17349 | * valid values for \p dim are 0 and 1. |
17350 | * |
17351 | * \param pam - Returned addressing mode |
17352 | * \param hTexRef - Texture reference |
17353 | * \param dim - Dimension |
17354 | * |
17355 | * \return |
17356 | * ::CUDA_SUCCESS, |
17357 | * ::CUDA_ERROR_DEINITIALIZED, |
17358 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17359 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17360 | * ::CUDA_ERROR_INVALID_VALUE |
17361 | * |
17362 | * \sa ::cuTexRefSetAddress, |
17363 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17364 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17365 | * ::cuTexRefGetAddress, ::cuTexRefGetArray, |
17366 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17367 | */ |
17368 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); |
17369 | |
17370 | /** |
17371 | * \brief Gets the filter-mode used by a texture reference |
17372 | * |
17373 | * \deprecated |
17374 | * |
17375 | * Returns in \p *pfm the filtering mode of the texture reference |
17376 | * \p hTexRef. |
17377 | * |
17378 | * \param pfm - Returned filtering mode |
17379 | * \param hTexRef - Texture reference |
17380 | * |
17381 | * \return |
17382 | * ::CUDA_SUCCESS, |
17383 | * ::CUDA_ERROR_DEINITIALIZED, |
17384 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17385 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17386 | * ::CUDA_ERROR_INVALID_VALUE |
17387 | * |
17388 | * \sa ::cuTexRefSetAddress, |
17389 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17390 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17391 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17392 | * ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17393 | */ |
17394 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); |
17395 | |
17396 | /** |
17397 | * \brief Gets the format used by a texture reference |
17398 | * |
17399 | * \deprecated |
17400 | * |
17401 | * Returns in \p *pFormat and \p *pNumChannels the format and number |
17402 | * of components of the CUDA array bound to the texture reference \p hTexRef. |
17403 | * If \p pFormat or \p pNumChannels is NULL, it will be ignored. |
17404 | * |
17405 | * \param pFormat - Returned format |
17406 | * \param pNumChannels - Returned number of components |
17407 | * \param hTexRef - Texture reference |
17408 | * |
17409 | * \return |
17410 | * ::CUDA_SUCCESS, |
17411 | * ::CUDA_ERROR_DEINITIALIZED, |
17412 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17413 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17414 | * ::CUDA_ERROR_INVALID_VALUE |
17415 | * |
17416 | * \sa ::cuTexRefSetAddress, |
17417 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17418 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17419 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17420 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags |
17421 | */ |
17422 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); |
17423 | |
17424 | /** |
17425 | * \brief Gets the mipmap filtering mode for a texture reference |
17426 | * |
17427 | * \deprecated |
17428 | * |
17429 | * Returns the mipmap filtering mode in \p pfm that's used when reading memory through |
17430 | * the texture reference \p hTexRef. |
17431 | * |
17432 | * \param pfm - Returned mipmap filtering mode |
17433 | * \param hTexRef - Texture reference |
17434 | * |
17435 | * \return |
17436 | * ::CUDA_SUCCESS, |
17437 | * ::CUDA_ERROR_DEINITIALIZED, |
17438 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17439 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17440 | * ::CUDA_ERROR_INVALID_VALUE |
17441 | * |
17442 | * \sa ::cuTexRefSetAddress, |
17443 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17444 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17445 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17446 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17447 | */ |
17448 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); |
17449 | |
17450 | /** |
17451 | * \brief Gets the mipmap level bias for a texture reference |
17452 | * |
17453 | * \deprecated |
17454 | * |
17455 | * Returns the mipmap level bias in \p pbias that's added to the specified mipmap |
17456 | * level when reading memory through the texture reference \p hTexRef. |
17457 | * |
17458 | * \param pbias - Returned mipmap level bias |
17459 | * \param hTexRef - Texture reference |
17460 | * |
17461 | * \return |
17462 | * ::CUDA_SUCCESS, |
17463 | * ::CUDA_ERROR_DEINITIALIZED, |
17464 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17465 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17466 | * ::CUDA_ERROR_INVALID_VALUE |
17467 | * |
17468 | * \sa ::cuTexRefSetAddress, |
17469 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17470 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17471 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17472 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17473 | */ |
17474 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); |
17475 | |
17476 | /** |
17477 | * \brief Gets the min/max mipmap level clamps for a texture reference |
17478 | * |
17479 | * \deprecated |
17480 | * |
17481 | * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp |
17482 | * that are used when reading memory through the texture reference \p hTexRef. |
17483 | * |
17484 | * \param pminMipmapLevelClamp - Returned mipmap min level clamp |
17485 | * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp |
17486 | * \param hTexRef - Texture reference |
17487 | * |
17488 | * \return |
17489 | * ::CUDA_SUCCESS, |
17490 | * ::CUDA_ERROR_DEINITIALIZED, |
17491 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17492 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17493 | * ::CUDA_ERROR_INVALID_VALUE |
17494 | * |
17495 | * \sa ::cuTexRefSetAddress, |
17496 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17497 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17498 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17499 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17500 | */ |
17501 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); |
17502 | |
17503 | /** |
17504 | * \brief Gets the maximum anisotropy for a texture reference |
17505 | * |
17506 | * \deprecated |
17507 | * |
17508 | * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through |
17509 | * the texture reference \p hTexRef. |
17510 | * |
17511 | * \param pmaxAniso - Returned maximum anisotropy |
17512 | * \param hTexRef - Texture reference |
17513 | * |
17514 | * \return |
17515 | * ::CUDA_SUCCESS, |
17516 | * ::CUDA_ERROR_DEINITIALIZED, |
17517 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17518 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17519 | * ::CUDA_ERROR_INVALID_VALUE |
17520 | * |
17521 | * \sa ::cuTexRefSetAddress, |
17522 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17523 | * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17524 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17525 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
17526 | */ |
17527 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); |
17528 | |
17529 | /** |
17530 | * \brief Gets the border color used by a texture reference |
17531 | * |
17532 | * \deprecated |
17533 | * |
17534 | * Returns in \p pBorderColor, values of the RGBA color used by |
17535 | * the texture reference \p hTexRef. |
17536 | * The color value is of type float and holds color components in |
17537 | * the following sequence: |
17538 | * pBorderColor[0] holds 'R' component |
17539 | * pBorderColor[1] holds 'G' component |
17540 | * pBorderColor[2] holds 'B' component |
17541 | * pBorderColor[3] holds 'A' component |
17542 | * |
17543 | * \param hTexRef - Texture reference |
17544 | * \param pBorderColor - Returned Type and Value of RGBA color |
17545 | * |
17546 | * \return |
17547 | * ::CUDA_SUCCESS, |
17548 | * ::CUDA_ERROR_DEINITIALIZED, |
17549 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17550 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17551 | * ::CUDA_ERROR_INVALID_VALUE |
17552 | * |
17553 | * \sa ::cuTexRefSetAddressMode, |
17554 | * ::cuTexRefGetAddressMode, ::cuTexRefSetBorderColor |
17555 | */ |
17556 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); |
17557 | |
17558 | /** |
17559 | * \brief Gets the flags used by a texture reference |
17560 | * |
17561 | * \deprecated |
17562 | * |
17563 | * Returns in \p *pFlags the flags of the texture reference \p hTexRef. |
17564 | * |
17565 | * \param pFlags - Returned flags |
17566 | * \param hTexRef - Texture reference |
17567 | * |
17568 | * \return |
17569 | * ::CUDA_SUCCESS, |
17570 | * ::CUDA_ERROR_DEINITIALIZED, |
17571 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17572 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17573 | * ::CUDA_ERROR_INVALID_VALUE |
17574 | * |
17575 | * \sa ::cuTexRefSetAddress, |
17576 | * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
17577 | * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
17578 | * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
17579 | * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat |
17580 | */ |
17581 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); |
17582 | |
17583 | /** |
17584 | * \brief Creates a texture reference |
17585 | * |
17586 | * \deprecated |
17587 | * |
17588 | * Creates a texture reference and returns its handle in \p *pTexRef. Once |
17589 | * created, the application must call ::cuTexRefSetArray() or |
17590 | * ::cuTexRefSetAddress() to associate the reference with allocated memory. |
17591 | * Other texture reference functions are used to specify the format and |
17592 | * interpretation (addressing, filtering, etc.) to be used when the memory is |
17593 | * read through this texture reference. |
17594 | * |
17595 | * \param pTexRef - Returned texture reference |
17596 | * |
17597 | * \return |
17598 | * ::CUDA_SUCCESS, |
17599 | * ::CUDA_ERROR_DEINITIALIZED, |
17600 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17601 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17602 | * ::CUDA_ERROR_INVALID_VALUE |
17603 | * |
17604 | * \sa ::cuTexRefDestroy |
17605 | */ |
17606 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); |
17607 | |
17608 | /** |
17609 | * \brief Destroys a texture reference |
17610 | * |
17611 | * \deprecated |
17612 | * |
17613 | * Destroys the texture reference specified by \p hTexRef. |
17614 | * |
17615 | * \param hTexRef - Texture reference to destroy |
17616 | * |
17617 | * \return |
17618 | * ::CUDA_SUCCESS, |
17619 | * ::CUDA_ERROR_DEINITIALIZED, |
17620 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17621 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17622 | * ::CUDA_ERROR_INVALID_VALUE |
17623 | * |
17624 | * \sa ::cuTexRefCreate |
17625 | */ |
17626 | __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); |
17627 | |
17628 | /** @} */ /* END CUDA_TEXREF_DEPRECATED */ |
17629 | |
17630 | |
17631 | /** |
17632 | * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] |
17633 | * |
17634 | * ___MANBRIEF___ surface reference management functions of the low-level CUDA |
17635 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
17636 | * |
17637 | * This section describes the surface reference management functions of the |
17638 | * low-level CUDA driver application programming interface. |
17639 | * |
17640 | * @{ |
17641 | */ |
17642 | |
17643 | /** |
17644 | * \brief Sets the CUDA array for a surface reference. |
17645 | * |
17646 | * \deprecated |
17647 | * |
17648 | * Sets the CUDA array \p hArray to be read and written by the surface reference |
17649 | * \p hSurfRef. Any previous CUDA array state associated with the surface |
17650 | * reference is superseded by this function. \p Flags must be set to 0. |
17651 | * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. |
17652 | * Any CUDA array previously bound to \p hSurfRef is unbound. |
17653 | * |
17654 | * \param hSurfRef - Surface reference handle |
17655 | * \param hArray - CUDA array handle |
17656 | * \param Flags - set to 0 |
17657 | * |
17658 | * \return |
17659 | * ::CUDA_SUCCESS, |
17660 | * ::CUDA_ERROR_DEINITIALIZED, |
17661 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17662 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17663 | * ::CUDA_ERROR_INVALID_VALUE |
17664 | * |
17665 | * \sa |
17666 | * ::cuModuleGetSurfRef, |
17667 | * ::cuSurfRefGetArray, |
17668 | * ::cudaBindSurfaceToArray |
17669 | */ |
17670 | __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); |
17671 | |
17672 | /** |
17673 | * \brief Passes back the CUDA array bound to a surface reference. |
17674 | * |
17675 | * \deprecated |
17676 | * |
17677 | * Returns in \p *phArray the CUDA array bound to the surface reference |
17678 | * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference |
17679 | * is not bound to any CUDA array. |
17680 | * |
17681 | * \param phArray - Returned CUDA array handle |
17682 | * \param hSurfRef - Surface reference handle |
17683 | * |
17684 | * \return |
17685 | * ::CUDA_SUCCESS, |
17686 | * ::CUDA_ERROR_DEINITIALIZED, |
17687 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17688 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17689 | * ::CUDA_ERROR_INVALID_VALUE |
17690 | * |
17691 | * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray |
17692 | */ |
17693 | __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); |
17694 | |
17695 | /** @} */ /* END CUDA_SURFREF_DEPRECATED */ |
17696 | |
17697 | /** |
17698 | * \defgroup CUDA_TEXOBJECT Texture Object Management |
17699 | * |
17700 | * ___MANBRIEF___ texture object management functions of the low-level CUDA |
17701 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
17702 | * |
17703 | * This section describes the texture object management functions of the |
17704 | * low-level CUDA driver application programming interface. The texture |
17705 | * object API is only supported on devices of compute capability 3.0 or higher. |
17706 | * |
17707 | * @{ |
17708 | */ |
17709 | |
17710 | /** |
17711 | * \brief Creates a texture object |
17712 | * |
17713 | * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes |
17714 | * the data to texture from. \p pTexDesc describes how the data should be sampled. |
17715 | * \p pResViewDesc is an optional argument that specifies an alternate format for |
17716 | * the data described by \p pResDesc, and also describes the subresource region |
17717 | * to restrict access to when texturing. \p pResViewDesc can only be specified if |
17718 | * the type of resource is a CUDA array or a CUDA mipmapped array. |
17719 | * |
17720 | * Texture objects are only supported on devices of compute capability 3.0 or higher. |
17721 | * Additionally, a texture object is an opaque value, and, as such, should only be |
17722 | * accessed through CUDA API calls. |
17723 | * |
17724 | * The ::CUDA_RESOURCE_DESC structure is defined as: |
17725 | * \code |
17726 | typedef struct CUDA_RESOURCE_DESC_st |
17727 | { |
17728 | CUresourcetype resType; |
17729 | |
17730 | union { |
17731 | struct { |
17732 | CUarray hArray; |
17733 | } array; |
17734 | struct { |
17735 | CUmipmappedArray hMipmappedArray; |
17736 | } mipmap; |
17737 | struct { |
17738 | CUdeviceptr devPtr; |
17739 | CUarray_format format; |
17740 | unsigned int numChannels; |
17741 | size_t sizeInBytes; |
17742 | } linear; |
17743 | struct { |
17744 | CUdeviceptr devPtr; |
17745 | CUarray_format format; |
17746 | unsigned int numChannels; |
17747 | size_t width; |
17748 | size_t height; |
17749 | size_t pitchInBytes; |
17750 | } pitch2D; |
17751 | } res; |
17752 | |
17753 | unsigned int flags; |
17754 | } CUDA_RESOURCE_DESC; |
17755 | |
17756 | * \endcode |
17757 | * where: |
17758 | * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. |
17759 | * ::CUresourcetype is defined as: |
17760 | * \code |
17761 | typedef enum CUresourcetype_enum { |
17762 | CU_RESOURCE_TYPE_ARRAY = 0x00, |
17763 | CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, |
17764 | CU_RESOURCE_TYPE_LINEAR = 0x02, |
17765 | CU_RESOURCE_TYPE_PITCH2D = 0x03 |
17766 | } CUresourcetype; |
17767 | * \endcode |
17768 | * |
17769 | * \par |
17770 | * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray |
17771 | * must be set to a valid CUDA array handle. |
17772 | * |
17773 | * \par |
17774 | * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray |
17775 | * must be set to a valid CUDA mipmapped array handle. |
17776 | * |
17777 | * \par |
17778 | * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr |
17779 | * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. |
17780 | * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels |
17781 | * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes |
17782 | * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed |
17783 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). |
17784 | * |
17785 | * \par |
17786 | * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr |
17787 | * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. |
17788 | * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels |
17789 | * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width |
17790 | * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed |
17791 | * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. |
17792 | * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to |
17793 | * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. |
17794 | * |
17795 | * - ::CUDA_RESOURCE_DESC::flags must be set to zero. |
17796 | * |
17797 | * |
17798 | * The ::CUDA_TEXTURE_DESC struct is defined as |
17799 | * \code |
17800 | typedef struct CUDA_TEXTURE_DESC_st { |
17801 | CUaddress_mode addressMode[3]; |
17802 | CUfilter_mode filterMode; |
17803 | unsigned int flags; |
17804 | unsigned int maxAnisotropy; |
17805 | CUfilter_mode mipmapFilterMode; |
17806 | float mipmapLevelBias; |
17807 | float minMipmapLevelClamp; |
17808 | float maxMipmapLevelClamp; |
17809 | } CUDA_TEXTURE_DESC; |
17810 | * \endcode |
17811 | * where |
17812 | * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: |
17813 | * \code |
17814 | typedef enum CUaddress_mode_enum { |
17815 | CU_TR_ADDRESS_MODE_WRAP = 0, |
17816 | CU_TR_ADDRESS_MODE_CLAMP = 1, |
17817 | CU_TR_ADDRESS_MODE_MIRROR = 2, |
17818 | CU_TR_ADDRESS_MODE_BORDER = 3 |
17819 | } CUaddress_mode; |
17820 | * \endcode |
17821 | * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES |
17822 | * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. |
17823 | * |
17824 | * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. ::CUfilter_mode is defined as: |
17825 | * \code |
17826 | typedef enum CUfilter_mode_enum { |
17827 | CU_TR_FILTER_MODE_POINT = 0, |
17828 | CU_TR_FILTER_MODE_LINEAR = 1 |
17829 | } CUfilter_mode; |
17830 | * \endcode |
17831 | * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. |
17832 | * |
17833 | * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: |
17834 | * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of |
17835 | * having the texture promote integer data to floating point data in the |
17836 | * range [0, 1]. Note that texture with 32-bit integer format would not be |
17837 | * promoted, regardless of whether or not this flag is specified. |
17838 | * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior |
17839 | * of having the texture coordinates range from [0, Dim) where Dim is the |
17840 | * width or height of the CUDA array. Instead, the texture coordinates |
17841 | * [0, 1.0) reference the entire breadth of the array dimension; Note that |
17842 | * for CUDA mipmapped arrays, this flag has to be set. |
17843 | * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear |
17844 | * filtering optimizations. Trilinear optimizations improve texture filtering |
17845 | * performance by allowing bilinear filtering on textures in scenarios where |
17846 | * it can closely approximate the expected results. |
17847 | * |
17848 | * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be |
17849 | * clamped to the range [1,16]. |
17850 | * |
17851 | * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. |
17852 | * |
17853 | * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. |
17854 | * |
17855 | * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. |
17856 | * |
17857 | * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. |
17858 | * |
17859 | * |
17860 | * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as |
17861 | * \code |
17862 | typedef struct CUDA_RESOURCE_VIEW_DESC_st |
17863 | { |
17864 | CUresourceViewFormat format; |
17865 | size_t width; |
17866 | size_t height; |
17867 | size_t depth; |
17868 | unsigned int firstMipmapLevel; |
17869 | unsigned int lastMipmapLevel; |
17870 | unsigned int firstLayer; |
17871 | unsigned int lastLayer; |
17872 | } CUDA_RESOURCE_VIEW_DESC; |
17873 | * \endcode |
17874 | * where: |
17875 | * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should |
17876 | * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block |
17877 | * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32 |
17878 | * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have |
17879 | * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base |
17880 | * format but with 4 channels. |
17881 | * |
17882 | * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block |
17883 | * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, |
17884 | * this value has to be equal to that of the original resource. |
17885 | * |
17886 | * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block |
17887 | * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, |
17888 | * this value has to be equal to that of the original resource. |
17889 | * |
17890 | * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the |
17891 | * original resource. |
17892 | * |
17893 | * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. |
17894 | * For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp |
17895 | * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, |
17896 | * then the actual minimum mipmap level clamp will be 3.2. |
17897 | * |
17898 | * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value |
17899 | * has to be zero. |
17900 | * |
17901 | * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. |
17902 | * For non-layered resources, this value has to be zero. |
17903 | * |
17904 | * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, |
17905 | * this value has to be zero. |
17906 | * |
17907 | * |
17908 | * \param pTexObject - Texture object to create |
17909 | * \param pResDesc - Resource descriptor |
17910 | * \param pTexDesc - Texture descriptor |
17911 | * \param pResViewDesc - Resource view descriptor |
17912 | * |
17913 | * \return |
17914 | * ::CUDA_SUCCESS, |
17915 | * ::CUDA_ERROR_DEINITIALIZED, |
17916 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17917 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17918 | * ::CUDA_ERROR_INVALID_VALUE |
17919 | * |
17920 | * \sa |
17921 | * ::cuTexObjectDestroy, |
17922 | * ::cudaCreateTextureObject |
17923 | */ |
17924 | CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); |
17925 | |
17926 | /** |
17927 | * \brief Destroys a texture object |
17928 | * |
17929 | * Destroys the texture object specified by \p texObject. |
17930 | * |
17931 | * \param texObject - Texture object to destroy |
17932 | * |
17933 | * \return |
17934 | * ::CUDA_SUCCESS, |
17935 | * ::CUDA_ERROR_DEINITIALIZED, |
17936 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17937 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17938 | * ::CUDA_ERROR_INVALID_VALUE |
17939 | * |
17940 | * \sa |
17941 | * ::cuTexObjectCreate, |
17942 | * ::cudaDestroyTextureObject |
17943 | */ |
17944 | CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); |
17945 | |
17946 | /** |
17947 | * \brief Returns a texture object's resource descriptor |
17948 | * |
17949 | * Returns the resource descriptor for the texture object specified by \p texObject. |
17950 | * |
17951 | * \param pResDesc - Resource descriptor |
17952 | * \param texObject - Texture object |
17953 | * |
17954 | * \return |
17955 | * ::CUDA_SUCCESS, |
17956 | * ::CUDA_ERROR_DEINITIALIZED, |
17957 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17958 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17959 | * ::CUDA_ERROR_INVALID_VALUE |
17960 | * |
17961 | * \sa |
17962 | * ::cuTexObjectCreate, |
17963 | * ::cudaGetTextureObjectResourceDesc |
17964 | */ |
17965 | CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); |
17966 | |
17967 | /** |
17968 | * \brief Returns a texture object's texture descriptor |
17969 | * |
17970 | * Returns the texture descriptor for the texture object specified by \p texObject. |
17971 | * |
17972 | * \param pTexDesc - Texture descriptor |
17973 | * \param texObject - Texture object |
17974 | * |
17975 | * \return |
17976 | * ::CUDA_SUCCESS, |
17977 | * ::CUDA_ERROR_DEINITIALIZED, |
17978 | * ::CUDA_ERROR_NOT_INITIALIZED, |
17979 | * ::CUDA_ERROR_INVALID_CONTEXT, |
17980 | * ::CUDA_ERROR_INVALID_VALUE |
17981 | * |
17982 | * \sa |
17983 | * ::cuTexObjectCreate, |
17984 | * ::cudaGetTextureObjectTextureDesc |
17985 | */ |
17986 | CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); |
17987 | |
17988 | /** |
17989 | * \brief Returns a texture object's resource view descriptor |
17990 | * |
17991 | * Returns the resource view descriptor for the texture object specified by \p texObject. |
17992 | * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned. |
17993 | * |
17994 | * \param pResViewDesc - Resource view descriptor |
17995 | * \param texObject - Texture object |
17996 | * |
17997 | * \return |
17998 | * ::CUDA_SUCCESS, |
17999 | * ::CUDA_ERROR_DEINITIALIZED, |
18000 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18001 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18002 | * ::CUDA_ERROR_INVALID_VALUE |
18003 | * |
18004 | * \sa |
18005 | * ::cuTexObjectCreate, |
18006 | * ::cudaGetTextureObjectResourceViewDesc |
18007 | */ |
18008 | CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); |
18009 | |
18010 | /** @} */ /* END CUDA_TEXOBJECT */ |
18011 | |
18012 | /** |
18013 | * \defgroup CUDA_SURFOBJECT Surface Object Management |
18014 | * |
18015 | * ___MANBRIEF___ surface object management functions of the low-level CUDA |
18016 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
18017 | * |
18018 | * This section describes the surface object management functions of the |
18019 | * low-level CUDA driver application programming interface. The surface |
18020 | * object API is only supported on devices of compute capability 3.0 or higher. |
18021 | * |
18022 | * @{ |
18023 | */ |
18024 | |
18025 | /** |
18026 | * \brief Creates a surface object |
18027 | * |
18028 | * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes |
18029 | * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be |
18030 | * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray |
18031 | * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. |
18032 | * |
18033 | * Surface objects are only supported on devices of compute capability 3.0 or higher. |
18034 | * Additionally, a surface object is an opaque value, and, as such, should only be |
18035 | * accessed through CUDA API calls. |
18036 | * |
18037 | * \param pSurfObject - Surface object to create |
18038 | * \param pResDesc - Resource descriptor |
18039 | * |
18040 | * \return |
18041 | * ::CUDA_SUCCESS, |
18042 | * ::CUDA_ERROR_DEINITIALIZED, |
18043 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18044 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18045 | * ::CUDA_ERROR_INVALID_VALUE |
18046 | * |
18047 | * \sa |
18048 | * ::cuSurfObjectDestroy, |
18049 | * ::cudaCreateSurfaceObject |
18050 | */ |
18051 | CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); |
18052 | |
18053 | /** |
18054 | * \brief Destroys a surface object |
18055 | * |
18056 | * Destroys the surface object specified by \p surfObject. |
18057 | * |
18058 | * \param surfObject - Surface object to destroy |
18059 | * |
18060 | * \return |
18061 | * ::CUDA_SUCCESS, |
18062 | * ::CUDA_ERROR_DEINITIALIZED, |
18063 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18064 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18065 | * ::CUDA_ERROR_INVALID_VALUE |
18066 | * |
18067 | * \sa |
18068 | * ::cuSurfObjectCreate, |
18069 | * ::cudaDestroySurfaceObject |
18070 | */ |
18071 | CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); |
18072 | |
18073 | /** |
18074 | * \brief Returns a surface object's resource descriptor |
18075 | * |
18076 | * Returns the resource descriptor for the surface object specified by \p surfObject. |
18077 | * |
18078 | * \param pResDesc - Resource descriptor |
18079 | * \param surfObject - Surface object |
18080 | * |
18081 | * \return |
18082 | * ::CUDA_SUCCESS, |
18083 | * ::CUDA_ERROR_DEINITIALIZED, |
18084 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18085 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18086 | * ::CUDA_ERROR_INVALID_VALUE |
18087 | * |
18088 | * \sa |
18089 | * ::cuSurfObjectCreate, |
18090 | * ::cudaGetSurfaceObjectResourceDesc |
18091 | */ |
18092 | CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); |
18093 | |
18094 | /** @} */ /* END CUDA_SURFOBJECT */ |
18095 | |
18096 | /** |
18097 | * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access |
18098 | * |
18099 | * ___MANBRIEF___ direct peer context memory access functions of the low-level |
18100 | * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
18101 | * |
18102 | * This section describes the direct peer context memory access functions |
18103 | * of the low-level CUDA driver application programming interface. |
18104 | * |
18105 | * @{ |
18106 | */ |
18107 | |
18108 | /** |
18109 | * \brief Queries if a device may directly access a peer device's memory. |
18110 | * |
18111 | * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of |
18112 | * directly accessing memory from contexts on \p peerDev and 0 otherwise. |
18113 | * If direct access of \p peerDev from \p dev is possible, then access may be |
18114 | * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). |
18115 | * |
18116 | * \param canAccessPeer - Returned access capability |
18117 | * \param dev - Device from which allocations on \p peerDev are to |
18118 | * be directly accessed. |
18119 | * \param peerDev - Device on which the allocations to be directly accessed |
18120 | * by \p dev reside. |
18121 | * |
18122 | * \return |
18123 | * ::CUDA_SUCCESS, |
18124 | * ::CUDA_ERROR_DEINITIALIZED, |
18125 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18126 | * ::CUDA_ERROR_INVALID_DEVICE |
18127 | * \notefnerr |
18128 | * |
18129 | * \sa |
18130 | * ::cuCtxEnablePeerAccess, |
18131 | * ::cuCtxDisablePeerAccess, |
18132 | * ::cudaDeviceCanAccessPeer |
18133 | */ |
18134 | CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); |
18135 | |
18136 | /** |
18137 | * \brief Enables direct access to memory allocations in a peer context. |
18138 | * |
18139 | * If both the current context and \p peerContext are on devices which support unified |
18140 | * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same |
18141 | * major compute capability, then on success all allocations from \p peerContext will |
18142 | * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional |
18143 | * details. |
18144 | * |
18145 | * Note that access granted by this call is unidirectional and that in order to access |
18146 | * memory from the current context in \p peerContext, a separate symmetric call |
18147 | * to ::cuCtxEnablePeerAccess() is required. |
18148 | * |
18149 | * Note that there are both device-wide and system-wide limitations per system |
18150 | * configuration, as noted in the CUDA Programming Guide under the section |
18151 | * "Peer-to-Peer Memory Access". |
18152 | * |
18153 | * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates |
18154 | * that the ::CUdevice of the current context cannot directly access memory |
18155 | * from the ::CUdevice of \p peerContext. |
18156 | * |
18157 | * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of |
18158 | * \p peerContext from the current context has already been enabled. |
18159 | * |
18160 | * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible |
18161 | * because hardware resources required for peer access have been exhausted. |
18162 | * |
18163 | * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext |
18164 | * is not a valid context, or if the current context is \p peerContext. |
18165 | * |
18166 | * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. |
18167 | * |
18168 | * \param peerContext - Peer context to enable direct access to from the current context |
18169 | * \param Flags - Reserved for future use and must be set to 0 |
18170 | * |
18171 | * \return |
18172 | * ::CUDA_SUCCESS, |
18173 | * ::CUDA_ERROR_DEINITIALIZED, |
18174 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18175 | * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, |
18176 | * ::CUDA_ERROR_TOO_MANY_PEERS, |
18177 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18178 | * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, |
18179 | * ::CUDA_ERROR_INVALID_VALUE |
18180 | * \notefnerr |
18181 | * |
18182 | * \sa |
18183 | * ::cuDeviceCanAccessPeer, |
18184 | * ::cuCtxDisablePeerAccess, |
18185 | * ::cudaDeviceEnablePeerAccess |
18186 | */ |
18187 | CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); |
18188 | |
18189 | /** |
18190 | * \brief Disables direct access to memory allocations in a peer context and |
18191 | * unregisters any registered allocations. |
18192 | * |
18193 | * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has |
18194 | * not yet been enabled from \p peerContext to the current context. |
18195 | * |
18196 | * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if |
18197 | * \p peerContext is not a valid context. |
18198 | * |
18199 | * \param peerContext - Peer context to disable direct access to |
18200 | * |
18201 | * \return |
18202 | * ::CUDA_SUCCESS, |
18203 | * ::CUDA_ERROR_DEINITIALIZED, |
18204 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18205 | * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, |
18206 | * ::CUDA_ERROR_INVALID_CONTEXT |
18207 | * \notefnerr |
18208 | * |
18209 | * \sa |
18210 | * ::cuDeviceCanAccessPeer, |
18211 | * ::cuCtxEnablePeerAccess, |
18212 | * ::cudaDeviceDisablePeerAccess |
18213 | */ |
18214 | CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); |
18215 | |
18216 | /** |
18217 | * \brief Queries attributes of the link between two devices. |
18218 | * |
18219 | * Returns in \p *value the value of the requested attribute \p attrib of the |
18220 | * link between \p srcDevice and \p dstDevice. The supported attributes are: |
18221 | * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the |
18222 | * performance of the link between two devices. |
18223 | * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled. |
18224 | * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over |
18225 | * the link are supported. |
18226 | * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can |
18227 | * be accessed over the link. |
18228 | * |
18229 | * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid |
18230 | * or if they represent the same device. |
18231 | * |
18232 | * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is |
18233 | * a null pointer. |
18234 | * |
18235 | * \param value - Returned value of the requested attribute |
18236 | * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. |
18237 | * \param srcDevice - The source device of the target link. |
18238 | * \param dstDevice - The destination device of the target link. |
18239 | * |
18240 | * \return |
18241 | * ::CUDA_SUCCESS, |
18242 | * ::CUDA_ERROR_DEINITIALIZED, |
18243 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18244 | * ::CUDA_ERROR_INVALID_DEVICE, |
18245 | * ::CUDA_ERROR_INVALID_VALUE |
18246 | * \notefnerr |
18247 | * |
18248 | * \sa |
18249 | * ::cuCtxEnablePeerAccess, |
18250 | * ::cuCtxDisablePeerAccess, |
18251 | * ::cuDeviceCanAccessPeer, |
18252 | * ::cudaDeviceGetP2PAttribute |
18253 | */ |
18254 | CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); |
18255 | |
18256 | /** @} */ /* END CUDA_PEER_ACCESS */ |
18257 | |
18258 | /** |
18259 | * \defgroup CUDA_GRAPHICS Graphics Interoperability |
18260 | * |
18261 | * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA |
18262 | * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
18263 | * |
18264 | * This section describes the graphics interoperability functions of the |
18265 | * low-level CUDA driver application programming interface. |
18266 | * |
18267 | * @{ |
18268 | */ |
18269 | |
18270 | /** |
18271 | * \brief Unregisters a graphics resource for access by CUDA |
18272 | * |
18273 | * Unregisters the graphics resource \p resource so it is not accessible by |
18274 | * CUDA unless registered again. |
18275 | * |
18276 | * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is |
18277 | * returned. |
18278 | * |
18279 | * \param resource - Resource to unregister |
18280 | * |
18281 | * \return |
18282 | * ::CUDA_SUCCESS, |
18283 | * ::CUDA_ERROR_DEINITIALIZED, |
18284 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18285 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18286 | * ::CUDA_ERROR_INVALID_HANDLE, |
18287 | * ::CUDA_ERROR_UNKNOWN |
18288 | * \notefnerr |
18289 | * |
18290 | * \sa |
18291 | * ::cuGraphicsD3D9RegisterResource, |
18292 | * ::cuGraphicsD3D10RegisterResource, |
18293 | * ::cuGraphicsD3D11RegisterResource, |
18294 | * ::cuGraphicsGLRegisterBuffer, |
18295 | * ::cuGraphicsGLRegisterImage, |
18296 | * ::cudaGraphicsUnregisterResource |
18297 | */ |
18298 | CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); |
18299 | |
18300 | /** |
18301 | * \brief Get an array through which to access a subresource of a mapped graphics resource. |
18302 | * |
18303 | * Returns in \p *pArray an array through which the subresource of the mapped |
18304 | * graphics resource \p resource which corresponds to array index \p arrayIndex |
18305 | * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may |
18306 | * change every time that \p resource is mapped. |
18307 | * |
18308 | * If \p resource is not a texture then it cannot be accessed via an array and |
18309 | * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. |
18310 | * If \p arrayIndex is not a valid array index for \p resource then |
18311 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
18312 | * If \p mipLevel is not a valid mipmap level for \p resource then |
18313 | * ::CUDA_ERROR_INVALID_VALUE is returned. |
18314 | * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
18315 | * |
18316 | * \param pArray - Returned array through which a subresource of \p resource may be accessed |
18317 | * \param resource - Mapped resource to access |
18318 | * \param arrayIndex - Array index for array textures or cubemap face |
18319 | * index as defined by ::CUarray_cubemap_face for |
18320 | * cubemap textures for the subresource to access |
18321 | * \param mipLevel - Mipmap level for the subresource to access |
18322 | * |
18323 | * \return |
18324 | * ::CUDA_SUCCESS, |
18325 | * ::CUDA_ERROR_DEINITIALIZED, |
18326 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18327 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18328 | * ::CUDA_ERROR_INVALID_VALUE, |
18329 | * ::CUDA_ERROR_INVALID_HANDLE, |
18330 | * ::CUDA_ERROR_NOT_MAPPED, |
18331 | * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY |
18332 | * \notefnerr |
18333 | * |
18334 | * \sa |
18335 | * ::cuGraphicsResourceGetMappedPointer, |
18336 | * ::cudaGraphicsSubResourceGetMappedArray |
18337 | */ |
18338 | CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); |
18339 | |
18340 | /** |
18341 | * \brief Get a mipmapped array through which to access a mapped graphics resource. |
18342 | * |
18343 | * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics |
18344 | * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time |
18345 | * that \p resource is mapped. |
18346 | * |
18347 | * If \p resource is not a texture then it cannot be accessed via a mipmapped array and |
18348 | * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. |
18349 | * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
18350 | * |
18351 | * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed |
18352 | * \param resource - Mapped resource to access |
18353 | * |
18354 | * \return |
18355 | * ::CUDA_SUCCESS, |
18356 | * ::CUDA_ERROR_DEINITIALIZED, |
18357 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18358 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18359 | * ::CUDA_ERROR_INVALID_VALUE, |
18360 | * ::CUDA_ERROR_INVALID_HANDLE, |
18361 | * ::CUDA_ERROR_NOT_MAPPED, |
18362 | * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY |
18363 | * \notefnerr |
18364 | * |
18365 | * \sa |
18366 | * ::cuGraphicsResourceGetMappedPointer, |
18367 | * ::cudaGraphicsResourceGetMappedMipmappedArray |
18368 | */ |
18369 | CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); |
18370 | |
18371 | /** |
18372 | * \brief Get a device pointer through which to access a mapped graphics resource. |
18373 | * |
18374 | * Returns in \p *pDevPtr a pointer through which the mapped graphics resource |
18375 | * \p resource may be accessed. |
18376 | * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer. |
18377 | * The value set in \p *pDevPtr may change every time that \p resource is mapped. |
18378 | * |
18379 | * If \p resource is not a buffer then it cannot be accessed via a pointer and |
18380 | * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. |
18381 | * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
18382 | * |
18383 | * \param pDevPtr - Returned pointer through which \p resource may be accessed |
18384 | * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr |
18385 | * \param resource - Mapped resource to access |
18386 | * |
18387 | * \return |
18388 | * ::CUDA_SUCCESS, |
18389 | * ::CUDA_ERROR_DEINITIALIZED, |
18390 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18391 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18392 | * ::CUDA_ERROR_INVALID_VALUE, |
18393 | * ::CUDA_ERROR_INVALID_HANDLE, |
18394 | * ::CUDA_ERROR_NOT_MAPPED, |
18395 | * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER |
18396 | * \notefnerr |
18397 | * |
18398 | * \sa |
18399 | * ::cuGraphicsMapResources, |
18400 | * ::cuGraphicsSubResourceGetMappedArray, |
18401 | * ::cudaGraphicsResourceGetMappedPointer |
18402 | */ |
18403 | CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); |
18404 | |
18405 | /** |
18406 | * \brief Set usage flags for mapping a graphics resource |
18407 | * |
18408 | * Set \p flags for mapping the graphics resource \p resource. |
18409 | * |
18410 | * Changes to \p flags will take effect the next time \p resource is mapped. |
18411 | * The \p flags argument may be any of the following: |
18412 | * |
18413 | * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this |
18414 | * resource will be used. It is therefore assumed that this resource will be |
18415 | * read from and written to by CUDA kernels. This is the default value. |
18416 | * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which |
18417 | * access this resource will not write to this resource. |
18418 | * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels |
18419 | * which access this resource will not read from this resource and will |
18420 | * write over the entire contents of the resource, so none of the data |
18421 | * previously stored in the resource will be preserved. |
18422 | * |
18423 | * If \p resource is presently mapped for access by CUDA then |
18424 | * ::CUDA_ERROR_ALREADY_MAPPED is returned. |
18425 | * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. |
18426 | * |
18427 | * \param resource - Registered resource to set flags for |
18428 | * \param flags - Parameters for resource mapping |
18429 | * |
18430 | * \return |
18431 | * ::CUDA_SUCCESS, |
18432 | * ::CUDA_ERROR_DEINITIALIZED, |
18433 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18434 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18435 | * ::CUDA_ERROR_INVALID_VALUE, |
18436 | * ::CUDA_ERROR_INVALID_HANDLE, |
18437 | * ::CUDA_ERROR_ALREADY_MAPPED |
18438 | * \notefnerr |
18439 | * |
18440 | * \sa |
18441 | * ::cuGraphicsMapResources, |
18442 | * ::cudaGraphicsResourceSetMapFlags |
18443 | */ |
18444 | CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); |
18445 | |
18446 | /** |
18447 | * \brief Map graphics resources for access by CUDA |
18448 | * |
18449 | * Maps the \p count graphics resources in \p resources for access by CUDA. |
18450 | * |
18451 | * The resources in \p resources may be accessed by CUDA until they |
18452 | * are unmapped. The graphics API from which \p resources were registered |
18453 | * should not access any resources while they are mapped by CUDA. If an |
18454 | * application does so, the results are undefined. |
18455 | * |
18456 | * This function provides the synchronization guarantee that any graphics calls |
18457 | * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA |
18458 | * work issued in \p hStream begins. |
18459 | * |
18460 | * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. |
18461 | * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. |
18462 | * |
18463 | * \param count - Number of resources to map |
18464 | * \param resources - Resources to map for CUDA usage |
18465 | * \param hStream - Stream with which to synchronize |
18466 | * |
18467 | * \return |
18468 | * ::CUDA_SUCCESS, |
18469 | * ::CUDA_ERROR_DEINITIALIZED, |
18470 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18471 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18472 | * ::CUDA_ERROR_INVALID_HANDLE, |
18473 | * ::CUDA_ERROR_ALREADY_MAPPED, |
18474 | * ::CUDA_ERROR_UNKNOWN |
18475 | * \note_null_stream |
18476 | * \notefnerr |
18477 | * |
18478 | * \sa |
18479 | * ::cuGraphicsResourceGetMappedPointer, |
18480 | * ::cuGraphicsSubResourceGetMappedArray, |
18481 | * ::cuGraphicsUnmapResources, |
18482 | * ::cudaGraphicsMapResources |
18483 | */ |
18484 | CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
18485 | |
18486 | /** |
18487 | * \brief Unmap graphics resources. |
18488 | * |
18489 | * Unmaps the \p count graphics resources in \p resources. |
18490 | * |
18491 | * Once unmapped, the resources in \p resources may not be accessed by CUDA |
18492 | * until they are mapped again. |
18493 | * |
18494 | * This function provides the synchronization guarantee that any CUDA work issued |
18495 | * in \p hStream before ::cuGraphicsUnmapResources() will complete before any |
18496 | * subsequently issued graphics work begins. |
18497 | * |
18498 | * |
18499 | * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. |
18500 | * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. |
18501 | * |
18502 | * \param count - Number of resources to unmap |
18503 | * \param resources - Resources to unmap |
18504 | * \param hStream - Stream with which to synchronize |
18505 | * |
18506 | * \return |
18507 | * ::CUDA_SUCCESS, |
18508 | * ::CUDA_ERROR_DEINITIALIZED, |
18509 | * ::CUDA_ERROR_NOT_INITIALIZED, |
18510 | * ::CUDA_ERROR_INVALID_CONTEXT, |
18511 | * ::CUDA_ERROR_INVALID_HANDLE, |
18512 | * ::CUDA_ERROR_NOT_MAPPED, |
18513 | * ::CUDA_ERROR_UNKNOWN |
18514 | * \note_null_stream |
18515 | * \notefnerr |
18516 | * |
18517 | * \sa |
18518 | * ::cuGraphicsMapResources, |
18519 | * ::cudaGraphicsUnmapResources |
18520 | */ |
18521 | CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
18522 | |
18523 | /** @} */ /* END CUDA_GRAPHICS */ |
18524 | |
18525 | /** |
18526 | * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access |
18527 | * |
18528 | * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API |
18529 | * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
18530 | * |
18531 | * This section describes the driver entry point access functions of the low-level CUDA |
18532 | * driver application programming interface. |
18533 | * |
18534 | * @{ |
18535 | */ |
18536 | |
18537 | /** |
18538 | * \brief Returns the requested driver API function pointer |
18539 | * |
18540 | * Returns in \p *pfn the address of the CUDA driver function for the requested |
18541 | * CUDA version and flags. |
18542 | * |
18543 | * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 |
18544 | * should be specified as 11020. For a requested driver symbol, if the specified |
18545 | * CUDA version is greater than or equal to the CUDA version in which the driver symbol |
18546 | * was introduced, this API will return the function pointer to the corresponding |
18547 | * versioned function. |
18548 | * |
18549 | * The pointer returned by the API should be cast to a function pointer matching the |
18550 | * requested driver function's definition in the API header file. The function pointer |
18551 | * typedef can be picked up from the corresponding typedefs header file. For example, |
18552 | * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. |
18553 | * |
18554 | * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not |
18555 | * supported on the platform, no ABI compatible driver function exists for the specified |
18556 | * \p cudaVersion or if the driver symbol is invalid. |
18557 | * |
18558 | * The requested flags can be: |
18559 | * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to |
18560 | * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with |
18561 | * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM |
18562 | * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. |
18563 | * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols |
18564 | * that match the requested driver symbol name except the corresponding per-thread versions. |
18565 | * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all |
18566 | * driver symbols that match the requested driver symbol name including the per-thread |
18567 | * versions. If a per-thread version is not found, the API will return the legacy version |
18568 | * of the driver function. |
18569 | * |
18570 | * \param symbol - The base name of the driver API function to look for. As an example, |
18571 | * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and |
18572 | * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. |
18573 | * \param pfn - Location to return the function pointer to the requested driver function |
18574 | * \param cudaVersion - The CUDA version to look for the requested driver symbol |
18575 | * \param flags - Flags to specify search options. |
18576 | * |
18577 | * \return |
18578 | * ::CUDA_SUCCESS, |
18579 | * ::CUDA_ERROR_INVALID_VALUE, |
18580 | * ::CUDA_ERROR_NOT_SUPPORTED, |
18581 | * ::CUDA_ERROR_NOT_FOUND |
18582 | * \note_version_mixing |
18583 | * |
18584 | * \sa |
18585 | * ::cudaGetDriverEntryPoint |
18586 | */ |
18587 | CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); |
18588 | |
18589 | /** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ |
18590 | |
18591 | CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); /* NOTE(review): declared outside every \defgroup and carries no Doxygen block in this header; its contract is not visible here */ |
18592 | |
18593 | /** |
18594 | * CUDA API versioning support |
18595 | */ |
18596 | #if defined(__CUDA_API_VERSION_INTERNAL) |
18597 | #undef cuMemHostRegister |
18598 | #undef cuGraphicsResourceSetMapFlags |
18599 | #undef cuLinkCreate |
18600 | #undef cuLinkAddData |
18601 | #undef cuLinkAddFile |
18602 | #undef cuDeviceTotalMem |
18603 | #undef cuCtxCreate |
18604 | #undef cuModuleGetGlobal |
18605 | #undef cuMemGetInfo |
18606 | #undef cuMemAlloc |
18607 | #undef cuMemAllocPitch |
18608 | #undef cuMemFree |
18609 | #undef cuMemGetAddressRange |
18610 | #undef cuMemAllocHost |
18611 | #undef cuMemHostGetDevicePointer |
18612 | #undef cuMemcpyHtoD |
18613 | #undef cuMemcpyDtoH |
18614 | #undef cuMemcpyDtoD |
18615 | #undef cuMemcpyDtoA |
18616 | #undef cuMemcpyAtoD |
18617 | #undef cuMemcpyHtoA |
18618 | #undef cuMemcpyAtoH |
18619 | #undef cuMemcpyAtoA |
18620 | #undef cuMemcpyHtoAAsync |
18621 | #undef cuMemcpyAtoHAsync |
18622 | #undef cuMemcpy2D |
18623 | #undef cuMemcpy2DUnaligned |
18624 | #undef cuMemcpy3D |
18625 | #undef cuMemcpyHtoDAsync |
18626 | #undef cuMemcpyDtoHAsync |
18627 | #undef cuMemcpyDtoDAsync |
18628 | #undef cuMemcpy2DAsync |
18629 | #undef cuMemcpy3DAsync |
18630 | #undef cuMemsetD8 |
18631 | #undef cuMemsetD16 |
18632 | #undef cuMemsetD32 |
18633 | #undef cuMemsetD2D8 |
18634 | #undef cuMemsetD2D16 |
18635 | #undef cuMemsetD2D32 |
18636 | #undef cuArrayCreate |
18637 | #undef cuArrayGetDescriptor |
18638 | #undef cuArray3DCreate |
18639 | #undef cuArray3DGetDescriptor |
18640 | #undef cuTexRefSetAddress |
18641 | #undef cuTexRefSetAddress2D |
18642 | #undef cuTexRefGetAddress |
18643 | #undef cuGraphicsResourceGetMappedPointer |
18644 | #undef cuCtxDestroy |
18645 | #undef cuCtxPopCurrent |
18646 | #undef cuCtxPushCurrent |
18647 | #undef cuStreamDestroy |
18648 | #undef cuEventDestroy |
18649 | #undef cuMemcpy |
18650 | #undef cuMemcpyAsync |
18651 | #undef cuMemcpyPeer |
18652 | #undef cuMemcpyPeerAsync |
18653 | #undef cuMemcpy3DPeer |
18654 | #undef cuMemcpy3DPeerAsync |
18655 | #undef cuMemsetD8Async |
18656 | #undef cuMemsetD16Async |
18657 | #undef cuMemsetD32Async |
18658 | #undef cuMemsetD2D8Async |
18659 | #undef cuMemsetD2D16Async |
18660 | #undef cuMemsetD2D32Async |
18661 | #undef cuStreamGetPriority |
18662 | #undef cuStreamGetFlags |
18663 | #undef cuStreamGetCtx |
18664 | #undef cuStreamWaitEvent |
18665 | #undef cuStreamAddCallback |
18666 | #undef cuStreamAttachMemAsync |
18667 | #undef cuStreamQuery |
18668 | #undef cuStreamSynchronize |
18669 | #undef cuEventRecord |
18670 | #undef cuEventRecordWithFlags |
18671 | #undef cuLaunchKernel |
18672 | #undef cuLaunchHostFunc |
18673 | #undef cuGraphicsMapResources |
18674 | #undef cuGraphicsUnmapResources |
18675 | #undef cuStreamWriteValue32 |
18676 | #undef cuStreamWaitValue32 |
18677 | #undef cuStreamWriteValue64 |
18678 | #undef cuStreamWaitValue64 |
18679 | #undef cuStreamBatchMemOp |
18680 | #undef cuMemPrefetchAsync |
18681 | #undef cuLaunchCooperativeKernel |
18682 | #undef cuSignalExternalSemaphoresAsync |
18683 | #undef cuWaitExternalSemaphoresAsync |
18684 | #undef cuStreamBeginCapture |
18685 | #undef cuStreamEndCapture |
18686 | #undef cuStreamIsCapturing |
18687 | #undef cuStreamGetCaptureInfo |
18688 | #undef cuStreamGetCaptureInfo_v2 |
18689 | #undef cuGraphUpload |
18690 | #undef cuGraphLaunch |
18691 | #undef cuDevicePrimaryCtxRelease |
18692 | #undef cuDevicePrimaryCtxReset |
18693 | #undef cuDevicePrimaryCtxSetFlags |
18694 | #undef cuIpcOpenMemHandle |
18695 | #undef cuStreamCopyAttributes |
18696 | #undef cuStreamSetAttribute |
18697 | #undef cuStreamGetAttribute |
18698 | #undef cuGraphInstantiate |
18699 | #undef cuMemMapArrayAsync |
18700 | #undef cuMemFreeAsync |
18701 | #undef cuMemAllocAsync |
18702 | #undef cuMemAllocFromPoolAsync |
18703 | #undef cuStreamUpdateCaptureDependencies |
18704 | |
18705 | CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); /* First of the declarations made under the plain names listed in the #undef run above; this region is only compiled when __CUDA_API_VERSION_INTERNAL is defined */ |
18706 | CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); |
18707 | CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); |
18708 | CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, |
18709 | unsigned int numOptions, CUjit_option *options, void **optionValues); |
18710 | CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, |
18711 | unsigned int numOptions, CUjit_option *options, void **optionValues); |
18712 | CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); /* Explicitly _v2-suffixed name; unlike the declarations above it is not in the #undef run */ |
18713 | |
18714 | typedef unsigned int CUdeviceptr_v1; /**< Legacy (v1) device pointer type, held in an unsigned int; used by the v1 structures and unversioned declarations that follow */ |
18715 | |
18716 | typedef struct CUDA_MEMCPY2D_v1_st /* Legacy (v1) 2D memory copy descriptor: offsets, pitches and extents are unsigned int, device pointers are CUdeviceptr_v1 */ |
18717 | { |
18718 | unsigned int srcXInBytes; /**< Source X in bytes */ |
18719 | unsigned int srcY; /**< Source Y */ |
18720 | CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
18721 | const void *srcHost; /**< Source host pointer */ |
18722 | CUdeviceptr_v1 srcDevice; /**< Source device pointer */ |
18723 | CUarray srcArray; /**< Source array reference */ |
18724 | unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ |
18725 | |
18726 | unsigned int dstXInBytes; /**< Destination X in bytes */ |
18727 | unsigned int dstY; /**< Destination Y */ |
18728 | CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
18729 | void *dstHost; /**< Destination host pointer */ |
18730 | CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ |
18731 | CUarray dstArray; /**< Destination array reference */ |
18732 | unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ |
18733 | |
18734 | unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ |
18735 | unsigned int Height; /**< Height of 2D memory copy */ |
18736 | } CUDA_MEMCPY2D_v1; /* Parameter type of the unversioned cuMemcpy2D, cuMemcpy2DUnaligned and cuMemcpy2DAsync declarations below */ |
18737 | |
18738 | typedef struct CUDA_MEMCPY3D_v1_st /* Legacy (v1) 3D memory copy descriptor: offsets, pitches and extents are unsigned int, device pointers are CUdeviceptr_v1 */ |
18739 | { |
18740 | unsigned int srcXInBytes; /**< Source X in bytes */ |
18741 | unsigned int srcY; /**< Source Y */ |
18742 | unsigned int srcZ; /**< Source Z */ |
18743 | unsigned int srcLOD; /**< Source LOD */ |
18744 | CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
18745 | const void *srcHost; /**< Source host pointer */ |
18746 | CUdeviceptr_v1 srcDevice; /**< Source device pointer */ |
18747 | CUarray srcArray; /**< Source array reference */ |
18748 | void *reserved0; /**< Must be NULL */ |
18749 | unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ |
18750 | unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
18751 | |
18752 | unsigned int dstXInBytes; /**< Destination X in bytes */ |
18753 | unsigned int dstY; /**< Destination Y */ |
18754 | unsigned int dstZ; /**< Destination Z */ |
18755 | unsigned int dstLOD; /**< Destination LOD */ |
18756 | CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
18757 | void *dstHost; /**< Destination host pointer */ |
18758 | CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ |
18759 | CUarray dstArray; /**< Destination array reference */ |
18760 | void *reserved1; /**< Must be NULL */ |
18761 | unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ |
18762 | unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
18763 | |
18764 | unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ |
18765 | unsigned int Height; /**< Height of 3D memory copy */ |
18766 | unsigned int Depth; /**< Depth of 3D memory copy */ |
18767 | } CUDA_MEMCPY3D_v1; /* Parameter type of the unversioned cuMemcpy3D and cuMemcpy3DAsync declarations below */ |
18768 | |
18769 | typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st /* Legacy (v1) array descriptor with unsigned int dimensions */ |
18770 | { |
18771 | unsigned int Width; /**< Width of array */ |
18772 | unsigned int Height; /**< Height of array */ |
18773 | |
18774 | CUarray_format Format; /**< Array format */ |
18775 | unsigned int NumChannels; /**< Channels per array element */ |
18776 | } CUDA_ARRAY_DESCRIPTOR_v1; /* Used by the unversioned cuArrayCreate, cuArrayGetDescriptor and cuTexRefSetAddress2D declarations below */ |
18777 | |
18778 | typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st /* Legacy (v1) 3D array descriptor with unsigned int dimensions */ |
18779 | { |
18780 | unsigned int Width; /**< Width of 3D array */ |
18781 | unsigned int Height; /**< Height of 3D array */ |
18782 | unsigned int Depth; /**< Depth of 3D array */ |
18783 | |
18784 | CUarray_format Format; /**< Array format */ |
18785 | unsigned int NumChannels; /**< Channels per array element */ |
18786 | unsigned int Flags; /**< Flags */ |
18787 | } CUDA_ARRAY3D_DESCRIPTOR_v1; /* Used by the unversioned cuArray3DCreate and cuArray3DGetDescriptor declarations below */ |
18788 | |
18789 | CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); /* Start of the unversioned declarations that take unsigned int sizes/offsets, CUdeviceptr_v1 device pointers and the *_v1 descriptor structures defined above */ |
18790 | CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); |
18791 | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); |
18792 | CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); |
18793 | CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); |
18794 | CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); |
18795 | CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); |
18796 | CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); |
18797 | CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); |
18798 | CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); |
18799 | CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); |
18800 | CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
18801 | CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
18802 | CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
18803 | CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
18804 | CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); |
18805 | CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
18806 | CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
18807 | CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); |
18808 | CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); |
18809 | CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); |
18810 | CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); |
18811 | CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); |
18812 | CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); |
18813 | CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); |
18814 | CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); |
18815 | CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); |
18816 | CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); |
18817 | CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); |
18818 | CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); |
18819 | CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); |
18820 | CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); |
18821 | CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); |
18822 | CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); |
18823 | CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); |
18824 | CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); |
18825 | CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); |
18826 | CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); |
18827 | CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); |
18828 | CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); |
18829 | CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); |
18830 | CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); /* Same name as the documented declaration earlier in this header, which takes CUdeviceptr * and size_t * instead */ |
18831 | |
18832 | CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); /* Start of the unversioned context, stream, event and primary-context declarations; each name appears in the #undef run above */ |
18833 | CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); |
18834 | CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); |
18835 | CUresult CUDAAPI cuStreamDestroy(CUstream hStream); |
18836 | CUresult CUDAAPI cuEventDestroy(CUevent hEvent); |
18837 | CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); |
18838 | CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); |
18839 | CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); |
18840 | |
18841 | CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); |
18842 | CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); |
18843 | CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); |
18844 | CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); |
18845 | CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
18846 | CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); |
18847 | CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
18848 | CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
18849 | CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); |
18850 | CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); |
18851 | CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); |
18852 | CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); |
18853 | CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); |
18854 | CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); |
18855 | CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
18856 | CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
18857 | CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); |
18858 | CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); |
18859 | CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); |
18860 | CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); |
18861 | CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); |
18862 | CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); |
18863 | CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); |
18864 | CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); |
18865 | CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); |
18866 | CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); |
18867 | CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); |
18868 | CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); |
18869 | CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); |
18870 | CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); |
18871 | |
18872 | CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); |
18873 | CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); |
18874 | CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); |
18875 | CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); |
18876 | CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); |
18877 | CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); |
18878 | |
18879 | CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); |
18880 | CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); |
18881 | CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); |
18882 | CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); |
18883 | CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); |
18884 | CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); |
18885 | CUresult CUDAAPI cuStreamQuery(CUstream hStream); |
18886 | CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); |
18887 | CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); |
18888 | CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); |
18889 | CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); |
18890 | CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); |
18891 | CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
18892 | CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
18893 | CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
18894 | CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
18895 | CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
18896 | CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
18897 | CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); |
18898 | CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); |
18899 | CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); |
18900 | CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
18901 | CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
18902 | CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); |
18903 | CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); |
18904 | CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); |
18905 | CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); |
18906 | CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); |
18907 | CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); |
18908 | CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); |
18909 | CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); |
18910 | CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); |
18911 | CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); |
18912 | CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); |
18913 | CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); |
18914 | |
18915 | CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); |
18916 | CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); |
18917 | CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); |
18918 | |
18919 | CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); |
18920 | CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); |
18921 | CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); |
18922 | |
18923 | CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); |
18924 | #elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) |
18925 | static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { |
18926 | const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| |
18927 | CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); |
18928 | if ((flags & procAddressMask) == 0) { |
18929 | flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; |
18930 | } |
18931 | return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); |
18932 | } |
18933 | #define cuGetProcAddress cuGetProcAddress_ptsz |
18934 | #endif |
18935 | |
18936 | #ifdef __cplusplus |
18937 | } |
18938 | #endif |
18939 | |
18940 | #if defined(__GNUC__) |
18941 | #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) |
18942 | #pragma GCC visibility pop |
18943 | #endif |
18944 | #endif |
18945 | |
18946 | #undef __CUDA_DEPRECATED |
18947 | |
18948 | #endif /* __cuda_cuda_h__ */ |
18949 | |