/**
 * Copyright (c) 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
8 | |
9 | #pragma once |
10 | |
11 | #include <math.h> |
12 | #include <stddef.h> |
13 | #include <string.h> |
14 | |
15 | #include "gloo/algorithm.h" |
16 | #include "gloo/common/error.h" |
17 | #include "gloo/cuda.h" |
18 | #include "gloo/cuda_workspace.h" |
19 | |
20 | namespace gloo { |
21 | |
22 | template <typename T, typename W = CudaHostWorkspace<T> > |
23 | class CudaAllreduceHalvingDoubling : public Algorithm { |
24 | public: |
25 | CudaAllreduceHalvingDoubling( |
26 | const std::shared_ptr<Context>& context, |
27 | const std::vector<T*>& ptrs, |
28 | const int count, |
29 | const std::vector<cudaStream_t>& streams = std::vector<cudaStream_t>(), |
30 | bool pipelineBroadcastAndReduce = false); |
31 | |
32 | virtual ~CudaAllreduceHalvingDoubling() = default; |
33 | |
34 | virtual void run() override; |
35 | |
36 | protected: |
37 | |
38 | void initBinaryBlocks(); |
39 | void devicePointerInit(); |
40 | |
41 | // Both workspace types have their own initialization function. |
42 | template <typename U = W> |
43 | void init( |
44 | typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value, |
45 | typename U::Pointer>::type* = 0); |
46 | |
47 | template <typename U = W> |
48 | void init( |
49 | typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value, |
50 | typename U::Pointer>::type* = 0); |
51 | |
52 | template <typename U = W> |
53 | void initReductionsAndBroadcasts( |
54 | typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value, |
55 | typename U::Pointer>::type* = 0); |
56 | |
57 | template <typename U = W> |
58 | void initReductionsAndBroadcasts( |
59 | typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value, |
60 | typename U::Pointer>::type* = 0); |
61 | |
62 | std::vector<CudaDevicePointer<T> > devicePtrs_; |
63 | std::vector<CudaStream> streams_; |
64 | typename W::Pointer scratch_; |
65 | CudaStream* scratchStream_; |
66 | |
67 | const int count_; |
68 | const int bytes_; |
69 | const size_t steps_; |
70 | const size_t chunks_; |
71 | const size_t chunkSize_; |
72 | const size_t chunkBytes_; |
73 | |
74 | const CudaReductionFunction<T>* fn_; |
75 | |
76 | // offsets into the data buffer from which to send during the reduce-scatter |
77 | // these become the offsets at which the process receives during the allgather |
78 | // indexed by step |
79 | std::vector<size_t> sendOffsets_; |
80 | |
81 | // offsets at which data is reduced during the reduce-scatter and sent from in |
82 | // the allgather |
83 | std::vector<size_t> recvOffsets_; |
84 | |
85 | std::vector<std::unique_ptr<transport::Buffer>> sendDataBufs_; |
86 | std::vector<std::unique_ptr<transport::Buffer>> recvDataBufs_; |
87 | |
88 | std::unique_ptr<transport::Buffer> smallerBlockSendDataBuf_; |
89 | std::unique_ptr<transport::Buffer> smallerBlockRecvDataBuf_; |
90 | |
91 | std::vector<std::unique_ptr<transport::Buffer>> largerBlockSendDataBufs_; |
92 | std::vector<std::unique_ptr<transport::Buffer>> largerBlockRecvDataBufs_; |
93 | |
94 | std::vector<size_t> sendCounts_; |
95 | std::vector<size_t> recvCounts_; |
96 | size_t sendCountToLargerBlock_; |
97 | |
98 | int dummy_; |
99 | std::vector<std::unique_ptr<transport::Buffer>> sendNotificationBufs_; |
100 | std::vector<std::unique_ptr<transport::Buffer>> recvNotificationBufs_; |
101 | |
102 | std::unique_ptr<LocalOp<T>> reduceBeforeFirstSend_; |
103 | std::unique_ptr<LocalOp<T>> reduceBeforeFirstRecv_; |
104 | |
105 | std::unique_ptr<LocalOp<T> > localReduceOp_; |
106 | std::unique_ptr<LocalOp<T> > localBroadcastOp_; |
107 | |
108 | // buffer where data is received prior to being reduced |
109 | typename W::Pointer recvBuf_; |
110 | |
111 | typename W::Pointer scratchPtrForFirstSend_; |
112 | typename W::Pointer scratchPtrForFirstRecv_; |
113 | |
114 | std::vector<CudaDevicePointer<T>> devicePtrsForFirstSend_; |
115 | std::vector<CudaDevicePointer<T>> devicePtrsForFirstRecv_; |
116 | |
117 | std::vector<typename W::Pointer> scratchPtrForBroadcast_; |
118 | std::vector<std::vector<CudaDevicePointer<T>>> devicePtrsForBroadcast_; |
119 | std::vector<std::unique_ptr<LocalOp<T>>> broadcastOps_; |
120 | |
121 | bool pipelined_; |
122 | |
123 | // for non-power-of-two number of processes, partition the processes into |
124 | // binary blocks and keep track of which block each process is in, as well as |
125 | // the adjoining larger and smaller blocks (with which communication will be |
126 | // required) |
127 | uint32_t offsetToMyBinaryBlock_; |
128 | uint32_t myBinaryBlockSize_; |
129 | uint32_t stepsWithinBlock_; |
130 | uint32_t rankInBinaryBlock_; |
131 | uint32_t nextSmallerBlockSize_; |
132 | uint32_t nextLargerBlockSize_; |
133 | |
134 | int slotOffset_; |
135 | }; |
136 | |
137 | } // namespace gloo |
138 | |