/**
 * Copyright (c) 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <math.h>
#include <stddef.h>
#include <string.h>

#include "gloo/algorithm.h"
#include "gloo/common/error.h"
#include "gloo/cuda.h"
#include "gloo/cuda_workspace.h"

namespace gloo {

template <typename T, typename W = CudaHostWorkspace<T> >
class CudaAllreduceHalvingDoubling : public Algorithm {
 public:
  CudaAllreduceHalvingDoubling(
      const std::shared_ptr<Context>& context,
      const std::vector<T*>& ptrs,
      const int count,
      const std::vector<cudaStream_t>& streams = std::vector<cudaStream_t>(),
      bool pipelineBroadcastAndReduce = false);

  virtual ~CudaAllreduceHalvingDoubling() = default;

  virtual void run() override;
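
  // Illustrative usage sketch (not part of this header). Here `context` is
  // assumed to be an already-connected gloo context, and `devicePtr` /
  // `elementCount` are placeholders for a CUDA buffer and its length:
  //
  //   std::vector<float*> ptrs = {devicePtr};  // one pointer per local GPU
  //   CudaAllreduceHalvingDoubling<float> allreduce(context, ptrs, elementCount);
  //   allreduce.run();  // blocks until the allreduce has completed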

 protected:

  void initBinaryBlocks();
  void devicePointerInit();

  // Both workspace types have their own initialization function.
  template <typename U = W>
  void init(
      typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value,
          typename U::Pointer>::type* = 0);

  template <typename U = W>
  void init(
      typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value,
          typename U::Pointer>::type* = 0);

  template <typename U = W>
  void initReductionsAndBroadcasts(
      typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value,
          typename U::Pointer>::type* = 0);

  template <typename U = W>
  void initReductionsAndBroadcasts(
      typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value,
          typename U::Pointer>::type* = 0);
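
  // The overloads above are selected via std::enable_if on the workspace type
  // W: with CudaHostWorkspace the data is staged through pinned host memory
  // around the communication steps, while with CudaDeviceWorkspace it remains
  // resident in device memory. The variant is chosen through the second
  // template parameter, e.g. (placeholders as in the sketch above):
  //
  //   CudaAllreduceHalvingDoubling<float, CudaDeviceWorkspace<float>>
  //       allreduce(context, ptrs, elementCount);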

  // Per-device pointers to the caller's buffers and the CUDA streams used to
  // operate on them.
  std::vector<CudaDevicePointer<T> > devicePtrs_;
  std::vector<CudaStream> streams_;

  // Workspace buffer (host or device memory depending on W) that the
  // communication steps operate on, and the stream associated with it.
  typename W::Pointer scratch_;
  CudaStream* scratchStream_;

  // Element and byte counts being reduced, the number of halving/doubling
  // steps, and the chunk granularity used by the reduce-scatter/allgather.
  const int count_;
  const int bytes_;
  const size_t steps_;
  const size_t chunks_;
  const size_t chunkSize_;
  const size_t chunkBytes_;

  // Reduction function applied to incoming data.
  const CudaReductionFunction<T>* fn_;

  // Offsets into the data buffer from which to send during the reduce-scatter.
  // These become the offsets at which the process receives during the
  // allgather. Indexed by step.
  std::vector<size_t> sendOffsets_;

  // Offsets at which data is reduced during the reduce-scatter and from which
  // it is sent during the allgather.
  std::vector<size_t> recvOffsets_;
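
  // Rough illustration, assuming the usual recursive-halving schedule within a
  // block of 2^k processes: the amount of data a process sends halves at each
  // reduce-scatter step and doubles again at each allgather step. For example,
  // with count_ == 8 and a block of 4 processes, a rank sends 4 elements at
  // step 0 and 2 elements at step 1 of the reduce-scatter, then receives those
  // pieces back in reverse order during the allgather; sendOffsets_ and
  // recvOffsets_ record where in the buffer each of those pieces lives.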

  std::vector<std::unique_ptr<transport::Buffer>> sendDataBufs_;
  std::vector<std::unique_ptr<transport::Buffer>> recvDataBufs_;

  std::unique_ptr<transport::Buffer> smallerBlockSendDataBuf_;
  std::unique_ptr<transport::Buffer> smallerBlockRecvDataBuf_;

  std::vector<std::unique_ptr<transport::Buffer>> largerBlockSendDataBufs_;
  std::vector<std::unique_ptr<transport::Buffer>> largerBlockRecvDataBufs_;

  std::vector<size_t> sendCounts_;
  std::vector<size_t> recvCounts_;
  size_t sendCountToLargerBlock_;

  // Notification buffers used to synchronize with peers between steps;
  // dummy_ serves as their placeholder payload.
  int dummy_;
  std::vector<std::unique_ptr<transport::Buffer>> sendNotificationBufs_;
  std::vector<std::unique_ptr<transport::Buffer>> recvNotificationBufs_;

  std::unique_ptr<LocalOp<T>> reduceBeforeFirstSend_;
  std::unique_ptr<LocalOp<T>> reduceBeforeFirstRecv_;

  std::unique_ptr<LocalOp<T> > localReduceOp_;
  std::unique_ptr<LocalOp<T> > localBroadcastOp_;

  // Buffer into which data is received prior to being reduced.
  typename W::Pointer recvBuf_;

  typename W::Pointer scratchPtrForFirstSend_;
  typename W::Pointer scratchPtrForFirstRecv_;

  std::vector<CudaDevicePointer<T>> devicePtrsForFirstSend_;
  std::vector<CudaDevicePointer<T>> devicePtrsForFirstRecv_;

  std::vector<typename W::Pointer> scratchPtrForBroadcast_;
  std::vector<std::vector<CudaDevicePointer<T>>> devicePtrsForBroadcast_;
  std::vector<std::unique_ptr<LocalOp<T>>> broadcastOps_;

  // Whether local device reductions/broadcasts are pipelined with the
  // communication steps (see pipelineBroadcastAndReduce).
  bool pipelined_;

  // For a non-power-of-two number of processes, the processes are partitioned
  // into binary blocks. Keep track of which block this process is in, as well
  // as the adjoining larger and smaller blocks (with which communication will
  // be required).
  uint32_t offsetToMyBinaryBlock_;
  uint32_t myBinaryBlockSize_;
  uint32_t stepsWithinBlock_;
  uint32_t rankInBinaryBlock_;
  uint32_t nextSmallerBlockSize_;
  uint32_t nextLargerBlockSize_;
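
  // Illustrative example (semantics inferred from the member names): with 22
  // processes (binary 10110) the binary blocks have sizes 16, 4, and 2. A rank
  // in the block of 4 would have myBinaryBlockSize_ == 4, stepsWithinBlock_ ==
  // 2 (log2 of the block size), nextLargerBlockSize_ == 16 and
  // nextSmallerBlockSize_ == 2, and would exchange partial results with those
  // neighboring blocks so that every process ends up with the full result.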

  int slotOffset_;
};

} // namespace gloo