/**
 * Copyright (c) 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
8 | |
9 | #pragma once |
10 | |
11 | #include <math.h> |
12 | #include <stddef.h> |
13 | #include <string.h> |
14 | |
15 | #include "gloo/algorithm.h" |
16 | #include "gloo/common/error.h" |
17 | #include "gloo/cuda.h" |
18 | #include "gloo/cuda_workspace.h" |
19 | |
20 | namespace gloo { |
21 | |
22 | template <typename T, typename W = CudaHostWorkspace<T> > |
23 | class CudaAllreduceHalvingDoubling : public Algorithm { |
24 | public: |
25 | CudaAllreduceHalvingDoubling( |
26 | const std::shared_ptr<Context>& context, |
27 | const std::vector<T*>& ptrs, |
28 | const int count, |
29 | const std::vector<cudaStream_t>& streams = std::vector<cudaStream_t>(), |
30 | bool pipelineBroadcastAndReduce = false); |
31 | |
32 | virtual ~CudaAllreduceHalvingDoubling() = default; |
33 | |
34 | virtual void run() override; |
35 | |
36 | protected: |
37 | |
38 | void initBinaryBlocks(); |
39 | void devicePointerInit(); |
40 | |
41 | // Both workspace types have their own initialization function. |
42 | template <typename U = W> |
43 | void init( |
44 | typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value, |
45 | typename U::Pointer>::type* = 0); |
46 | |
47 | template <typename U = W> |
48 | void init( |
49 | typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value, |
50 | typename U::Pointer>::type* = 0); |
51 | |
52 | template <typename U = W> |
53 | void initReductionsAndBroadcasts( |
54 | typename std::enable_if<std::is_same<U, CudaHostWorkspace<T> >::value, |
55 | typename U::Pointer>::type* = 0); |
56 | |
57 | template <typename U = W> |
58 | void initReductionsAndBroadcasts( |
59 | typename std::enable_if<std::is_same<U, CudaDeviceWorkspace<T> >::value, |
60 | typename U::Pointer>::type* = 0); |
61 | |
62 | std::vector<CudaDevicePointer<T> > devicePtrs_; |
63 | std::vector<CudaStream> streams_; |
64 | typename W::Pointer scratch_; |
65 | CudaStream* scratchStream_; |
66 | |
67 | const int count_; |
68 | const int bytes_; |
69 | const size_t steps_; |
70 | const size_t chunks_; |
71 | const size_t chunkSize_; |
72 | const size_t chunkBytes_; |
73 | |
74 | const CudaReductionFunction<T>* fn_; |
75 | |
76 | // offsets into the data buffer from which to send during the reduce-scatter |
77 | // these become the offsets at which the process receives during the allgather |
78 | // indexed by step |
79 | std::vector<size_t> sendOffsets_; |
80 | |
81 | // offsets at which data is reduced during the reduce-scatter and sent from in |
82 | // the allgather |
83 | std::vector<size_t> recvOffsets_; |
84 | |
85 | std::vector<std::unique_ptr<transport::Buffer>> sendDataBufs_; |
86 | std::vector<std::unique_ptr<transport::Buffer>> recvDataBufs_; |
87 | |
88 | std::unique_ptr<transport::Buffer> smallerBlockSendDataBuf_; |
89 | std::unique_ptr<transport::Buffer> smallerBlockRecvDataBuf_; |
90 | |
91 | std::vector<std::unique_ptr<transport::Buffer>> largerBlockSendDataBufs_; |
92 | std::vector<std::unique_ptr<transport::Buffer>> largerBlockRecvDataBufs_; |
93 | |
94 | std::vector<size_t> sendCounts_; |
95 | std::vector<size_t> recvCounts_; |
96 | size_t sendCountToLargerBlock_; |
97 | |
98 | int dummy_; |
99 | std::vector<std::unique_ptr<transport::Buffer>> sendNotificationBufs_; |
100 | std::vector<std::unique_ptr<transport::Buffer>> recvNotificationBufs_; |
101 | |
102 | std::unique_ptr<LocalOp<T>> reduceBeforeFirstSend_; |
103 | std::unique_ptr<LocalOp<T>> reduceBeforeFirstRecv_; |
104 | |
105 | std::unique_ptr<LocalOp<T> > localReduceOp_; |
106 | std::unique_ptr<LocalOp<T> > localBroadcastOp_; |
107 | |
108 | // buffer where data is received prior to being reduced |
109 | typename W::Pointer recvBuf_; |
110 | |
111 | typename W::Pointer scratchPtrForFirstSend_; |
112 | typename W::Pointer scratchPtrForFirstRecv_; |
113 | |
114 | std::vector<CudaDevicePointer<T>> devicePtrsForFirstSend_; |
115 | std::vector<CudaDevicePointer<T>> devicePtrsForFirstRecv_; |
116 | |
117 | std::vector<typename W::Pointer> scratchPtrForBroadcast_; |
118 | std::vector<std::vector<CudaDevicePointer<T>>> devicePtrsForBroadcast_; |
119 | std::vector<std::unique_ptr<LocalOp<T>>> broadcastOps_; |
120 | |
121 | bool pipelined_; |
122 | |
123 | // for non-power-of-two number of processes, partition the processes into |
124 | // binary blocks and keep track of which block each process is in, as well as |
125 | // the adjoining larger and smaller blocks (with which communication will be |
126 | // required) |
127 | uint32_t offsetToMyBinaryBlock_; |
128 | uint32_t myBinaryBlockSize_; |
129 | uint32_t stepsWithinBlock_; |
130 | uint32_t rankInBinaryBlock_; |
131 | uint32_t nextSmallerBlockSize_; |
132 | uint32_t nextLargerBlockSize_; |
133 | |
134 | int slotOffset_; |
135 | }; |
136 | |
137 | } // namespace gloo |
138 | |