#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <forward_list>
#include <new>
#include <type_traits>
#include <utility>
#include <vector>

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>
namespace torch {
namespace profiler {
namespace impl {

// ============================================================================
// == AppendOnlyList ==========================================================
// ============================================================================
// During profiling, we have a very predictable access pattern: we only
// append to the end of the container. We can specialize and outperform both
// std::vector (which must realloc) and std::deque (which performs a double
// indirection), and this class of operation is sufficiently important to the
// profiling hot path to warrant specializing:
//   https://godbolt.org/z/rTjozf1c4
//   https://quick-bench.com/q/mmfuu71ogwaiULDCJyHdKnHZms4  (Prototype #1, int)
//   https://quick-bench.com/q/5vWDW6jjdXVdoffev2zst8D09no  (Prototype #1, int pair)
//   https://quick-bench.com/q/IfEkfAQMeJSNBA52xtMP6Agcl-Q  (Prototype #2, int pair)
//   https://quick-bench.com/q/wJV2lKmuXL4XyGJzcI5hs4gEHFg  (Prototype #3, int pair)
//   https://quick-bench.com/q/xiO8ZaBEkYRYUA9dFrMuPLlW9fo  (Full impl, int pair)
// AppendOnlyList has 2x lower emplace overhead compared to more generic STL
// containers.
//
// The optimal value of `ChunkSize` will vary by use case, but testing shows
// that a value of 1024 does a good job amortizing the `malloc` cost of growth.
// Performance drops off for larger values, so testing on a case-by-case basis
// is recommended if performance is absolutely critical.
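//
// A minimal usage sketch (illustrative only; `Event` is a hypothetical
// payload type, not something defined in this header):
//
//   AppendOnlyList<Event, 1024> events;
//   Event* e = events.emplace_back(/*Event constructor args*/);
//   for (const auto& event : events) {
//     // post-process `event`
//   }
//   events.clear();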

template <
    typename T,
    size_t ChunkSize,
    template <typename U, size_t N> class block_t = std::array>
class AppendOnlyList {
 public:
  using array_t = block_t<T, ChunkSize>;
  static_assert(
      std::is_base_of<std::array<T, ChunkSize>, array_t>::value,
      "AppendOnlyList expects raw low level pointer storage.");
  static_assert(ChunkSize > 0, "Block cannot be empty.");

  AppendOnlyList() : buffer_last_{buffer_.before_begin()} {}
  AppendOnlyList(const AppendOnlyList&) = delete;
  AppendOnlyList& operator=(const AppendOnlyList&) = delete;

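  // Number of elements currently stored: the full capacity of all allocated
  // blocks minus the unused tail of the most recent block.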
  size_t size() const {
    return n_blocks_ * ChunkSize - (size_t)(end_ - next_);
  }

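  // Construct a new element in place at the end of the list and return a
  // pointer to it. Storage blocks are never reallocated or moved, so the
  // returned pointer stays valid until `clear()` is called.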
  template <class... Args>
  T* emplace_back(Args&&... args) {
    maybe_grow();
    ::new ((void*)next_) T{std::forward<Args>(args)...};
    return next_++;
  }

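  // Bulk append for trivially copyable element types. The common case (the
  // whole range fits in the current block) reduces to a single `memcpy`.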
  template <typename T0>
  typename std::enable_if<
      std::is_same<T0, T>::value && std::is_trivially_copyable<T>::value>::type
  copy(c10::ArrayRef<T0> src) {
    size_t n = src.size();
    if (C10_LIKELY(next_ && (next_ + n <= end_))) {
      std::memcpy((void*)next_, (void*)src.begin(), n * sizeof(T0));
      next_ += n;
    } else {
      // We could chunk this into several `memcpy`s, but because we expect this
      // fallback to be infrequent (n << ChunkSize) the performance impact is
      // negligible.
      for (auto i : src) {
        emplace_back(i);
      }
    }
  }

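  // Release all blocks and return the container to its default-constructed
  // (empty) state.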
  void clear() {
    buffer_.clear();
    buffer_last_ = buffer_.before_begin();
    n_blocks_ = 0;
    next_ = nullptr;
    end_ = nullptr;
  }

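  // Forward iterator that walks the stored elements block by block. A default
  // constructed Iterator serves as the end sentinel: it compares equal to any
  // iterator whose position has reached `size_`.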
  struct Iterator {
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = T;
    using pointer = T*;
    using reference = T&;

    Iterator(std::forward_list<array_t>& buffer, const size_t size)
        : block_{buffer.begin()}, size_{size} {}

    // End iterator.
    Iterator() = default;

    bool exhausted() const {
      return current_ >= size_;
    }

    reference operator*() const {
      return *current_ptr(/*checked=*/true);
    }
    pointer operator->() {
      return current_ptr(/*checked=*/true);
    }

    // Prefix increment
    Iterator& operator++() {
      if (!(++current_ % ChunkSize)) {
        block_++;
      }
      return *this;
    }

    // Postfix increment
    Iterator operator++(int) {
      Iterator tmp = *this;
      ++(*this);
      return tmp;
    }

    friend bool operator==(const Iterator& a, const Iterator& b) {
      return a.current_ptr() == b.current_ptr();
    }
    friend bool operator!=(const Iterator& a, const Iterator& b) {
      return a.current_ptr() != b.current_ptr();
    }

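    // Location of the current element as a (block, offset-within-block) pair,
    // or {nullptr, 0} once the iterator is exhausted.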
    std::pair<array_t*, size_t> address() const {
      if (current_ >= size_) {
        return {nullptr, 0};
      }
      return {&(*block_), current_ % ChunkSize};
    }

   private:
    T* current_ptr(bool checked = false) const {
      auto a = address();
      if (a.first == nullptr) {
        TORCH_INTERNAL_ASSERT(!checked, "Invalid access on AppendOnlyList.");
        return nullptr;
      }
      return a.first->data() + a.second;
    }

    typename std::forward_list<array_t>::iterator block_;
    size_t current_{0};
    size_t size_{0};
  };

  Iterator begin() {
    return Iterator(buffer_, size());
  }
  Iterator end() {
    return Iterator();
  }
  // TODO: cbegin() and cend()

  // TODO: make private
 protected:
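  // Allocate and link a fresh block once the current one is full (or on the
  // very first insertion), pointing `next_` / `end_` at the new storage.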
  void maybe_grow() {
    if (C10_UNLIKELY(next_ == end_)) {
      buffer_last_ = buffer_.emplace_after(buffer_last_);
      n_blocks_++;
      next_ = buffer_last_->data();
      end_ = next_ + ChunkSize;
    }
  }

  std::forward_list<array_t> buffer_;

  // We maintain an iterator to the last element of `buffer_` so that we can
  // insert at the end in O(1) time.
  typename std::forward_list<array_t>::iterator buffer_last_;
  size_t n_blocks_{0};
  T* next_{nullptr};
  T* end_{nullptr};
};

} // namespace impl
} // namespace profiler
} // namespace torch