1 | #pragma once |
2 | |
3 | #include <algorithm> |
4 | #include <array> |
5 | #include <cstddef> |
6 | #include <cstdint> |
7 | #include <forward_list> |
8 | #include <new> |
9 | #include <utility> |
10 | #include <vector> |
11 | |
12 | #include <c10/macros/Macros.h> |
13 | #include <c10/util/ArrayRef.h> |
14 | #include <c10/util/Exception.h> |
15 | |
16 | namespace torch { |
17 | namespace profiler { |
18 | namespace impl { |
19 | |
20 | // ============================================================================ |
21 | // == AppendOnlyList ========================================================== |
22 | // ============================================================================ |
23 | // During profiling, we have a very predictable access pattern: we only |
24 | // append to the end of the container. We can specialize and outperform both |
25 | // std::vector (which must realloc) and std::deque (which performs a double |
26 | // indirection), and this class of operation is sufficiently important to the |
27 | // profiling hot path to warrant specializing: |
28 | // https://godbolt.org/z/rTjozf1c4 |
29 | // https://quick-bench.com/q/mmfuu71ogwaiULDCJyHdKnHZms4 (Prototype #1, |
30 | // int) https://quick-bench.com/q/5vWDW6jjdXVdoffev2zst8D09no (Prototype |
31 | // #1, int pair) https://quick-bench.com/q/IfEkfAQMeJSNBA52xtMP6Agcl-Q |
32 | // (Prototype #2, int pair) |
33 | // https://quick-bench.com/q/wJV2lKmuXL4XyGJzcI5hs4gEHFg (Prototype #3, int |
34 | // pair) https://quick-bench.com/q/xiO8ZaBEkYRYUA9dFrMuPLlW9fo (Full impl, |
35 | // int pair) |
36 | // AppendOnlyList has 2x lower emplace overhead compared to more generic STL |
37 | // containers. |
38 | // |
39 | // The optimal value of `ChunkSize` will vary by use case, but testing shows |
40 | // that a value of 1024 does a good job amortizing the `malloc` cost of growth. |
41 | // Performance drops off for larger values, so testing on a case-by-case basis |
42 | // is recommended if performance is absolutely critical. |
43 | |
44 | template < |
45 | typename T, |
46 | size_t ChunkSize, |
47 | template <typename U, size_t N> class block_t = std::array> |
48 | class AppendOnlyList { |
49 | public: |
50 | using array_t = block_t<T, ChunkSize>; |
51 | static_assert( |
52 | std::is_base_of<std::array<T, ChunkSize>, array_t>::value, |
53 | "AppendOnlyList expects raw low level pointer storage." ); |
54 | static_assert(ChunkSize > 0, "Block cannot be empty." ); |
55 | |
56 | AppendOnlyList() : buffer_last_{buffer_.before_begin()} {} |
57 | AppendOnlyList(const AppendOnlyList&) = delete; |
58 | AppendOnlyList& operator=(const AppendOnlyList&) = delete; |
59 | |
60 | size_t size() const { |
61 | return n_blocks_ * ChunkSize + (size_t)(next_ - end_); |
62 | } |
63 | |
64 | template <class... Args> |
65 | T* emplace_back(Args&&... args) { |
66 | maybe_grow(); |
67 | ::new ((void*)next_) T{std::forward<Args>(args)...}; |
68 | return next_++; |
69 | } |
70 | |
71 | template <typename T0> |
72 | typename std::enable_if< |
73 | std::is_same<T0, T>::value && std::is_trivially_copyable<T>::value>::type |
74 | copy(c10::ArrayRef<T0> src) { |
75 | size_t n = src.size(); |
76 | if (C10_LIKELY(next_ && (next_ + n <= end_))) { |
77 | std::memcpy((void*)next_, (void*)src.begin(), n * sizeof(T0)); |
78 | next_ += n; |
79 | } else { |
80 | // We could chunk this into several `memcpy`s, but because we expect this |
81 | // fallback to be infrequent (n << ChunkSize) the performance impact is |
82 | // negligible. |
83 | for (auto i : src) { |
84 | emplace_back(i); |
85 | } |
86 | } |
87 | } |
88 | |
89 | void clear() { |
90 | buffer_.clear(); |
91 | buffer_last_ = buffer_.before_begin(); |
92 | n_blocks_ = 0; |
93 | next_ = nullptr; |
94 | end_ = nullptr; |
95 | } |
96 | |
97 | struct Iterator { |
98 | using iterator_category = std::forward_iterator_tag; |
99 | using difference_type = std::ptrdiff_t; |
100 | using value_type = T; |
101 | using pointer = T*; |
102 | using reference = T&; |
103 | |
104 | Iterator(std::forward_list<array_t>& buffer, const size_t size) |
105 | : block_{buffer.begin()}, size_{size} {} |
106 | |
107 | // End iterator. |
108 | Iterator() = default; |
109 | |
110 | bool exhausted() const { |
111 | return current_ >= size_; |
112 | } |
113 | |
114 | reference operator*() const { |
115 | return *current_ptr(/*checked=*/true); |
116 | } |
117 | pointer operator->() { |
118 | return current_ptr(/*checked=*/true); |
119 | } |
120 | |
121 | // Prefix increment |
122 | Iterator& operator++() { |
123 | if (!(++current_ % ChunkSize)) { |
124 | block_++; |
125 | } |
126 | return *this; |
127 | } |
128 | |
129 | // Postfix increment |
130 | Iterator operator++(int) { |
131 | Iterator tmp = *this; |
132 | ++(*this); |
133 | return tmp; |
134 | } |
135 | |
136 | friend bool operator==(const Iterator& a, const Iterator& b) { |
137 | return a.current_ptr() == b.current_ptr(); |
138 | } |
139 | friend bool operator!=(const Iterator& a, const Iterator& b) { |
140 | return a.current_ptr() != b.current_ptr(); |
141 | } |
142 | |
143 | std::pair<array_t*, size_t> address() const { |
144 | if (current_ >= size_) { |
145 | return {nullptr, 0}; |
146 | } |
147 | return {&(*block_), current_ % ChunkSize}; |
148 | } |
149 | |
150 | private: |
151 | T* current_ptr(bool checked = false) const { |
152 | auto a = address(); |
153 | if (a.first == nullptr) { |
154 | TORCH_INTERNAL_ASSERT(!checked, "Invalid access on AppendOnlyList." ); |
155 | return nullptr; |
156 | } |
157 | return a.first->data() + a.second; |
158 | } |
159 | |
160 | typename std::forward_list<array_t>::iterator block_; |
161 | size_t current_{0}; |
162 | size_t size_{0}; |
163 | }; |
164 | |
165 | Iterator begin() { |
166 | return Iterator(buffer_, size()); |
167 | } |
168 | Iterator end() { |
169 | return Iterator(); |
170 | } |
171 | // TODO: cbegin and cend() |
172 | |
173 | // TODO: make private |
174 | protected: |
175 | void maybe_grow() { |
176 | if (C10_UNLIKELY(next_ == end_)) { |
177 | buffer_last_ = buffer_.emplace_after(buffer_last_); |
178 | n_blocks_++; |
179 | next_ = buffer_last_->data(); |
180 | end_ = next_ + ChunkSize; |
181 | } |
182 | } |
183 | |
184 | std::forward_list<array_t> buffer_; |
185 | |
186 | // We maintain a pointer to the last element of `buffer_` so that we can |
187 | // insert at the end in O(1) time. |
188 | typename std::forward_list<array_t>::iterator buffer_last_; |
189 | size_t n_blocks_{0}; |
190 | T* next_{nullptr}; |
191 | T* end_{nullptr}; |
192 | }; |
193 | |
194 | } // namespace impl |
195 | } // namespace profiler |
196 | } // namespace torch |
197 | |