#include <cstring>
#include <limits>
#include <numeric>

#include "taichi/program/ndarray.h"
#include "taichi/program/program.h"

#ifdef TI_WITH_LLVM
#include "taichi/runtime/llvm/llvm_context.h"
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#endif
10 | |
11 | namespace taichi::lang { |
12 | |
13 | namespace { |
14 | |
15 | size_t flatten_index(const std::vector<int> &shapes, |
16 | const std::vector<int> &indices) { |
17 | TI_ASSERT(shapes.size() == indices.size()); |
18 | if (indices.size() == 1) { |
19 | return indices[0]; |
20 | } else { |
21 | size_t ind = indices[0]; |
22 | for (int i = 1; i < indices.size(); i++) { |
23 | ind = ind * shapes[i] + indices[i]; |
24 | } |
25 | return ind; |
26 | } |
27 | } |
28 | } // namespace |
29 | |
30 | Ndarray::Ndarray(Program *prog, |
31 | const DataType type, |
32 | const std::vector<int> &shape_, |
33 | ExternalArrayLayout layout_) |
34 | : dtype(type), |
35 | shape(shape_), |
36 | layout(layout_), |
37 | nelement_(std::accumulate(std::begin(shape_), |
38 | std::end(shape_), |
39 | 1, |
40 | std::multiplies<>())), |
41 | element_size_(data_type_size(dtype)), |
42 | prog_(prog) { |
43 | // Now that we have two shapes which may be concatenated differently |
44 | // depending on layout, total_shape_ comes handy. |
45 | total_shape_ = shape; |
46 | auto element_shape = data_type_shape(dtype); |
47 | if (layout == ExternalArrayLayout::kAOS) { |
48 | total_shape_.insert(total_shape_.end(), element_shape.begin(), |
49 | element_shape.end()); |
50 | } else if (layout == ExternalArrayLayout::kSOA) { |
51 | total_shape_.insert(total_shape_.begin(), element_shape.begin(), |
52 | element_shape.end()); |
53 | } |
54 | auto total_num_scalar = |
55 | std::accumulate(std::begin(total_shape_), std::end(total_shape_), 1LL, |
56 | std::multiplies<>()); |
57 | if (total_num_scalar > std::numeric_limits<int>::max()) { |
58 | TI_WARN( |
59 | "Ndarray index might be out of int32 boundary but int64 indexing is " |
60 | "not supported yet." ); |
61 | } |
62 | ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, |
63 | prog->result_buffer); |
64 | } |
65 | |
66 | Ndarray::Ndarray(DeviceAllocation &devalloc, |
67 | const DataType type, |
68 | const std::vector<int> &shape, |
69 | ExternalArrayLayout layout) |
70 | : ndarray_alloc_(devalloc), |
71 | dtype(type), |
72 | shape(shape), |
73 | layout(layout), |
74 | nelement_(std::accumulate(std::begin(shape), |
75 | std::end(shape), |
76 | 1, |
77 | std::multiplies<>())), |
78 | element_size_(data_type_size(dtype)) { |
79 | // When element_shape is specified but layout is not, default layout is AOS. |
80 | auto element_shape = data_type_shape(dtype); |
81 | if (!element_shape.empty() && layout == ExternalArrayLayout::kNull) { |
82 | layout = ExternalArrayLayout::kAOS; |
83 | } |
84 | // Now that we have two shapes which may be concatenated differently |
85 | // depending on layout, total_shape_ comes handy. |
86 | total_shape_ = shape; |
87 | if (layout == ExternalArrayLayout::kAOS) { |
88 | total_shape_.insert(total_shape_.end(), element_shape.begin(), |
89 | element_shape.end()); |
90 | } else if (layout == ExternalArrayLayout::kSOA) { |
91 | total_shape_.insert(total_shape_.begin(), element_shape.begin(), |
92 | element_shape.end()); |
93 | } |
94 | auto total_num_scalar = |
95 | std::accumulate(std::begin(total_shape_), std::end(total_shape_), 1LL, |
96 | std::multiplies<>()); |
97 | if (total_num_scalar > std::numeric_limits<int>::max()) { |
98 | TI_WARN( |
99 | "Ndarray index might be out of int32 boundary but int64 indexing is " |
100 | "not supported yet." ); |
101 | } |
102 | } |
103 | |
// Convenience overload: wraps |devalloc| as an ndarray whose elements are
// tensors of |element_shape| scalars of primitive |type|, by synthesizing a
// TensorType dtype and delegating to the main devalloc constructor.
Ndarray::Ndarray(DeviceAllocation &devalloc,
                 const DataType type,
                 const std::vector<int> &shape,
                 const std::vector<int> &element_shape,
                 ExternalArrayLayout layout)
    : Ndarray(devalloc,
              TypeFactory::create_tensor_type(element_shape, type),
              shape,
              layout) {
  // Only a scalar (primitive) element type may be wrapped into a
  // TensorType here; nested tensor dtypes are rejected.
  TI_ASSERT(type->is<PrimitiveType>());
}
115 | |
Ndarray::~Ndarray() {
  // Deallocate only when this ndarray owns its storage, i.e. it was built
  // via the Program-based constructor (prog_ set). Ndarrays wrapping an
  // externally-provided DeviceAllocation leave ownership to the caller.
  if (prog_) {
    // prog_->flush();
    ndarray_alloc_.device->dealloc_memory(ndarray_alloc_);
  }
}
122 | |
// Returns the address of this ndarray's DeviceAllocation handle as an
// integer. The value is only valid while this Ndarray object is alive.
intptr_t Ndarray::get_device_allocation_ptr_as_int() const {
  // taichi's own ndarray's ptr points to its |DeviceAllocation| on the
  // specified device. Note that torch-based ndarray's ptr is a raw ptr but
  // we'll get rid of it soon.
  return reinterpret_cast<intptr_t>(&ndarray_alloc_);
}
129 | |
// Returns a copy of the DeviceAllocation handle backing this ndarray.
DeviceAllocation Ndarray::get_device_allocation() const {
  return ndarray_alloc_;
}
133 | |
// Returns the per-element tensor shape encoded in dtype (empty for scalar
// dtypes).
std::vector<int> Ndarray::get_element_shape() const {
  return data_type_shape(dtype);
}
137 | |
138 | DataType Ndarray::get_element_data_type() const { |
139 | if (dtype->is<TensorType>()) { |
140 | return dtype->cast<TensorType>()->get_element_type(); |
141 | } |
142 | return dtype; |
143 | } |
144 | |
// Returns the size in bytes of one element (data_type_size(dtype)).
std::size_t Ndarray::get_element_size() const {
  return element_size_;
}
148 | |
// Returns the number of elements, i.e. the product of the outer shape dims
// (element-shape dims are not counted).
std::size_t Ndarray::get_nelement() const {
  return nelement_;
}
152 | |
153 | TypedConstant Ndarray::read(const std::vector<int> &I) const { |
154 | prog_->synchronize(); |
155 | size_t index = flatten_index(total_shape_, I); |
156 | size_t size = data_type_size(get_element_data_type()); |
157 | taichi::lang::Device::AllocParams alloc_params; |
158 | alloc_params.host_write = false; |
159 | alloc_params.host_read = true; |
160 | alloc_params.size = size; |
161 | alloc_params.usage = taichi::lang::AllocUsage::Storage; |
162 | auto staging_buf_ = |
163 | this->ndarray_alloc_.device->allocate_memory_unique(alloc_params); |
164 | staging_buf_->device->memcpy_internal( |
165 | staging_buf_->get_ptr(), |
166 | this->ndarray_alloc_.get_ptr(/*offset=*/index * size), size); |
167 | |
168 | char *device_arr_ptr{nullptr}; |
169 | TI_ASSERT(staging_buf_->device->map( |
170 | *staging_buf_, (void **)&device_arr_ptr) == RhiResult::success); |
171 | |
172 | TypedConstant data(get_element_data_type()); |
173 | std::memcpy(&data.value_bits, device_arr_ptr, size); |
174 | staging_buf_->device->unmap(*staging_buf_); |
175 | return data; |
176 | } |
177 | |
178 | template <typename T> |
179 | void Ndarray::write(const std::vector<int> &I, T val) const { |
180 | size_t index = flatten_index(total_shape_, I); |
181 | size_t size_ = sizeof(T); |
182 | taichi::lang::Device::AllocParams alloc_params; |
183 | alloc_params.host_write = true; |
184 | alloc_params.host_read = false; |
185 | alloc_params.size = size_; |
186 | alloc_params.usage = taichi::lang::AllocUsage::Storage; |
187 | auto staging_buf_ = |
188 | this->ndarray_alloc_.device->allocate_memory_unique(alloc_params); |
189 | |
190 | T *device_arr_ptr{nullptr}; |
191 | TI_ASSERT(staging_buf_->device->map( |
192 | *staging_buf_, (void **)&device_arr_ptr) == RhiResult::success); |
193 | |
194 | TI_ASSERT(device_arr_ptr); |
195 | device_arr_ptr[0] = val; |
196 | |
197 | staging_buf_->device->unmap(*staging_buf_); |
198 | staging_buf_->device->memcpy_internal( |
199 | this->ndarray_alloc_.get_ptr(index * sizeof(T)), staging_buf_->get_ptr(), |
200 | size_); |
201 | |
202 | prog_->synchronize(); |
203 | } |
204 | |
// Reads the element at multi-index |i| interpreted as a signed integer.
int64 Ndarray::read_int(const std::vector<int> &i) {
  return read(i).val_int();
}
208 | |
// Reads the element at multi-index |i| interpreted as an unsigned integer.
uint64 Ndarray::read_uint(const std::vector<int> &i) {
  return read(i).val_uint();
}
212 | |
// Reads the element at multi-index |i| interpreted as a floating-point value.
float64 Ndarray::read_float(const std::vector<int> &i) {
  return read(i).val_float();
}
216 | |
// Writes |val| at multi-index |i| as a 32-bit int.
// NOTE(review): |val| is narrowed from int64 to int before being written,
// and only sizeof(int) bytes are stored — 64-bit integer element types
// would be truncated here; confirm this is intended.
void Ndarray::write_int(const std::vector<int> &i, int64 val) {
  write<int>(i, val);
}
220 | |
// Writes |val| at multi-index |i| as a 32-bit float.
// NOTE(review): |val| is narrowed from float64 to float before being
// written, and only sizeof(float) bytes are stored — float64 element types
// would lose precision here; confirm this is intended.
void Ndarray::write_float(const std::vector<int> &i, float64 val) {
  write<float>(i, val);
}
224 | |
225 | } // namespace taichi::lang |
226 | |