1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | |
20 | /*! |
21 | * |
22 | * \file calibrate.cc |
23 | * |
24 | * \brief Create profile graph and calibrate on dataset |
25 | */ |
26 | #include <tvm/relay/analysis.h> |
27 | #include <tvm/relay/expr_functor.h> |
28 | #include <tvm/relay/op.h> |
29 | |
30 | #include <numeric> |
31 | |
32 | #include "./quantize.h" |
33 | |
34 | namespace tvm { |
35 | namespace relay { |
36 | namespace quantize { |
37 | |
38 | // KL divergence minimization code is adapted from MXNet. |
39 | // The original one is in incubator-mxnet/src/operator/quantization/calibrate.cc |
40 | static std::vector<float> SmoothDistribution(const std::vector<float>& p, |
41 | const float eps = 0.0001) { |
42 | std::vector<size_t> is_zeros(p.size()); |
43 | std::vector<size_t> is_nonzeros(p.size()); |
44 | { |
45 | auto it = p.begin(); |
46 | std::generate(is_zeros.begin(), is_zeros.end(), |
47 | [&it]() { return static_cast<size_t>(*(it++) == 0.f); }); |
48 | } |
49 | { |
50 | auto it = p.begin(); |
51 | std::generate(is_nonzeros.begin(), is_nonzeros.end(), |
52 | [&it]() { return static_cast<size_t>(*(it++) != 0.f); }); |
53 | } |
54 | size_t n_zeros = std::accumulate(is_zeros.begin(), is_zeros.end(), 0); |
55 | size_t n_nonzeros = p.size() - n_zeros; |
56 | if (!n_nonzeros) { |
57 | // The discrete probability distribution is malformed. All entries are 0. |
58 | return std::vector<float>(); |
59 | } |
60 | float eps1 = eps * static_cast<float>(n_zeros) / static_cast<float>(n_nonzeros); |
61 | if (eps1 >= 1.0) return std::vector<float>(); |
62 | auto ret = p; |
63 | for (size_t i = 0; i < p.size(); i++) { |
64 | ret[i] += eps * is_zeros[i] - eps1 * is_nonzeros[i]; |
65 | } |
66 | return ret; |
67 | } |
68 | |
69 | static float ComputeEntropy(float* p, float* q, size_t size) { |
70 | float p_sum = std::accumulate(p, p + size, 0.f); |
71 | float q_sum = std::accumulate(q, q + size, 0.f); |
72 | float ret = 0; |
73 | for (size_t i = 0; i < size; i++) { |
74 | ICHECK(p[i] > 0 && q[i] > 0); |
75 | p[i] /= p_sum; |
76 | q[i] /= q_sum; |
77 | if (p[i] && q[i]) ret += p[i] * std::log(p[i] / q[i]); |
78 | } |
79 | return ret; |
80 | } |
81 | |
82 | float MinimizeKL(const std::vector<int>& hist, const std::vector<float>& hist_edges, int num_bins, |
83 | int num_quantized_bins) { |
84 | const int zero_bin_idx = num_bins / 2; |
85 | const int num_half_quantized_bins = num_quantized_bins / 2; |
86 | std::vector<float> thresholds(num_bins / 2 + 1 - num_quantized_bins / 2, 0.f); |
87 | std::vector<float> divergence(thresholds.size(), 0.f); |
88 | std::vector<float> quantized_bins(num_quantized_bins, 0); |
89 | for (int i = num_quantized_bins / 2; i < zero_bin_idx + 1; ++i) { |
90 | const int p_bin_idx_start = zero_bin_idx - i; |
91 | const int p_bin_idx_stop = zero_bin_idx + i + 1; |
92 | thresholds[i - num_half_quantized_bins] = hist_edges[p_bin_idx_stop]; |
93 | |
94 | std::vector<int> sliced_nd_hist(p_bin_idx_stop - p_bin_idx_start); |
95 | std::vector<float> p(sliced_nd_hist.size()); |
96 | p[0] = 0; |
97 | p.back() = 0; |
98 | for (int j = 0; j < num_bins; j++) { |
99 | if (j <= p_bin_idx_start) { |
100 | p[0] += hist[j]; |
101 | } else if (j >= p_bin_idx_stop) { |
102 | p.back() += hist[j]; |
103 | } else { |
104 | sliced_nd_hist[j - p_bin_idx_start] = hist[j]; |
105 | p[j - p_bin_idx_start] = hist[j]; |
106 | } |
107 | } |
108 | // calculate how many bins should be merged to generate quantized distribution q |
109 | const auto num_merged_bins = sliced_nd_hist.size() / num_quantized_bins; |
110 | for (int j = 0; j < num_quantized_bins; j++) { |
111 | const int start = j * num_merged_bins; |
112 | const int stop = (j + 1) * num_merged_bins; |
113 | quantized_bins[j] = |
114 | std::accumulate(sliced_nd_hist.begin() + start, sliced_nd_hist.begin() + stop, 0); |
115 | } |
116 | quantized_bins.back() += std::accumulate( |
117 | sliced_nd_hist.begin() + static_cast<int>(num_quantized_bins * num_merged_bins), |
118 | sliced_nd_hist.end(), 0); |
119 | // expand quantized_bins into p.size bins |
120 | std::vector<float> q(sliced_nd_hist.size(), 0); |
121 | for (int j = 0; j < num_quantized_bins; j++) { |
122 | const int start = j * num_merged_bins; |
123 | const int stop = (j == num_quantized_bins - 1) ? q.size() : ((j + 1) * num_merged_bins); |
124 | int norm = std::count_if(sliced_nd_hist.begin() + start, sliced_nd_hist.begin() + stop, |
125 | [](size_t i) { return i != 0; }); |
126 | if (norm) { |
127 | for (int k = start; k < stop; k++) { |
128 | if (p[k]) q[k] = quantized_bins[j] / norm; |
129 | } |
130 | } |
131 | } |
132 | p = SmoothDistribution(p); |
133 | q = SmoothDistribution(q); |
134 | |
135 | if (!q.size()) { |
136 | divergence[i - num_half_quantized_bins] = std::numeric_limits<float>::infinity(); |
137 | } else { |
138 | divergence[i - num_half_quantized_bins] = ComputeEntropy(p.data(), q.data(), p.size()); |
139 | } |
140 | } |
141 | auto min_divergence_idx = |
142 | std::distance(divergence.begin(), std::min_element(divergence.begin(), divergence.end())); |
143 | return thresholds[min_divergence_idx]; |
144 | } |
145 | |
146 | class StatsCollector : private ExprMutator { |
147 | public: |
148 | StatsCollector() : simulated_quantize_op_(Op::Get("relay.op.annotation.simulated_quantize" )) {} |
149 | |
150 | Expr Collect(const Expr& expr) { |
151 | auto new_e = this->Mutate(expr); |
152 | const FunctionNode* func = new_e.as<FunctionNode>(); |
153 | ICHECK(func) << "Input shoule be Function" ; |
154 | Expr new_body = Tuple(std::move(profile_data_)); |
155 | Function ret_func = WithFields(GetRef<Function>(func), FreeVars(new_body), new_body); |
156 | |
157 | // We are changing the function's ret_type to an empty type. Unfortunately, Optional<Type>() is |
158 | // indistinguishable from NullValue<Type>(), so we can't express "update to nullptr" in |
159 | // WithFields. |
160 | ret_func.CopyOnWrite()->ret_type = NullValue<Type>(); |
161 | return std::move(ret_func); |
162 | } |
163 | |
164 | private: |
165 | Array<Expr> profile_data_; |
166 | const Op& simulated_quantize_op_; |
167 | |
168 | Expr VisitExpr_(const CallNode* call) { |
169 | Expr new_e = ExprMutator::VisitExpr_(call); |
170 | const CallNode* new_call = new_e.as<CallNode>(); |
171 | ICHECK(new_call); |
172 | if (new_call->op == simulated_quantize_op_) { |
173 | auto attrs = new_call->attrs.as<SimulatedQuantizeAttrs>(); |
174 | // rewrite the annotation |
175 | auto new_attrs = make_object<SimulatedQuantizeAttrs>(); |
176 | const Expr& quantize_input = new_call->args[0]; // expression being quantized |
177 | auto placeholder = MakeConstantScalar(DataType::Float(32), 0.); // unused argument |
178 | Array<Expr> new_args{quantize_input, placeholder, placeholder, placeholder}; |
179 | new_attrs->kind = QAnnotateKind::kQIdentity; |
180 | new_attrs->sign = attrs->sign; |
181 | new_attrs->rounding = attrs->rounding; |
182 | Expr identity_quantize = Call(new_call->op, new_args, Attrs{new_attrs}, {}); |
183 | |
184 | // add non-const expressions to profile data |
185 | if (attrs->kind != QAnnotateKind::kQWeight) { |
186 | ICHECK(!quantize_input.as<ConstantNode>()); |
187 | profile_data_.push_back(identity_quantize); |
188 | } |
189 | return identity_quantize; |
190 | } else { |
191 | return new_e; |
192 | } |
193 | } |
194 | }; |
195 | |
196 | /* |
197 | * \brief Given an annotated graph, create a profile graph to collect profile data from the |
198 | * calibration dataset. |
199 | * |
200 | * This pass collects simulated_quantize op into a tuple. Simulated_quantize ops are rewritten to |
201 | * identity mode. The tuple is the output of the profile graph. Both input and output of this pass |
202 | * are relay::Function. |
203 | * |
204 | * \param expr The simulation graph after annotation. |
205 | * \return The profile graph. |
206 | */ |
207 | Expr CreateStatsCollector(const Expr& expr) { return StatsCollector().Collect(expr); } |
208 | |
209 | TVM_REGISTER_GLOBAL("relay._quantize.CreateStatsCollector" ).set_body_typed(CreateStatsCollector); |
210 | |
211 | TVM_REGISTER_GLOBAL("relay._quantize.FindScaleByKLMinimization" ) |
212 | .set_body([](TVMArgs args, TVMRetValue* ret) { |
213 | int* hist_ptr = static_cast<int*>(static_cast<void*>(args[0])); |
214 | float* hist_edges_ptr = static_cast<float*>(static_cast<void*>(args[1])); |
215 | int num_bins = args[2]; |
216 | int num_quantized_bins = args[3]; |
217 | std::vector<int> hist(hist_ptr, hist_ptr + num_bins); |
218 | std::vector<float> hist_edges(hist_edges_ptr, hist_edges_ptr + num_bins + 1); |
219 | ret[0] = MinimizeKL(hist, hist_edges, num_bins, num_quantized_bins); |
220 | }); |
221 | |
222 | } // namespace quantize |
223 | } // namespace relay |
224 | } // namespace tvm |
225 | |