1 | #include "Python.h" |
2 | #include "frameobject.h" |
3 | |
4 | #include "pycore_pyerrors.h" |
5 | |
6 | #define MAX_CANDIDATE_ITEMS 750 |
7 | #define MAX_STRING_SIZE 40 |
8 | |
9 | #define MOVE_COST 2 |
10 | #define CASE_COST 1 |
11 | |
12 | #define LEAST_FIVE_BITS(n) ((n) & 31) |
13 | |
14 | static inline int |
15 | substitution_cost(char a, char b) |
16 | { |
17 | if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) { |
18 | // Not the same, not a case flip. |
19 | return MOVE_COST; |
20 | } |
21 | if (a == b) { |
22 | return 0; |
23 | } |
24 | if ('A' <= a && a <= 'Z') { |
25 | a += ('a' - 'A'); |
26 | } |
27 | if ('A' <= b && b <= 'Z') { |
28 | b += ('a' - 'A'); |
29 | } |
30 | if (a == b) { |
31 | return CASE_COST; |
32 | } |
33 | return MOVE_COST; |
34 | } |
35 | |
36 | /* Calculate the Levenshtein distance between string1 and string2 */ |
37 | static Py_ssize_t |
38 | levenshtein_distance(const char *a, size_t a_size, |
39 | const char *b, size_t b_size, |
40 | size_t max_cost) |
41 | { |
42 | static size_t buffer[MAX_STRING_SIZE]; |
43 | |
44 | // Both strings are the same (by identity) |
45 | if (a == b) { |
46 | return 0; |
47 | } |
48 | |
49 | // Trim away common affixes. |
50 | while (a_size && b_size && a[0] == b[0]) { |
51 | a++; a_size--; |
52 | b++; b_size--; |
53 | } |
54 | while (a_size && b_size && a[a_size-1] == b[b_size-1]) { |
55 | a_size--; |
56 | b_size--; |
57 | } |
58 | if (a_size == 0 || b_size == 0) { |
59 | return (a_size + b_size) * MOVE_COST; |
60 | } |
61 | if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) { |
62 | return max_cost + 1; |
63 | } |
64 | |
65 | // Prefer shorter buffer |
66 | if (b_size < a_size) { |
67 | const char *t = a; a = b; b = t; |
68 | size_t t_size = a_size; a_size = b_size; b_size = t_size; |
69 | } |
70 | |
71 | // quick fail when a match is impossible. |
72 | if ((b_size - a_size) * MOVE_COST > max_cost) { |
73 | return max_cost + 1; |
74 | } |
75 | |
76 | // Instead of producing the whole traditional len(a)-by-len(b) |
77 | // matrix, we can update just one row in place. |
78 | // Initialize the buffer row |
79 | for (size_t i = 0; i < a_size; i++) { |
80 | // cost from b[:0] to a[:i+1] |
81 | buffer[i] = (i + 1) * MOVE_COST; |
82 | } |
83 | |
84 | size_t result = 0; |
85 | for (size_t b_index = 0; b_index < b_size; b_index++) { |
86 | char code = b[b_index]; |
87 | // cost(b[:b_index], a[:0]) == b_index * MOVE_COST |
88 | size_t distance = result = b_index * MOVE_COST; |
89 | size_t minimum = SIZE_MAX; |
90 | for (size_t index = 0; index < a_size; index++) { |
91 | |
92 | // cost(b[:b_index+1], a[:index+1]) = min( |
93 | // // 1) substitute |
94 | // cost(b[:b_index], a[:index]) |
95 | // + substitution_cost(b[b_index], a[index]), |
96 | // // 2) delete from b |
97 | // cost(b[:b_index], a[:index+1]) + MOVE_COST, |
98 | // // 3) delete from a |
99 | // cost(b[:b_index+1], a[index]) + MOVE_COST |
100 | // ) |
101 | |
102 | // 1) Previous distance in this row is cost(b[:b_index], a[:index]) |
103 | size_t substitute = distance + substitution_cost(code, a[index]); |
104 | // 2) cost(b[:b_index], a[:index+1]) from previous row |
105 | distance = buffer[index]; |
106 | // 3) existing result is cost(b[:b_index+1], a[index]) |
107 | |
108 | size_t insert_delete = Py_MIN(result, distance) + MOVE_COST; |
109 | result = Py_MIN(insert_delete, substitute); |
110 | |
111 | // cost(b[:b_index+1], a[:index+1]) |
112 | buffer[index] = result; |
113 | if (result < minimum) { |
114 | minimum = result; |
115 | } |
116 | } |
117 | if (minimum > max_cost) { |
118 | // Everything in this row is too big, so bail early. |
119 | return max_cost + 1; |
120 | } |
121 | } |
122 | return result; |
123 | } |
124 | |
125 | static inline PyObject * |
126 | calculate_suggestions(PyObject *dir, |
127 | PyObject *name) |
128 | { |
129 | assert(!PyErr_Occurred()); |
130 | assert(PyList_CheckExact(dir)); |
131 | |
132 | Py_ssize_t dir_size = PyList_GET_SIZE(dir); |
133 | if (dir_size >= MAX_CANDIDATE_ITEMS) { |
134 | return NULL; |
135 | } |
136 | |
137 | Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX; |
138 | PyObject *suggestion = NULL; |
139 | Py_ssize_t name_size; |
140 | const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size); |
141 | if (name_str == NULL) { |
142 | return NULL; |
143 | } |
144 | |
145 | for (int i = 0; i < dir_size; ++i) { |
146 | PyObject *item = PyList_GET_ITEM(dir, i); |
147 | Py_ssize_t item_size; |
148 | const char *item_str = PyUnicode_AsUTF8AndSize(item, &item_size); |
149 | if (item_str == NULL) { |
150 | return NULL; |
151 | } |
152 | if (PyUnicode_CompareWithASCIIString(name, item_str) == 0) { |
153 | continue; |
154 | } |
155 | // No more than 1/3 of the involved characters should need changed. |
156 | Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6; |
157 | // Don't take matches we've already beaten. |
158 | max_distance = Py_MIN(max_distance, suggestion_distance - 1); |
159 | Py_ssize_t current_distance = |
160 | levenshtein_distance(name_str, name_size, |
161 | item_str, item_size, max_distance); |
162 | if (current_distance > max_distance) { |
163 | continue; |
164 | } |
165 | if (!suggestion || current_distance < suggestion_distance) { |
166 | suggestion = item; |
167 | suggestion_distance = current_distance; |
168 | } |
169 | } |
170 | Py_XINCREF(suggestion); |
171 | return suggestion; |
172 | } |
173 | |
174 | static PyObject * |
175 | offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) |
176 | { |
177 | PyObject *name = exc->name; // borrowed reference |
178 | PyObject *obj = exc->obj; // borrowed reference |
179 | |
180 | // Abort if we don't have an attribute name or we have an invalid one |
181 | if (name == NULL || obj == NULL || !PyUnicode_CheckExact(name)) { |
182 | return NULL; |
183 | } |
184 | |
185 | PyObject *dir = PyObject_Dir(obj); |
186 | if (dir == NULL) { |
187 | return NULL; |
188 | } |
189 | |
190 | PyObject *suggestions = calculate_suggestions(dir, name); |
191 | Py_DECREF(dir); |
192 | return suggestions; |
193 | } |
194 | |
195 | |
196 | static PyObject * |
197 | offer_suggestions_for_name_error(PyNameErrorObject *exc) |
198 | { |
199 | PyObject *name = exc->name; // borrowed reference |
200 | PyTracebackObject *traceback = (PyTracebackObject *) exc->traceback; // borrowed reference |
201 | // Abort if we don't have a variable name or we have an invalid one |
202 | // or if we don't have a traceback to work with |
203 | if (name == NULL || !PyUnicode_CheckExact(name) || |
204 | traceback == NULL || !Py_IS_TYPE(traceback, &PyTraceBack_Type) |
205 | ) { |
206 | return NULL; |
207 | } |
208 | |
209 | // Move to the traceback of the exception |
210 | while (1) { |
211 | PyTracebackObject *next = traceback->tb_next; |
212 | if (next == NULL || !Py_IS_TYPE(next, &PyTraceBack_Type)) { |
213 | break; |
214 | } |
215 | else { |
216 | traceback = next; |
217 | } |
218 | } |
219 | |
220 | PyFrameObject *frame = traceback->tb_frame; |
221 | assert(frame != NULL); |
222 | PyCodeObject *code = frame->f_code; |
223 | assert(code != NULL && code->co_varnames != NULL); |
224 | PyObject *dir = PySequence_List(code->co_varnames); |
225 | if (dir == NULL) { |
226 | return NULL; |
227 | } |
228 | |
229 | PyObject *suggestions = calculate_suggestions(dir, name); |
230 | Py_DECREF(dir); |
231 | if (suggestions != NULL) { |
232 | return suggestions; |
233 | } |
234 | |
235 | dir = PySequence_List(frame->f_globals); |
236 | if (dir == NULL) { |
237 | return NULL; |
238 | } |
239 | suggestions = calculate_suggestions(dir, name); |
240 | Py_DECREF(dir); |
241 | if (suggestions != NULL) { |
242 | return suggestions; |
243 | } |
244 | |
245 | dir = PySequence_List(frame->f_builtins); |
246 | if (dir == NULL) { |
247 | return NULL; |
248 | } |
249 | suggestions = calculate_suggestions(dir, name); |
250 | Py_DECREF(dir); |
251 | |
252 | return suggestions; |
253 | } |
254 | |
255 | // Offer suggestions for a given exception. Returns a python string object containing the |
256 | // suggestions. This function returns NULL if no suggestion was found or if an exception happened, |
257 | // users must call PyErr_Occurred() to disambiguate. |
258 | PyObject * |
259 | _Py_Offer_Suggestions(PyObject *exception) |
260 | { |
261 | PyObject *result = NULL; |
262 | assert(!PyErr_Occurred()); |
263 | if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_AttributeError)) { |
264 | result = offer_suggestions_for_attribute_error((PyAttributeErrorObject *) exception); |
265 | } else if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_NameError)) { |
266 | result = offer_suggestions_for_name_error((PyNameErrorObject *) exception); |
267 | } |
268 | return result; |
269 | } |
270 | |
271 | Py_ssize_t |
272 | _Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost) |
273 | { |
274 | assert(PyUnicode_Check(a) && PyUnicode_Check(b)); |
275 | Py_ssize_t size_a, size_b; |
276 | const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a); |
277 | if (utf8_a == NULL) { |
278 | return -1; |
279 | } |
280 | const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b); |
281 | if (utf8_b == NULL) { |
282 | return -1; |
283 | } |
284 | if (max_cost == -1) { |
285 | max_cost = MOVE_COST * Py_MAX(size_a, size_b); |
286 | } |
287 | return levenshtein_distance(utf8_a, size_a, utf8_b, size_b, max_cost); |
288 | } |
289 | |
290 | |