1 | /* stringlib: split implementation */ |
2 | |
3 | #ifndef STRINGLIB_FASTSEARCH_H |
4 | #error must include "stringlib/fastsearch.h" before including this module |
5 | #endif |
6 | |
7 | /* Overallocate the initial list to reduce the number of reallocs for small |
8 | split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three |
9 | resizes, to sizes 4, 8, then 16. Most observed string splits are for human |
10 | text (roughly 11 words per line) and field delimited data (usually 1-10 |
11 | fields). For large strings the split algorithms are bandwidth limited |
12 | so increasing the preallocation likely will not improve things.*/ |
13 | |
14 | #define MAX_PREALLOC 12 |
15 | |
16 | /* 5 splits gives 6 elements */ |
17 | #define PREALLOC_SIZE(maxsplit) \ |
18 | (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) |
19 | |
20 | #define SPLIT_APPEND(data, left, right) \ |
21 | sub = STRINGLIB_NEW((data) + (left), \ |
22 | (right) - (left)); \ |
23 | if (sub == NULL) \ |
24 | goto onError; \ |
25 | if (PyList_Append(list, sub)) { \ |
26 | Py_DECREF(sub); \ |
27 | goto onError; \ |
28 | } \ |
29 | else \ |
30 | Py_DECREF(sub); |
31 | |
32 | #define SPLIT_ADD(data, left, right) { \ |
33 | sub = STRINGLIB_NEW((data) + (left), \ |
34 | (right) - (left)); \ |
35 | if (sub == NULL) \ |
36 | goto onError; \ |
37 | if (count < MAX_PREALLOC) { \ |
38 | PyList_SET_ITEM(list, count, sub); \ |
39 | } else { \ |
40 | if (PyList_Append(list, sub)) { \ |
41 | Py_DECREF(sub); \ |
42 | goto onError; \ |
43 | } \ |
44 | else \ |
45 | Py_DECREF(sub); \ |
46 | } \ |
47 | count++; } |
48 | |
49 | |
50 | /* Always force the list to the expected size. */ |
51 | #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count) |
52 | |
53 | Py_LOCAL_INLINE(PyObject *) |
54 | STRINGLIB(split_whitespace)(PyObject* str_obj, |
55 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
56 | Py_ssize_t maxcount) |
57 | { |
58 | Py_ssize_t i, j, count=0; |
59 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
60 | PyObject *sub; |
61 | |
62 | if (list == NULL) |
63 | return NULL; |
64 | |
65 | i = j = 0; |
66 | while (maxcount-- > 0) { |
67 | while (i < str_len && STRINGLIB_ISSPACE(str[i])) |
68 | i++; |
69 | if (i == str_len) break; |
70 | j = i; i++; |
71 | while (i < str_len && !STRINGLIB_ISSPACE(str[i])) |
72 | i++; |
73 | #ifndef STRINGLIB_MUTABLE |
74 | if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { |
75 | /* No whitespace in str_obj, so just use it as list[0] */ |
76 | Py_INCREF(str_obj); |
77 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
78 | count++; |
79 | break; |
80 | } |
81 | #endif |
82 | SPLIT_ADD(str, j, i); |
83 | } |
84 | |
85 | if (i < str_len) { |
86 | /* Only occurs when maxcount was reached */ |
87 | /* Skip any remaining whitespace and copy to end of string */ |
88 | while (i < str_len && STRINGLIB_ISSPACE(str[i])) |
89 | i++; |
90 | if (i != str_len) |
91 | SPLIT_ADD(str, i, str_len); |
92 | } |
93 | FIX_PREALLOC_SIZE(list); |
94 | return list; |
95 | |
96 | onError: |
97 | Py_DECREF(list); |
98 | return NULL; |
99 | } |
100 | |
101 | Py_LOCAL_INLINE(PyObject *) |
102 | STRINGLIB(split_char)(PyObject* str_obj, |
103 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
104 | const STRINGLIB_CHAR ch, |
105 | Py_ssize_t maxcount) |
106 | { |
107 | Py_ssize_t i, j, count=0; |
108 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
109 | PyObject *sub; |
110 | |
111 | if (list == NULL) |
112 | return NULL; |
113 | |
114 | i = j = 0; |
115 | while ((j < str_len) && (maxcount-- > 0)) { |
116 | for(; j < str_len; j++) { |
117 | /* I found that using memchr makes no difference */ |
118 | if (str[j] == ch) { |
119 | SPLIT_ADD(str, i, j); |
120 | i = j = j + 1; |
121 | break; |
122 | } |
123 | } |
124 | } |
125 | #ifndef STRINGLIB_MUTABLE |
126 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
127 | /* ch not in str_obj, so just use str_obj as list[0] */ |
128 | Py_INCREF(str_obj); |
129 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
130 | count++; |
131 | } else |
132 | #endif |
133 | if (i <= str_len) { |
134 | SPLIT_ADD(str, i, str_len); |
135 | } |
136 | FIX_PREALLOC_SIZE(list); |
137 | return list; |
138 | |
139 | onError: |
140 | Py_DECREF(list); |
141 | return NULL; |
142 | } |
143 | |
144 | Py_LOCAL_INLINE(PyObject *) |
145 | STRINGLIB(split)(PyObject* str_obj, |
146 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
147 | const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, |
148 | Py_ssize_t maxcount) |
149 | { |
150 | Py_ssize_t i, j, pos, count=0; |
151 | PyObject *list, *sub; |
152 | |
153 | if (sep_len == 0) { |
154 | PyErr_SetString(PyExc_ValueError, "empty separator" ); |
155 | return NULL; |
156 | } |
157 | else if (sep_len == 1) |
158 | return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); |
159 | |
160 | list = PyList_New(PREALLOC_SIZE(maxcount)); |
161 | if (list == NULL) |
162 | return NULL; |
163 | |
164 | i = j = 0; |
165 | while (maxcount-- > 0) { |
166 | pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); |
167 | if (pos < 0) |
168 | break; |
169 | j = i + pos; |
170 | SPLIT_ADD(str, i, j); |
171 | i = j + sep_len; |
172 | } |
173 | #ifndef STRINGLIB_MUTABLE |
174 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
175 | /* No match in str_obj, so just use it as list[0] */ |
176 | Py_INCREF(str_obj); |
177 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
178 | count++; |
179 | } else |
180 | #endif |
181 | { |
182 | SPLIT_ADD(str, i, str_len); |
183 | } |
184 | FIX_PREALLOC_SIZE(list); |
185 | return list; |
186 | |
187 | onError: |
188 | Py_DECREF(list); |
189 | return NULL; |
190 | } |
191 | |
192 | Py_LOCAL_INLINE(PyObject *) |
193 | STRINGLIB(rsplit_whitespace)(PyObject* str_obj, |
194 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
195 | Py_ssize_t maxcount) |
196 | { |
197 | Py_ssize_t i, j, count=0; |
198 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
199 | PyObject *sub; |
200 | |
201 | if (list == NULL) |
202 | return NULL; |
203 | |
204 | i = j = str_len - 1; |
205 | while (maxcount-- > 0) { |
206 | while (i >= 0 && STRINGLIB_ISSPACE(str[i])) |
207 | i--; |
208 | if (i < 0) break; |
209 | j = i; i--; |
210 | while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) |
211 | i--; |
212 | #ifndef STRINGLIB_MUTABLE |
213 | if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
214 | /* No whitespace in str_obj, so just use it as list[0] */ |
215 | Py_INCREF(str_obj); |
216 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
217 | count++; |
218 | break; |
219 | } |
220 | #endif |
221 | SPLIT_ADD(str, i + 1, j + 1); |
222 | } |
223 | |
224 | if (i >= 0) { |
225 | /* Only occurs when maxcount was reached */ |
226 | /* Skip any remaining whitespace and copy to beginning of string */ |
227 | while (i >= 0 && STRINGLIB_ISSPACE(str[i])) |
228 | i--; |
229 | if (i >= 0) |
230 | SPLIT_ADD(str, 0, i + 1); |
231 | } |
232 | FIX_PREALLOC_SIZE(list); |
233 | if (PyList_Reverse(list) < 0) |
234 | goto onError; |
235 | return list; |
236 | |
237 | onError: |
238 | Py_DECREF(list); |
239 | return NULL; |
240 | } |
241 | |
242 | Py_LOCAL_INLINE(PyObject *) |
243 | STRINGLIB(rsplit_char)(PyObject* str_obj, |
244 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
245 | const STRINGLIB_CHAR ch, |
246 | Py_ssize_t maxcount) |
247 | { |
248 | Py_ssize_t i, j, count=0; |
249 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
250 | PyObject *sub; |
251 | |
252 | if (list == NULL) |
253 | return NULL; |
254 | |
255 | i = j = str_len - 1; |
256 | while ((i >= 0) && (maxcount-- > 0)) { |
257 | for(; i >= 0; i--) { |
258 | if (str[i] == ch) { |
259 | SPLIT_ADD(str, i + 1, j + 1); |
260 | j = i = i - 1; |
261 | break; |
262 | } |
263 | } |
264 | } |
265 | #ifndef STRINGLIB_MUTABLE |
266 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
267 | /* ch not in str_obj, so just use str_obj as list[0] */ |
268 | Py_INCREF(str_obj); |
269 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
270 | count++; |
271 | } else |
272 | #endif |
273 | if (j >= -1) { |
274 | SPLIT_ADD(str, 0, j + 1); |
275 | } |
276 | FIX_PREALLOC_SIZE(list); |
277 | if (PyList_Reverse(list) < 0) |
278 | goto onError; |
279 | return list; |
280 | |
281 | onError: |
282 | Py_DECREF(list); |
283 | return NULL; |
284 | } |
285 | |
286 | Py_LOCAL_INLINE(PyObject *) |
287 | STRINGLIB(rsplit)(PyObject* str_obj, |
288 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
289 | const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, |
290 | Py_ssize_t maxcount) |
291 | { |
292 | Py_ssize_t j, pos, count=0; |
293 | PyObject *list, *sub; |
294 | |
295 | if (sep_len == 0) { |
296 | PyErr_SetString(PyExc_ValueError, "empty separator" ); |
297 | return NULL; |
298 | } |
299 | else if (sep_len == 1) |
300 | return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); |
301 | |
302 | list = PyList_New(PREALLOC_SIZE(maxcount)); |
303 | if (list == NULL) |
304 | return NULL; |
305 | |
306 | j = str_len; |
307 | while (maxcount-- > 0) { |
308 | pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); |
309 | if (pos < 0) |
310 | break; |
311 | SPLIT_ADD(str, pos + sep_len, j); |
312 | j = pos; |
313 | } |
314 | #ifndef STRINGLIB_MUTABLE |
315 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
316 | /* No match in str_obj, so just use it as list[0] */ |
317 | Py_INCREF(str_obj); |
318 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
319 | count++; |
320 | } else |
321 | #endif |
322 | { |
323 | SPLIT_ADD(str, 0, j); |
324 | } |
325 | FIX_PREALLOC_SIZE(list); |
326 | if (PyList_Reverse(list) < 0) |
327 | goto onError; |
328 | return list; |
329 | |
330 | onError: |
331 | Py_DECREF(list); |
332 | return NULL; |
333 | } |
334 | |
335 | Py_LOCAL_INLINE(PyObject *) |
336 | STRINGLIB(splitlines)(PyObject* str_obj, |
337 | const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
338 | int keepends) |
339 | { |
340 | /* This does not use the preallocated list because splitlines is |
341 | usually run with hundreds of newlines. The overhead of |
342 | switching between PyList_SET_ITEM and append causes about a |
343 | 2-3% slowdown for that common case. A smarter implementation |
344 | could move the if check out, so the SET_ITEMs are done first |
345 | and the appends only done when the prealloc buffer is full. |
346 | That's too much work for little gain.*/ |
347 | |
348 | Py_ssize_t i; |
349 | Py_ssize_t j; |
350 | PyObject *list = PyList_New(0); |
351 | PyObject *sub; |
352 | |
353 | if (list == NULL) |
354 | return NULL; |
355 | |
356 | for (i = j = 0; i < str_len; ) { |
357 | Py_ssize_t eol; |
358 | |
359 | /* Find a line and append it */ |
360 | while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) |
361 | i++; |
362 | |
363 | /* Skip the line break reading CRLF as one line break */ |
364 | eol = i; |
365 | if (i < str_len) { |
366 | if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') |
367 | i += 2; |
368 | else |
369 | i++; |
370 | if (keepends) |
371 | eol = i; |
372 | } |
373 | #ifndef STRINGLIB_MUTABLE |
374 | if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { |
375 | /* No linebreak in str_obj, so just use it as list[0] */ |
376 | if (PyList_Append(list, str_obj)) |
377 | goto onError; |
378 | break; |
379 | } |
380 | #endif |
381 | SPLIT_APPEND(str, j, eol); |
382 | j = i; |
383 | } |
384 | return list; |
385 | |
386 | onError: |
387 | Py_DECREF(list); |
388 | return NULL; |
389 | } |
390 | |
391 | |