File Lib/sre_compile.py changed (mode: 100644) (index db8b8a2778..cebecb93c0) |
... |
... |
def _compile(code, pattern, flags): |
69 |
69 |
REPEATING_CODES = _REPEATING_CODES |
REPEATING_CODES = _REPEATING_CODES |
70 |
70 |
SUCCESS_CODES = _SUCCESS_CODES |
SUCCESS_CODES = _SUCCESS_CODES |
71 |
71 |
ASSERT_CODES = _ASSERT_CODES |
ASSERT_CODES = _ASSERT_CODES |
|
72 |
|
iscased = None |
72 |
73 |
tolower = None |
tolower = None |
73 |
74 |
fixes = None |
fixes = None |
74 |
75 |
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: |
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: |
75 |
76 |
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: |
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: |
|
77 |
|
iscased = _sre.unicode_iscased |
76 |
78 |
tolower = _sre.unicode_tolower |
tolower = _sre.unicode_tolower |
77 |
79 |
fixes = _ignorecase_fixes |
fixes = _ignorecase_fixes |
78 |
80 |
else: |
else: |
|
81 |
|
iscased = _sre.ascii_iscased |
79 |
82 |
tolower = _sre.ascii_tolower |
tolower = _sre.ascii_tolower |
80 |
83 |
for op, av in pattern: |
for op, av in pattern: |
81 |
84 |
if op in LITERAL_CODES: |
if op in LITERAL_CODES: |
|
... |
... |
def _compile(code, pattern, flags): |
85 |
88 |
elif flags & SRE_FLAG_LOCALE: |
elif flags & SRE_FLAG_LOCALE: |
86 |
89 |
emit(OP_LOC_IGNORE[op]) |
emit(OP_LOC_IGNORE[op]) |
87 |
90 |
emit(av) |
emit(av) |
|
91 |
|
elif not iscased(av): |
|
92 |
|
emit(op) |
|
93 |
|
emit(av) |
88 |
94 |
else: |
else: |
89 |
95 |
lo = tolower(av) |
lo = tolower(av) |
90 |
96 |
if fixes and lo in fixes: |
if fixes and lo in fixes: |
|
... |
... |
def _compile(code, pattern, flags): |
101 |
107 |
emit(OP_IGNORE[op]) |
emit(OP_IGNORE[op]) |
102 |
108 |
emit(lo) |
emit(lo) |
103 |
109 |
elif op is IN: |
elif op is IN: |
104 |
|
if not flags & SRE_FLAG_IGNORECASE: |
|
105 |
|
emit(op) |
|
106 |
|
elif flags & SRE_FLAG_LOCALE: |
|
|
110 |
|
charset, hascased = _optimize_charset(av, iscased, tolower, fixes) |
|
111 |
|
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: |
107 |
112 |
emit(IN_LOC_IGNORE) |
emit(IN_LOC_IGNORE) |
108 |
|
else: |
|
|
113 |
|
elif hascased: |
109 |
114 |
emit(IN_IGNORE) |
emit(IN_IGNORE) |
|
115 |
|
else: |
|
116 |
|
emit(IN) |
110 |
117 |
skip = _len(code); emit(0) |
skip = _len(code); emit(0) |
111 |
|
_compile_charset(av, flags, code, tolower, fixes) |
|
|
118 |
|
_compile_charset(charset, flags, code) |
112 |
119 |
code[skip] = _len(code) - skip |
code[skip] = _len(code) - skip |
113 |
120 |
elif op is ANY: |
elif op is ANY: |
114 |
121 |
if flags & SRE_FLAG_DOTALL: |
if flags & SRE_FLAG_DOTALL: |
|
... |
... |
def _compile(code, pattern, flags): |
223 |
230 |
else: |
else: |
224 |
231 |
raise error("internal: unsupported operand type %r" % (op,)) |
raise error("internal: unsupported operand type %r" % (op,)) |
225 |
232 |
|
|
226 |
|
def _compile_charset(charset, flags, code, fixup=None, fixes=None): |
|
|
233 |
|
def _compile_charset(charset, flags, code): |
227 |
234 |
# compile charset subprogram |
# compile charset subprogram |
228 |
235 |
emit = code.append |
emit = code.append |
229 |
|
for op, av in _optimize_charset(charset, fixup, fixes): |
|
|
236 |
|
for op, av in charset: |
230 |
237 |
emit(op) |
emit(op) |
231 |
238 |
if op is NEGATE: |
if op is NEGATE: |
232 |
239 |
pass |
pass |
|
... |
... |
def _compile_charset(charset, flags, code, fixup=None, fixes=None): |
250 |
257 |
raise error("internal: unsupported set operator %r" % (op,)) |
raise error("internal: unsupported set operator %r" % (op,)) |
251 |
258 |
emit(FAILURE) |
emit(FAILURE) |
252 |
259 |
|
|
253 |
|
def _optimize_charset(charset, fixup, fixes): |
|
|
260 |
|
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): |
254 |
261 |
# internal: optimize character set |
# internal: optimize character set |
255 |
262 |
out = [] |
out = [] |
256 |
263 |
tail = [] |
tail = [] |
257 |
264 |
charmap = bytearray(256) |
charmap = bytearray(256) |
|
265 |
|
hascased = False |
258 |
266 |
for op, av in charset: |
for op, av in charset: |
259 |
267 |
while True: |
while True: |
260 |
268 |
try: |
try: |
|
... |
... |
def _optimize_charset(charset, fixup, fixes): |
265 |
273 |
if fixes and lo in fixes: |
if fixes and lo in fixes: |
266 |
274 |
for k in fixes[lo]: |
for k in fixes[lo]: |
267 |
275 |
charmap[k] = 1 |
charmap[k] = 1 |
|
276 |
|
if not hascased and iscased(av): |
|
277 |
|
hascased = True |
268 |
278 |
else: |
else: |
269 |
279 |
charmap[av] = 1 |
charmap[av] = 1 |
270 |
280 |
elif op is RANGE: |
elif op is RANGE: |
271 |
281 |
r = range(av[0], av[1]+1) |
r = range(av[0], av[1]+1) |
272 |
282 |
if fixup: |
if fixup: |
273 |
|
r = map(fixup, r) |
|
274 |
|
if fixup and fixes: |
|
275 |
|
for i in r: |
|
276 |
|
charmap[i] = 1 |
|
277 |
|
if i in fixes: |
|
278 |
|
for k in fixes[i]: |
|
279 |
|
charmap[k] = 1 |
|
|
283 |
|
if fixes: |
|
284 |
|
for i in map(fixup, r): |
|
285 |
|
charmap[i] = 1 |
|
286 |
|
if i in fixes: |
|
287 |
|
for k in fixes[i]: |
|
288 |
|
charmap[k] = 1 |
|
289 |
|
else: |
|
290 |
|
for i in map(fixup, r): |
|
291 |
|
charmap[i] = 1 |
|
292 |
|
if not hascased: |
|
293 |
|
hascased = any(map(iscased, r)) |
280 |
294 |
else: |
else: |
281 |
295 |
for i in r: |
for i in r: |
282 |
296 |
charmap[i] = 1 |
charmap[i] = 1 |
|
... |
... |
def _optimize_charset(charset, fixup, fixes): |
290 |
304 |
charmap += b'\0' * 0xff00 |
charmap += b'\0' * 0xff00 |
291 |
305 |
continue |
continue |
292 |
306 |
# Character set contains non-BMP character codes. |
# Character set contains non-BMP character codes. |
293 |
|
# There are only two ranges of cased non-BMP characters: |
|
294 |
|
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), |
|
295 |
|
# and for both ranges RANGE_IGNORE works. |
|
296 |
|
if fixup and op is RANGE: |
|
297 |
|
op = RANGE_IGNORE |
|
|
307 |
|
if fixup: |
|
308 |
|
hascased = True |
|
309 |
|
# There are only two ranges of cased non-BMP characters: |
|
310 |
|
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), |
|
311 |
|
# and for both ranges RANGE_IGNORE works. |
|
312 |
|
if op is RANGE: |
|
313 |
|
op = RANGE_IGNORE |
298 |
314 |
tail.append((op, av)) |
tail.append((op, av)) |
299 |
315 |
break |
break |
300 |
316 |
|
|
|
... |
... |
def _optimize_charset(charset, fixup, fixes): |
322 |
338 |
out.append((RANGE, (p, q - 1))) |
out.append((RANGE, (p, q - 1))) |
323 |
339 |
out += tail |
out += tail |
324 |
340 |
# if the case was changed or new representation is more compact |
# if the case was changed or new representation is more compact |
325 |
|
if fixup or len(out) < len(charset): |
|
326 |
|
return out |
|
|
341 |
|
if hascased or len(out) < len(charset): |
|
342 |
|
return out, hascased |
327 |
343 |
# else original character set is good enough |
# else original character set is good enough |
328 |
|
return charset |
|
|
344 |
|
return charset, hascased |
329 |
345 |
|
|
330 |
346 |
# use bitmap |
# use bitmap |
331 |
347 |
if len(charmap) == 256: |
if len(charmap) == 256: |
332 |
348 |
data = _mk_bitmap(charmap) |
data = _mk_bitmap(charmap) |
333 |
349 |
out.append((CHARSET, data)) |
out.append((CHARSET, data)) |
334 |
350 |
out += tail |
out += tail |
335 |
|
return out |
|
|
351 |
|
return out, hascased |
336 |
352 |
|
|
337 |
353 |
# To represent a big charset, first a bitmap of all characters in the |
# To represent a big charset, first a bitmap of all characters in the |
338 |
354 |
# set is constructed. Then, this bitmap is sliced into chunks of 256 |
# set is constructed. Then, this bitmap is sliced into chunks of 256 |
|
... |
... |
def _optimize_charset(charset, fixup, fixes): |
371 |
387 |
data[0:0] = [block] + _bytes_to_codes(mapping) |
data[0:0] = [block] + _bytes_to_codes(mapping) |
372 |
388 |
out.append((BIGCHARSET, data)) |
out.append((BIGCHARSET, data)) |
373 |
389 |
out += tail |
out += tail |
374 |
|
return out |
|
|
390 |
|
return out, hascased |
375 |
391 |
|
|
376 |
392 |
_CODEBITS = _sre.CODESIZE * 8 |
_CODEBITS = _sre.CODESIZE * 8 |
377 |
393 |
MAXCODE = (1 << _CODEBITS) - 1 |
MAXCODE = (1 << _CODEBITS) - 1 |
|
... |
... |
def _generate_overlap_table(prefix): |
414 |
430 |
table[i] = idx + 1 |
table[i] = idx + 1 |
415 |
431 |
return table |
return table |
416 |
432 |
|
|
417 |
|
def _get_literal_prefix(pattern): |
|
|
433 |
|
def _get_iscased(flags):
    # internal: select the "is this character cased?" predicate for flags.
    #
    # Returns None when SRE_FLAG_IGNORECASE is not set, so callers can
    # test the result for truth before calling it.  Otherwise returns
    # _sre.unicode_iscased when SRE_FLAG_UNICODE is set without
    # SRE_FLAG_ASCII, and _sre.ascii_iscased in every other case.
    #
    # SRE_FLAG_LOCALE is not examined here; the prefix helpers that call
    # this function reject IGNORECASE combined with LOCALE on their own.
    if not flags & SRE_FLAG_IGNORECASE:
        return None
    if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
        return _sre.unicode_iscased
    return _sre.ascii_iscased
|
440 |
|
|
|
441 |
|
def _get_literal_prefix(pattern, flags): |
418 |
442 |
# look for literal prefix |
# look for literal prefix |
419 |
443 |
prefix = [] |
prefix = [] |
420 |
444 |
prefixappend = prefix.append |
prefixappend = prefix.append |
421 |
445 |
prefix_skip = None |
prefix_skip = None |
|
446 |
|
iscased = _get_iscased(flags) |
422 |
447 |
for op, av in pattern.data: |
for op, av in pattern.data: |
423 |
448 |
if op is LITERAL: |
if op is LITERAL: |
|
449 |
|
if iscased and iscased(av): |
|
450 |
|
break |
424 |
451 |
prefixappend(av) |
prefixappend(av) |
425 |
452 |
elif op is SUBPATTERN: |
elif op is SUBPATTERN: |
426 |
453 |
group, add_flags, del_flags, p = av |
group, add_flags, del_flags, p = av |
427 |
|
if add_flags & SRE_FLAG_IGNORECASE: |
|
|
454 |
|
flags1 = (flags | add_flags) & ~del_flags |
|
455 |
|
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: |
428 |
456 |
break |
break |
429 |
|
prefix1, prefix_skip1, got_all = _get_literal_prefix(p) |
|
|
457 |
|
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) |
430 |
458 |
if prefix_skip is None: |
if prefix_skip is None: |
431 |
459 |
if group is not None: |
if group is not None: |
432 |
460 |
prefix_skip = len(prefix) |
prefix_skip = len(prefix) |
|
... |
... |
def _get_literal_prefix(pattern): |
441 |
469 |
return prefix, prefix_skip, True |
return prefix, prefix_skip, True |
442 |
470 |
return prefix, prefix_skip, False |
return prefix, prefix_skip, False |
443 |
471 |
|
|
444 |
|
def _get_charset_prefix(pattern, flags):
    # internal: look for a character set that every match must start with.
    #
    # pattern is a parsed pattern exposing its items through .data, and
    # flags are the flags in effect for it.  Returns a list of (op, av)
    # charset items, or None when no usable charset prefix is found.

    # Descend through leading subpatterns, applying the flags each one
    # adds or removes, until the first item is something else.
    while True:
        if not pattern.data:
            return None
        op, av = pattern.data[0]
        if op is not SUBPATTERN:
            break
        group, add_flags, del_flags, pattern = av
        flags = (flags | add_flags) & ~del_flags
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            # locale-dependent case-insensitive matching: no prefix
            return None

    # iscased is None unless case-insensitive matching is in effect; any
    # cased character makes the prefix unusable in that mode.
    iscased = _get_iscased(flags)
    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]
    elif op is BRANCH:
        # every alternative has to begin with an acceptable literal
        charset = []
        charsetappend = charset.append
        for p in av[1]:
            if not p:
                return None
            op, av = p[0]
            if op is LITERAL and not (iscased and iscased(av)):
                charsetappend((op, av))
            else:
                return None
        return charset
    elif op is IN:
        charset = av
        if iscased:
            for op, av in charset:
                if op is LITERAL:
                    if iscased(av):
                        return None
                elif op is RANGE:
                    # ranges reaching past 0xffff are rejected without
                    # being scanned
                    if av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(av[0], av[1]+1))):
                        return None
        return charset
    return None
484 |
515 |
|
|
485 |
516 |
def _compile_info(code, pattern, flags): |
def _compile_info(code, pattern, flags): |
486 |
517 |
# internal: compile an info block. in the current version, |
# internal: compile an info block. in the current version, |
|
... |
... |
def _compile_info(code, pattern, flags): |
496 |
527 |
prefix = [] |
prefix = [] |
497 |
528 |
prefix_skip = 0 |
prefix_skip = 0 |
498 |
529 |
charset = [] # not used |
charset = [] # not used |
499 |
|
if not (flags & SRE_FLAG_IGNORECASE): |
|
|
530 |
|
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): |
500 |
531 |
# look for literal prefix |
# look for literal prefix |
501 |
|
prefix, prefix_skip, got_all = _get_literal_prefix(pattern) |
|
|
532 |
|
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) |
502 |
533 |
# if no prefix, look for charset prefix |
# if no prefix, look for charset prefix |
503 |
534 |
if not prefix: |
if not prefix: |
504 |
|
charset = _get_charset_prefix(pattern) |
|
|
535 |
|
charset = _get_charset_prefix(pattern, flags) |
505 |
536 |
## if prefix: |
## if prefix: |
506 |
537 |
## print("*** PREFIX", prefix, prefix_skip) |
## print("*** PREFIX", prefix, prefix_skip) |
507 |
538 |
## if charset: |
## if charset: |
|
... |
... |
def _compile_info(code, pattern, flags): |
536 |
567 |
# generate overlap table |
# generate overlap table |
537 |
568 |
code.extend(_generate_overlap_table(prefix)) |
code.extend(_generate_overlap_table(prefix)) |
538 |
569 |
elif charset: |
elif charset: |
|
570 |
|
charset, hascased = _optimize_charset(charset) |
|
571 |
|
assert not hascased |
539 |
572 |
_compile_charset(charset, flags, code) |
_compile_charset(charset, flags, code) |
540 |
573 |
code[skip] = len(code) - skip |
code[skip] = len(code) - skip |
541 |
574 |
|
|
File Modules/clinic/_sre.c.h changed (mode: 100644) (index 8056eda3b7..1e60686038) |
... |
... |
exit: |
29 |
29 |
return return_value; |
return return_value; |
30 |
30 |
} |
} |
31 |
31 |
|
|
|
32 |
|
PyDoc_STRVAR(_sre_ascii_iscased__doc__, |
|
33 |
|
"ascii_iscased($module, character, /)\n" |
|
34 |
|
"--\n" |
|
35 |
|
"\n"); |
|
36 |
|
|
|
37 |
|
#define _SRE_ASCII_ISCASED_METHODDEF \ |
|
38 |
|
{"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__}, |
|
39 |
|
|
|
40 |
|
static int |
|
41 |
|
_sre_ascii_iscased_impl(PyObject *module, int character); |
|
42 |
|
|
|
43 |
|
static PyObject * |
|
44 |
|
_sre_ascii_iscased(PyObject *module, PyObject *arg) |
|
45 |
|
{ |
|
46 |
|
PyObject *return_value = NULL; |
|
47 |
|
int character; |
|
48 |
|
int _return_value; |
|
49 |
|
|
|
50 |
|
if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) { |
|
51 |
|
goto exit; |
|
52 |
|
} |
|
53 |
|
_return_value = _sre_ascii_iscased_impl(module, character); |
|
54 |
|
if ((_return_value == -1) && PyErr_Occurred()) { |
|
55 |
|
goto exit; |
|
56 |
|
} |
|
57 |
|
return_value = PyBool_FromLong((long)_return_value); |
|
58 |
|
|
|
59 |
|
exit: |
|
60 |
|
return return_value; |
|
61 |
|
} |
|
62 |
|
|
|
63 |
|
PyDoc_STRVAR(_sre_unicode_iscased__doc__, |
|
64 |
|
"unicode_iscased($module, character, /)\n" |
|
65 |
|
"--\n" |
|
66 |
|
"\n"); |
|
67 |
|
|
|
68 |
|
#define _SRE_UNICODE_ISCASED_METHODDEF \ |
|
69 |
|
{"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__}, |
|
70 |
|
|
|
71 |
|
static int |
|
72 |
|
_sre_unicode_iscased_impl(PyObject *module, int character); |
|
73 |
|
|
|
74 |
|
static PyObject * |
|
75 |
|
_sre_unicode_iscased(PyObject *module, PyObject *arg) |
|
76 |
|
{ |
|
77 |
|
PyObject *return_value = NULL; |
|
78 |
|
int character; |
|
79 |
|
int _return_value; |
|
80 |
|
|
|
81 |
|
if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) { |
|
82 |
|
goto exit; |
|
83 |
|
} |
|
84 |
|
_return_value = _sre_unicode_iscased_impl(module, character); |
|
85 |
|
if ((_return_value == -1) && PyErr_Occurred()) { |
|
86 |
|
goto exit; |
|
87 |
|
} |
|
88 |
|
return_value = PyBool_FromLong((long)_return_value); |
|
89 |
|
|
|
90 |
|
exit: |
|
91 |
|
return return_value; |
|
92 |
|
} |
|
93 |
|
|
32 |
94 |
PyDoc_STRVAR(_sre_ascii_tolower__doc__, |
PyDoc_STRVAR(_sre_ascii_tolower__doc__, |
33 |
95 |
"ascii_tolower($module, character, /)\n" |
"ascii_tolower($module, character, /)\n" |
34 |
96 |
"--\n" |
"--\n" |
|
... |
... |
_sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored)) |
715 |
777 |
{ |
{ |
716 |
778 |
return _sre_SRE_Scanner_search_impl(self); |
return _sre_SRE_Scanner_search_impl(self); |
717 |
779 |
} |
} |
718 |
|
/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/ |
|
|
780 |
|
/*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/ |