RocketGit

benf_wspdigital / cpython (public) (License: PSF) (since 2017-05-12) (hash sha1)

Personal fork from https://github.com/python/cpython.git

Clone URLs: https://rocketgit.com/user/benf_wspdigital/cpython ssh://rocketgit@ssh.rocketgit.com/user/benf_wspdigital/cpython git://git.rocketgit.com/user/benf_wspdigital/cpython

master wip/issue/issue30181_parse-docstring-using-pydoc

/Lib/sre_compile.py (2cc39007ac71288d8deb620a65b8d21c32aa6524) (19345 bytes) (mode 100644) (type blob)

#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *

# Refuse to load if the C extension's MAGIC number differs from the MAGIC
# name available in this module's namespace (presumably star-imported from
# sre_constants above).
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups. _compile() binds these to local names and dispatches on
# membership, so each set lists the operators that share one code path.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}

def _compile(code, pattern, flags):
    """Append the opcodes for *pattern* to *code*.

    code    -- output buffer; written through append() and through index
               assignment (used to back-patch skip offsets).
    pattern -- iterable of (op, av) pairs; nested sub-patterns are handled
               by recursive calls.
    flags   -- SRE_FLAG_* bit mask that selects opcode variants.

    Raises error for repeat operators under SRE_FLAG_TEMPLATE, for
    look-behind assertions whose body is not fixed width, and for
    operators not handled below.
    """
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # The extra case-equivalence table is consulted only for
    # case-insensitive Unicode matching (not LOCALE, not ASCII).
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE and
            not (flags & SRE_FLAG_ASCII)):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    # Recurring idiom below: "skip = _len(code); emit(0)" reserves a slot
    # that is later overwritten with the distance from that slot to the
    # current end of the code.
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    # The lowered character has equivalents: emit a small
                    # set holding it and every equivalent (negated for
                    # NOT_LITERAL), terminated by FAILURE.
                    emit(IN_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
                else:
                    emit(OP_IGNORE[op])
                    emit(lo)
            else:
                emit(op)
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
                # flags is bound as a default so fixup() can be called
                # with the literal alone.
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(op)
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            elif _simple(av) and op is not REPEAT:
                # Body accepted by _simple(): use the *_ONE opcode and
                # close the body with SUCCESS.
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # General form: REPEAT <skip> <av[0]> <av[1]> body, then
                # MAX_UNTIL or MIN_UNTIL after the patched skip.
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            # A truthy group number gets a MARK pair around its body,
            # numbered (group-1)*2 and (group-1)*2+1.
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
            # The body is compiled with the group's own flag adjustments.
            _compile(code, p, (flags | add_flags) & ~del_flags)
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            # av[0] is the direction: non-negative emits 0 (look ahead),
            # negative emits the fixed width of the body (look behind).
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # Map the position code through the MULTILINE table first,
            # then through the LOCALE or UNICODE table.
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # tail collects the position of each alternative's JUMP
            # operand so all of them can be patched once the branch ends.
            tail = []
            tailappend = tail.append
            # NOTE: the loop variable deliberately reuses the name av.
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # NOTE: the loop variable deliberately reuses the name tail.
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
            else:
                emit(op)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            # Unlike the other skip slots, skipyes is patched with the
            # plain distance plus one.
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))

def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    """Append the set program for *charset* to *code*.

    The charset is first passed through _optimize_charset(); every
    resulting (op, av) item is written as its opcode followed by its
    operands, and the program is terminated with FAILURE.  Raises error
    for a set operator that is not handled here.
    """
    push = code.append
    for op, av in _optimize_charset(charset, fixup, fixes):
        push(op)
        if op is NEGATE:
            # no operand
            continue
        if op is LITERAL:
            push(av)
        elif op is RANGE or op is RANGE_IGNORE:
            low, high = av[0], av[1]
            push(low)
            push(high)
        elif op is CHARSET or op is BIGCHARSET:
            # the bitmap words are copied verbatim
            code.extend(av)
        elif op is CATEGORY:
            # pick the locale / unicode flavour of the category code
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            push(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    push(FAILURE)

def _optimize_charset(charset, fixup, fixes):
    """Return an equivalent, usually more compact, list of set items.

    charset -- sequence of (op, av) set items.
    fixup   -- optional callable applied to every literal and range
               member before it is recorded (the caller passes a
               lower-casing function), or a false value.
    fixes   -- optional mapping from a code to further codes that must
               also be marked, or a false value.

    The result is either a new list of (op, av) items or *charset*
    itself when the rewritten form is no shorter and no fixup was applied.
    """
    # internal: optimize character set
    out = []
    tail = []
    # One byte per character code, set to 1 when the code is in the set.
    # Starts at 256 entries and is grown to 65536 on demand below.
    charmap = bytearray(256)
    for op, av in charset:
        # The loop body runs at most twice per item: once against the
        # current charmap and, after the map has been enlarged, once more.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    # NEGATE goes straight to the output, ahead of the
                    # items produced from the character map.
                    out.append((op, av))
                else:
                    # Anything else is kept unchanged and appended after
                    # the map-derived items.
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                # There are only two ranges of cased non-BMP characters:
                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                # and for both ranges RANGE_IGNORE works.
                if fixup and op is RANGE:
                    op = RANGE_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # Collect runs of consecutive marked codes as half-open (start, end)
    # pairs; as soon as a third run is found, give up (runs = None) and
    # fall through to a bitmap representation.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            # the run extends to the end of the map
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        # the map was never enlarged: a single 256-bit CHARSET suffices
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    # comps maps chunk contents to the block number assigned to them;
    # mapping records the block number used at each chunk position.
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # Prepend the block count and the chunk-position table to the bitmap.
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out

_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    s = bits.translate(_BITS_TRANS)[::-1]
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]

def _bytes_to_codes(b):
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the following prefix.
    An overlap table is a table of the same size as the prefix which
    informs about the potential self-overlap for each index in the prefix:
    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
      prefix[0:k]
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                table[i] = 0
                break
            idx = table[idx - 1]
        else:
            table[i] = idx + 1
    return table

def _get_literal_prefix(pattern):
    # look for literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    for op, av in pattern.data:
        if op is LITERAL:
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if add_flags & SRE_FLAG_IGNORECASE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False

def _get_charset_prefix(pattern):
    charset = [] # not used
    charsetappend = charset.append
    if pattern.data:
        op, av = pattern.data[0]
        if op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if p and not (add_flags & SRE_FLAG_IGNORECASE):
                op, av = p[0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
        elif op is BRANCH:
            c = []
            cappend = c.append
            for p in av[1]:
                if not p:
                    break
                op, av = p[0]
                if op is LITERAL:
                    cappend((op, av))
                else:
                    break
            else:
                charset = c
        elif op is IN:
            charset = av
    return charset

def _compile_info(code, pattern, flags):
    """Append an INFO block describing *pattern* to *code*.

    The block holds a flag mask, the minimum and maximum pattern width
    (both capped at MAXCODE) and, when one can be found, either a literal
    prefix followed by its overlap table or a character set for the first
    character.  Prefix and charset are only looked for when
    SRE_FLAG_IGNORECASE is not set.  A pattern whose minimum width is 0
    gets a fixed five-word block without any of the optional data.
    """
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    min_width, max_width = pattern.getwidth()
    max_width = min(max_width, MAXCODE)
    if min_width == 0:
        code.extend((INFO, 4, 0, min_width, max_width))
        return

    # look for a literal prefix and, failing that, a charset prefix
    prefix, prefix_skip, charset = [], 0, []
    if not (flags & SRE_FLAG_IGNORECASE):
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
        if not prefix:
            charset = _get_charset_prefix(pattern)

    # flag mask
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask |= SRE_INFO_LITERAL
    elif charset:
        mask = SRE_INFO_CHARSET
    else:
        mask = 0

    # header: INFO, a skip slot patched at the very end, the mask
    push = code.append
    push(INFO)
    skip_at = len(code)
    push(0)
    push(mask)

    # pattern length
    if min_width >= MAXCODE:
        min_width = MAXCODE
        prefix = prefix[:MAXCODE]
    push(min_width)
    push(max_width)

    if prefix:
        # literal prefix: length, skip, the codes, then the overlap table
        push(len(prefix))
        if prefix_skip is None:
            prefix_skip = len(prefix)
        push(prefix_skip)
        code.extend(prefix)
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip_at] = len(code) - skip_at

def isstring(obj):
    """Return True when *obj* is a str or a bytes instance."""
    if isinstance(obj, str):
        return True
    return isinstance(obj, bytes)

def _code(p, flags):
    """Compile the parsed pattern *p* into a fresh list of code words.

    The flags stored on p.pattern are merged with *flags*; the result is
    an info block, the compiled pattern body and a final SUCCESS.
    """
    merged_flags = p.pattern.flags | flags
    program = []

    # info block first, then the pattern itself
    _compile_info(program, p, merged_flags)
    _compile(program, p.data, merged_flags)

    program.append(SUCCESS)
    return program

def compile(p, flags=0):
    """Build a compiled pattern object from *p*.

    *p* is either pattern text (str or bytes), which is parsed first, or
    an already parsed pattern.  The source text is handed to _sre.compile()
    only in the first case; otherwise None is passed in its place.
    """
    # internal: convert pattern list to internal format
    pattern = None
    if isstring(p):
        pattern, p = p, sre_parse.parse(p, flags)

    code = _code(p, flags)

    # map in either direction: name -> index comes from the parser,
    # index -> name is rebuilt here
    state = p.pattern
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        pattern, flags | state.flags, code,
        state.groups - 1,
        groupindex, tuple(indexgroup)
        )

Mode	Type	Size	Ref	File
100644	blob	582	58471109208922c9ee8c4b06135725f03ed16814	.bzrignore
100644	blob	545	fcf9df6a7a698e4bd87ed0c1cc4ed70bad8b9887	.codecov.yml
100644	blob	255	82694d81f276b2c59a0a93a4f678e1852e625052	.gitattributes
040000	tree	-	7e849e161267e730810fbbe6a848b14d5d002788	.github
100644	blob	1397	8b54c2c4861389f6e8bbfbab5ae0c8b6bbbad041	.gitignore
100644	blob	1060	eb19a6c88d28d05588db25d21525ee2e19c22666	.hgeol
100644	blob	1358	68c607f2e8d420c8dfd0748efcd3b3b5447def16	.hgignore
100644	blob	8917	8f51c2ced49aed46d8b480280b630ea4264c57c3	.hgtags
100644	blob	1328	b9be0f11fdb829f16e9de1921257eb7ee45fac57	.hgtouch
100644	blob	248	0614a299b6221dc7faedaa9139ae8b034e618a85	.mention-bot
100644	blob	3512	e7e8694530ca21a6d7a19da3fab687a3e9d79e9c	.travis.yml
040000	tree	-	ab6ef0c3da91d215c813859260aa9d0724504633	Doc
040000	tree	-	5dd6fc9dc09374506491247872c868eca111e256	Grammar
040000	tree	-	df0de9d4359f11311c74fd0dbad471bb2613a2d4	Include
100644	blob	12773	f5d0b39a0cdddb91a31a537052b7d8d31a4aa79f	LICENSE
040000	tree	-	35e9c80068a1b6441f6a676002e031d908be567f	Lib
040000	tree	-	1db7415d4375525eaf8d05ddd5b088de3321041c	Mac
100644	blob	58983	4145634c032d543d02295bd2c28a0c6ce839fa86	Makefile.pre.in
040000	tree	-	6854ababa88443950a60516508b6994cfd8888db	Misc
040000	tree	-	92e4f07c6b277cc3dae87514f9cebce860ec55ba	Modules
040000	tree	-	cec92311ba9c836d7f68a2d6e24b27e8287ac690	Objects
040000	tree	-	ed4f35810e9633502c16ae038c2ce697d3987201	PC
040000	tree	-	37a613ac0022a9cfefaf3f13913fec7debe59259	PCbuild
040000	tree	-	bfcd1ca2e85b8724b1b7be4e0673b90220a04e7c	Parser
040000	tree	-	3efbcc80237ab7c3d4eb5bf31c893ca6de88e747	Programs
040000	tree	-	8f832869b53d99ee02d78ea0cc8491d3882222da	Python
100644	blob	9325	9c95815d9e9d91b8dae8e05d8bbc696fe19f796b	README.rst
040000	tree	-	66b8a7e032e5538a9a2e08422da3716c50e91a4b	Tools
100644	blob	10910	9a9cc557281571f0d46c506c0e9d1b9fb25e063c	aclocal.m4
100755	blob	42856	1f5c50c0d1529d50b94dc3533ca72a47f0fa5849	config.guess
100755	blob	35740	d654d03cdcd2226a5d7584890717e674a8122f4f	config.sub
100755	blob	485283	87504d206837baf5a5781b6e1cc44dcce7138af9	configure
100644	blob	160661	f9bd92ce3da29ea7674a32bd5fe511b1fc4c4d0a	configure.ac
100755	blob	7122	0ec27bcd488da5cad6ead13d70accbdbc40d31ef	install-sh
100644	blob	41449	21354a5cb84fe5530dd0d460561ba95569abe1d4	pyconfig.h.in
100644	blob	98743	3b3d097454211c790c1602d759918bb65a622c97	setup.py

Hints:
Before your first commit, do not forget to set up your git environment:

git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):

git clone https://rocketgit.com/user/benf_wspdigital/cpython

Clone this repository using ssh (do not forget to upload a key first):

git clone ssh://rocketgit@ssh.rocketgit.com/user/benf_wspdigital/cpython

Clone this repository using git:

git clone git://git.rocketgit.com/user/benf_wspdigital/cpython

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:

... clone the repository ...
... make some changes and some commits ...
git push origin main