GiPSy — Python Parser

pytoken.py

"""
Provides Python token classes for the GiPSy scanner.

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


from gipsy.token import Token
from gipsy.programtoken import IdentifierToken


class MLStringToken(Token):

    """
    Generic multi-line string token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Multi-line String"


class BacktickToken(Token):

    """
    Generic backtick string token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Backtick"


class FirstIdentifierToken(IdentifierToken):

    """
    Token class for the first identifier in a qualified
    set, e.g. first.notfirst.stillnotfirst.end.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        IdentifierToken.__init__(self, value)
        self._token_type_string = "FirstIdentifier"


class DefinitionToken(IdentifierToken):

    """
    Generic definition token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        IdentifierToken.__init__(self, value)
        self._token_type_string = "Definition"


class DecoratorToken(IdentifierToken):

    """
    Generic decorator token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        IdentifierToken.__init__(self, value)
        self._token_type_string = "Decorator"


class BuiltinToken(IdentifierToken):

    """
    Generic built-in token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        IdentifierToken.__init__(self, value)
        self._token_type_string = "Builtin"

pyparser.py

"""
Implements a Python source scanner based on the GiPSy parser.

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


import keyword
import re
import __builtin__
from gipsy.token import WhitespaceToken
from gipsy.parser import Parser, TokenMatch
from gipsy.programtoken import CommentToken, KeywordToken, StringToken
from gipsy.programtoken import IdentifierToken, FloatToken, SeparatorToken
from gipsy.programtoken import IntegerToken, DelimiterToken, OperatorToken
from gipsy.pytoken import BacktickToken, DecoratorToken, MLStringToken
from gipsy.pytoken import DefinitionToken, BuiltinToken, FirstIdentifierToken


class PyParser(Parser):

    """
    A Python source code scanner class.
    """

    def __init__(self):

        """
        Class initializer.
        """

        # Call the superclass initializer

        Parser.__init__(self)

        # Set the containers we're going to recognize

        self._containers = [('(', ')'), ('{', '}'), ('[', ']')]

        # Define our token pattern matches

        tmat = self._token_matches

        # Search order matters here: patterns are matched in the
        # order in which they appear in this list. Look for comments
        # first, as they override everything else, then for
        # multi-line strings, then regular strings, then backticks.
        # Backticks are deprecated, and removed in Python 3, but are
        # retained here for backwards compatibility.
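        # In each of the string patterns, the (?<!\\)(\\\\)*\1 tail
        # accepts a closing quote only when it is preceded by an
        # even number of backslashes, i.e. when the quote itself is
        # not escaped.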

        tmat.append(TokenMatch(re.compile(r"#.*"),
                               [CommentToken]))
        tmat.append(TokenMatch(re.compile(r"r?([\"|\']{3})[^\1]*?" +
                                          r"(?<!\\)(\\\\)*\1"),
                               [MLStringToken]))
        tmat.append(TokenMatch(re.compile(r'''r?(["|']).*?(?<!\\)(\\\\)*\1'''),
                               [StringToken]))
        tmat.append(TokenMatch(re.compile(r"r?([`]).*?(?<!\\)(\\\\)*\1"),
                               [BacktickToken]))

        # Decorators and definitions go next. Note that we include
        # periods within the match for a decorator, unlike for
        # regular identifiers.
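        # In the two definition patterns below, each capture group
        # is paired with the token class at the same position in the
        # list, so the keyword, the whitespace and the name each
        # become separate tokens.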

        tmat.append(TokenMatch(re.compile(r"@[a-zA-Z_][\w\.]*"),
                               [DecoratorToken]))
        tmat.append(TokenMatch(re.compile(r"(def)(\s+)([a-zA-Z_][\w]*)"),
                               [KeywordToken,
                                WhitespaceToken,
                                DefinitionToken]))
        tmat.append(TokenMatch(re.compile(r"(class)(\s+)([a-zA-Z_][\w]*)"),
                               [KeywordToken,
                                WhitespaceToken,
                                DefinitionToken]))

        # Match regular identifiers next. First check whether an
        # identifier is preceded by a period, since if it is we
        # should not treat it as a builtin or a keyword. If it is
        # not, label it a FirstIdentifierToken, and _get_token()
        # will replace it with a KeywordToken or a BuiltinToken if
        # necessary.

        tmat.append(TokenMatch(re.compile(r"(\.)([a-zA-Z_][\w]*)"),
                               [SeparatorToken, IdentifierToken]))
        tmat.append(TokenMatch(re.compile(r"[a-zA-Z_][\w]*"),
                               [FirstIdentifierToken]))

        # Match numbers. Start with floats which begin with a digit
        # rather than a period, then floats which begin with a
        # period. Note that we cannot make the digits both before
        # and after the period optional in the same regular
        # expression, or a plain period would match as a float.
        #
        # Then match integers, with an optional j on the end to
        # catch complex numbers. Note that, currently, the real and
        # imaginary parts of a complex number are captured as two
        # separate numbers with an operator between them, which is
        # probably not ideal and may be changed in the future.
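        # For example, 1.5, 3.14e-2 and .5j match as floats, while
        # 42, 7L, 0x2A and 3j match as integers.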

        tmat.append(TokenMatch(re.compile(r"[0-9]+[\.][0-9]*((e|E)[\+\-]" +
                                          r"?[0-9]+)?(J|j)?"),
                               [FloatToken]))
        tmat.append(TokenMatch(re.compile(r"[\.][0-9]+((e|E)[\+\-]?" +
                                          r"[0-9]+)?(j|J)?"),
                               [FloatToken]))
        tmat.append(TokenMatch(re.compile(r"(0x)?[0-9]+(L|l)?(J|j)?"),
                               [IntegerToken]))

        # Look for assignment delimiter tokens, except the
        # regular '=' operator
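        # (the Python language reference classes '=' and the
        # augmented assignments as delimiters rather than operators,
        # which is why these produce DelimiterTokens)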

        tmat.append(TokenMatch(re.compile(r"(\+=|\-=|\*=|/=|%=|//=|\*\*=)"),
                               [DelimiterToken]))

        # Look for multi-character operators

        tmat.append(TokenMatch(re.compile(r"(\*\*|<<|>>|<=|>=|<>|==|!=|//)"),
                               [OperatorToken]))

        # Look for the '=' delimiter only after matching the
        # multi-character operators and delimiters; in particular,
        # we would never match the '==' operator if we looked for
        # '=' first

        tmat.append(TokenMatch(re.compile(r"="),
                               [DelimiterToken]))

        # Look for single character operators

        tmat.append(TokenMatch(re.compile(r"[\+\*\-\/%~&\^\|<>]"),
                               [OperatorToken]))

        # Finally, look for single character separators

        tmat.append(TokenMatch(re.compile(r"[,:\.]"),
                               [SeparatorToken]))

    def _get_token(self, tclass, value):

        # Override the superclass method to distinguish
        # keywords from other plain identifiers. If we
        # don't find a keyword (or if we're not dealing
        # with an identifier) then just call the superclass
        # method

        if tclass == FirstIdentifierToken:

            # Note: This routine will highlight keywords and
            # builtins even if they're not intended to be such,
            # e.g. if a keyword is used as a function argument
            # it will be highlighted as a keyword. This is probably
            # desired behaviour, since that's not a good practice,
            # and vi seems to do the same.

            if value in keyword.kwlist:
                return KeywordToken(value)
            elif value in dir(__builtin__):
                return BuiltinToken(value)
            else:
                return IdentifierToken(value)

        return Parser._get_token(self, tclass, value)
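
The keyword/builtin test in _get_token() relies only on the standard library, so it can be sketched in isolation (Python 2, to match the module's use of __builtin__; classify() is a hypothetical stand-in for the real method):

import keyword
import __builtin__

def classify(name):
    # Mirror the checks in PyParser._get_token: keywords take
    # precedence over builtins, and anything else is a plain
    # identifier
    if name in keyword.kwlist:
        return "Keyword"
    elif name in dir(__builtin__):
        return "Builtin"
    return "Identifier"

print classify("while")  # Keyword
print classify("len")    # Builtin
print classify("spam")   # Identifier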