GiPSy — General Purpose Scanner

__init__.py

"""
Package for GiPSy -- General Purpose Scanner

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""

from gipsy.parser import Parser
from gipsy.pyparser import PyParser
from gipsy.py2htmlparser import Py2HTMLParser
from gipsy.code2htmlmixin import Code2HTMLMixin
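
A minimal usage sketch, using only the base Parser interface defined in parser.py below:

from gipsy.parser import Parser

parser = Parser()
parser.tokenize("spam (eggs {ham})")

print(parser.read())           # the original input, reassembled from tokens
print(parser.read(tree=True))  # indented tree view of the token structure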

token.py

"""
Provides basic token classes for the GiPSy scanner.

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


import cgi


class Token:

    """
    Base Token class.

    Class attributes:
    dec_open  : opening token decorator for marked-up output.
    dec_close : closing token decorator for marked-up output
    prefix    : indentation string for outputting in tree view.

    dec_open and dec_close are intended to be directly set by a parser
    or scanner, to implement class-wide decorator strings.

    prefix is not used by the base class, but can be used by subclasses
    (e.g. the container class).

    Instance attributes:
    _token_type_string : a string containing a description of the token type
    _value : the original, unmodified token string

    Public methods:
    __init__()
    read()

    """

    dec_open = ""
    dec_close = ""
    prefix = ""
    tabs = None

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        self._token_type_string = "Unknown token"
        if Token.tabs:
            self._value = value.expandtabs(Token.tabs)
        else:
            self._value = value

    def read(self, decorated=False, html=False, tree=False):

        """
        Returns a string representing the unmodified or modified token.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """

        if tree:
            return "%s%s: %s\n" % (Token.prefix,
                                   self._token_type_string,
                                   self._value)
        elif not decorated and not html:
            return self._value
        elif not decorated and html:
            return cgi.escape(self._value, True)
        elif decorated and not html:
            return "%s%s%s" % (self.__class__.dec_open,
                               self._value,
                               self.__class__.dec_close)
        else:
            return "%s%s%s" % (self.__class__.dec_open,
                               cgi.escape(self._value, True),
                               self.__class__.dec_close)

    def get_value(self):

        """
        Returns the value of the _value attribute.
        """

        return self._value


class WhitespaceToken(Token):

    """
    Generic whitespace token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Whitespace"

    def read(self, decorated=False, html=False, tree=False):

        # Override superclass function to suppress annoying
        # output of whitespace tokens in the tree view

        if tree:
            return ""
        else:
            return Token.read(self, decorated=decorated, html=html, tree=tree)


class NewlineToken(Token):

    """
    Generic new line token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Newline"

    def read(self, decorated=False, html=False, tree=False):

        # Override superclass function to suppress annoying
        # output of newline tokens in the tree view

        if tree:
            return ""
        else:
            return Token.read(self, decorated=decorated, html=html, tree=tree)
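
A short sketch of the class-wide decoration mechanism described in the Token docstring above; the "<tok>" strings are arbitrary placeholders, not decorators used anywhere in the library:

from gipsy.token import Token

# A parser's _set_token_decoration() would normally set these, but
# they can be assigned directly for demonstration.
Token.dec_open = "<tok>"
Token.dec_close = "</tok>"

token = Token("print")
print(token.read())                # print
print(token.read(decorated=True))  # <tok>print</tok>
print(token.read(tree=True))       # Unknown token: print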

containertoken.py

"""
Provides container token classes for the GiPSy scanner.

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


from gipsy.tokenlist import TokenList
from gipsy.token import Token


class OpeningToken(Token):

    """
    Generic matched opening container token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Opening"


class ClosingToken(Token):

    """
    Generic matched closing container token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Closing"


class UnmatchedToken(Token):

    """
    Generic unmatched opening or closing container token class.
    """

    def __init__(self, value):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """

        Token.__init__(self, value)
        self._token_type_string = "Unmatched"


class ContainerToken(Token):

    """
    Container token class.

    Allows a container (e.g. matched pair of parentheses,
    braces, brackets) to be represented by a single token
    in another list of tokens, and treated as such.

    Instance attributes:
    _tlist : a TokenList instance containing the contained tokens

    """

    def __init__(self, value, parser):

        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        parser -- a reference to the parser which instantiated
        the instance, used to parse the contained tokens.
        """

        Token.__init__(self, value)
        self._token_type_string = "Container"
        self._tlist = TokenList()

        # Push the opening container token, the parsed contained
        # tokens (a whole TokenList, which is read polymorphically,
        # since TokenList.read() shares Token.read()'s signature),
        # and the closing container token onto the list

        c_open, c_middle, c_close = value[0], value[1:-1], value[-1]
        self._tlist.push(OpeningToken(c_open))
        self._tlist.push(parser.list_tokenize(c_middle))
        self._tlist.push(ClosingToken(c_close))

    def read(self, decorated=False, html=False, tree=False):

        """
        Returns a string representing the unmodified or modified tokens.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """

        if tree:
            old_prefix = Token.prefix
            Token.prefix += "  "

        out_str = self._tlist.read(decorated=decorated, html=html, tree=tree)

        if tree:
            Token.prefix = old_prefix

        return out_str
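
To see the recursive structure in action, a short sketch using the base Parser, which (as parser.py below shows) registers '()' and '{}' as container pairs:

from gipsy.parser import Parser

parser = Parser()
parser.tokenize("f(g(x), y)")

# Each matched pair becomes a single ContainerToken whose contents
# were parsed recursively; the tree view indents one level per
# nested container.
print(parser.read(tree=True))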

tokenlist.py

"""
Provides a token list class for the GiPSy scanner.

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


class TokenList:

    """
    Class to maintain a list of tokens.

    Instance attributes:
    _tlist -- a Python list containing the tokens.

    Public methods:
    __init__()
    push()
    read()

    """

    def __init__(self):

        """
        Class initializer.
        """

        self._tlist = []

    def push(self, token):

        """
        Pushes (appends) a new token onto the end of the list.

        Arguments:
        token -- the new token, or nested TokenList, to push onto the list
        """

        self._tlist.append(token)

    def read(self, decorated=False, html=False, tree=False):

        """
        Returns a string representing the unmodified or modified tokens.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """

        out_str = ""
        for token in self._tlist:
            out_str += token.read(decorated=decorated, html=html, tree=tree)
        return out_str
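
A brief sketch of TokenList in isolation. Note that ContainerToken (above) pushes an entire TokenList onto its list as a single element; this works because TokenList.read() shares Token.read()'s signature, so nested lists are read polymorphically:

from gipsy.tokenlist import TokenList
from gipsy.token import Token, WhitespaceToken

tlist = TokenList()
tlist.push(Token("hello"))
tlist.push(WhitespaceToken(" "))
tlist.push(Token("world"))

print(tlist.read())  # hello world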

parser.py

"""
Provides the basic GiPSy (General Purpose Scanner).

Library Version 1.1

Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net

Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""


import re
from gipsy.token import Token, WhitespaceToken, NewlineToken
from gipsy.containertoken import ContainerToken, UnmatchedToken
from gipsy.tokenlist import TokenList


class TokenMatch:

    """
    Provides a class of token matches for use when parsing.

    Instance attributes:
    pattern : a compiled regular expression describing the token pattern
    clrefs : a Python list of class objects, containing either a single
    object when the entire regular expression should match one token,
    or the same number of objects as there are groups in the regular
    expression.
    """

    def __init__(self, pattern, clrefs):

        """
        Class initializer.

        Arguments:
        pattern -- the compiled regular expression
        clrefs -- the list of class objects
        """

        self.pattern = pattern
        self.clrefs = clrefs


class Parser:

    """
    General purpose scanner class.
    """

    def __init__(self):

        """
        Class initializer.

        Subclasses should call this at the beginning of their own
        initializers, to ensure _token_matches and _containers are
        declared before appending to them.
        """

        self._tlist = TokenList()
        self._unknown = ""

        # Populate _token_matches with basic tokens

        self._token_matches = []
        self._token_matches.append(TokenMatch(re.compile(r"[^\S\n]+"),
                                   [WhitespaceToken]))
        self._token_matches.append(TokenMatch(re.compile(r"[\n]{1}"),
                                   [NewlineToken]))

        # Populate _containers with default container pairs;
        # subclasses may append further (opening, closing) pairs

        self._containers = [('(', ')'), ('{', '}')]

        # Decorate tokens, if necessary

        self._set_token_decoration()

    def _set_token_decoration(self):

        """
        Sets decoration for tokens.

        Subclasses should override this method and set
        token decorators, if desired.
        """

        pass

    def _get_token(self, tclass, value):

        """
        Returns a token.

        The default behavior is to return a token of the specified
        class with the specified value. Subclasses should override
        this if intervention is required. For instance, an alphanumeric
        identifier could be a variable name or a keyword, so 'value'
        could be checked against a keyword list here, returning a
        'Keyword' token on a match and a plain 'Identifier' token
        otherwise (a sketch of this pattern follows parser.py below).

        Arguments:
        tclass -- a class object representing the desired (or tentatively
        desired, if further processing is necessary) class of the new token
        value -- the value of the new token
        """

        return tclass(value)

    def tokenize(self, instr, tabs=None):

        """
        Populates the '_tlist' instance attribute with a list of tokens.

        Arguments:
        instr -- a string containing the input to tokenize.
        tabs -- if set, expand tab characters to this many spaces
        (this sets the class-wide Token.tabs attribute).
        """

        if tabs:
            Token.tabs = tabs

        self._tlist = self.list_tokenize(instr)

    def list_tokenize(self, instr):

        """
        Returns a TokenList of tokens parsed from the given input.

        This method can be called by other classes (e.g. the
        ContainerToken class) when further input needs to be
        processed using the same parser.

        Arguments:
        instr -- a string containing the input to tokenize.
        """

        tlist = TokenList()
        self._unknown = ""

        while instr:
            (instr, container_found) = self._parse_containers(instr, tlist)

            if not container_found:
                (instr, token_found) = self._parse_tokens(instr, tlist)

                if not token_found:
                    self._unknown += instr[0]
                    instr = instr[1:]

        if self._unknown:
            tlist.push(Token(self._unknown))
            self._unknown = ""

        return tlist

    def _parse_tokens(self, instr, tlist):

        """
        Parses the beginning of a string for a token match, and
        adds a new token to a token list if it finds one.

        Arguments:
        instr -- the string to parse
        tlist -- the token list to add to on success

        Returns a two-element tuple, (instr, flag):
        instr -- the original input with any matched token removed
        from the beginning
        flag -- True if a token was found, False if not.
        """

        for mtc in self._token_matches:
            mobj = mtc.pattern.match(instr)
            if mobj:
                if self._unknown:

                    # If we've been accumulating unrecognized
                    # input, then add it to the token list as
                    # a generic (and therefore unknown) token
                    # before proceeding

                    tlist.push(Token(self._unknown))
                    self._unknown = ""

                for num, toc in enumerate(mtc.clrefs):

                    # We've found a recognized token, so instantiate
                    # all the token class objects that the matching
                    # pattern demands

                    result = mobj.group(num + 1 if len(mtc.clrefs) > 1 else 0)
                    tlist.push(self._get_token(toc, result))
                    instr = instr[len(result):]

                return (instr, True)
        return (instr, False)

    def _parse_containers(self, instr, tlist):

        """
        Parses the beginning of a string for a container, and
        adds a new ContainerToken to a token list if it finds one.
        Also recognizes an unmatched opening or closing container
        token, and adds an UnmatchedToken to the list if it finds
        one.

        Arguments:
        instr -- the string to parse
        tlist -- the token list to add to on success

        Returns a two-element tuple, (instr, flag):
        instr -- the original input with any found container or
        unmatched container symbol removed from the beginning
        flag -- True if a container or unmatched container symbol
        was found, False if not.
        """

        if instr[0] in [c[1] for c in self._containers]:

            # If we get here, then we have a closing
            # container symbol without having encountered
            # an opening one, so add it to the token list
            # as an unmatched token

            if self._unknown:

                # If we've been accumulating unrecognized
                # input, then add it to the token list as
                # a generic (and therefore unknown) token
                # before proceeding

                tlist.push(Token(self._unknown))
                self._unknown = ""

            tlist.push(UnmatchedToken(instr[0]))
            instr = instr[1:]
            return (instr, True)

        elif instr[0] in [c[0] for c in self._containers]:

            # If we get here, we've found an opening
            # container symbol, so begin to look for
            # a matching closing one.

            if self._unknown:

                # If we've been accumulating unrecognized
                # input, then add it to the token list as
                # a generic (and therefore unknown) token
                # before proceeding

                tlist.push(Token(self._unknown))
                self._unknown = ""

            # Get the (two-item) list of opening and closing
            # symbols for the particular type of container
            # that we've potentially found

            i = [c[0] for c in self._containers].index(instr[0])
            ctr = self._containers[i]

            # Set up some variables we'll need.
            # - 'cstr' is used to collect all the characters
            # in the potential container, to be passed to
            # a ContainerToken object for further processing
            # - 'tstr' is used to loop through the remaining
            # input, without destroying the integrity of
            # 'instr', since if we discover we don't have a
            # matching closing container symbol, then we don't
            # have a real container, and we'll need to continue
            # parsing 'instr' as before
            # - 'ccount' is used to count the number of unmatched
            # container symbols, if it gets to zero then we'll
            # know we have a properly matching pair.

            cstr, tstr, ccount = instr[0], instr[1:], 1

            # Loop until we're out of input, or until
            # we find a matching closing container symbol

            while tstr and ccount:
                if tstr[0] == ctr[0]:
                    ccount += 1
                elif tstr[0] == ctr[1]:
                    ccount -= 1
                cstr += tstr[0]
                tstr = tstr[1:]

            if not ccount:

                # If we have a matching pair, push the matched
                # input into a new ContainerToken, and continue

                tlist.push(ContainerToken(cstr, self))
                instr = instr[len(cstr):]
                return (instr, True)

            if not tstr:

                # If we didn't find a matching pair, then push
                # the original symbol onto the token list as an
                # unmatched token, and continue

                tlist.push(UnmatchedToken(instr[0]))
                instr = instr[1:]
                return (instr, True)

        else:
            return (instr, False)

    def read(self, decorated=False, html=False, tree=False):

        """
        Returns a string representing the unmodified or modified tokens.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """

        return self._tlist.read(decorated=decorated, html=html, tree=tree)
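
Putting the subclassing hooks together, a hypothetical sketch of a small language-specific parser. KeywordToken, the keyword list, and the decorator strings here are all invented for illustration; the real PyParser and Py2HTMLParser subclasses are not reproduced in this listing:

import re
from gipsy.parser import Parser, TokenMatch
from gipsy.token import Token


class KeywordToken(Token):

    """Hypothetical keyword token class for this sketch."""

    def __init__(self, value):
        Token.__init__(self, value)
        self._token_type_string = "Keyword"


class ToyParser(Parser):

    """Hypothetical parser recognizing a few keywords."""

    keywords = ["if", "else", "while"]

    def __init__(self):

        # Call the superclass initializer first, so that
        # _token_matches and _containers exist before extending them

        Parser.__init__(self)
        self._token_matches.append(
            TokenMatch(re.compile(r"[A-Za-z_]\w*"), [KeywordToken]))

    def _set_token_decoration(self):

        # Invented decorator strings, for demonstration only

        KeywordToken.dec_open = "<kw>"
        KeywordToken.dec_close = "</kw>"

    def _get_token(self, tclass, value):

        # Downgrade a tentative KeywordToken to a plain Token when
        # the matched identifier is not actually a keyword

        if tclass is KeywordToken and value not in ToyParser.keywords:
            return Token(value)
        return tclass(value)


parser = ToyParser()
parser.tokenize("if x else y")
print(parser.read(decorated=True))  # <kw>if</kw> x <kw>else</kw> y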