GiPSy — General Purpose Scanner
__init__.py
"""
Package for GiPSy -- General Purpose Scanner
Library Version 1.1
Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net
Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""
from gipsy.parser import Parser
from gipsy.pyparser import PyParser
from gipsy.py2htmlparser import Py2HTMLParser
from gipsy.code2htmlmixin import Code2HTMLMixin
token.py
"""
Provides basic token classes for the GiPSy scanner.
Library Version 1.1
Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net
Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""
import cgi
from html import escape
class Token:
    """
    Base Token class.

    Class attributes:
    dec_open : opening token decorator for marked-up output.
    dec_close : closing token decorator for marked-up output.
    prefix : indentation string for outputting in tree view.
    tabs : when set to an integer, tab characters in token values are
    expanded to that many spaces at construction time.

    dec_open and dec_close are intended to be directly set by a parser
    or scanner, to implement class-wide decorator strings.
    prefix is not used by the base class, but can be used by subclasses
    (e.g. the container class).

    Instance attributes:
    _token_type_string : a string containing a description of the token type
    _value : the original, unmodified token string

    Public methods:
    __init__()
    read()
    get_value()
    """

    dec_open = ""
    dec_close = ""
    prefix = ""
    tabs = None

    def __init__(self, value):
        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized.
        """
        self._token_type_string = "Unknown token"
        if Token.tabs:
            # Class-wide tab expansion, typically set by Parser.tokenize().
            self._value = value.expandtabs(Token.tabs)
        else:
            self._value = value

    def read(self, decorated=False, html=False, tree=False):
        """
        Returns a string representing the unmodified or modified token.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """
        # FIX: previously used cgi.escape(), which was deprecated in
        # Python 3.2 and removed in 3.8; html.escape() (imported as
        # 'escape' since the 'html' parameter shadows the module name)
        # is the documented replacement. Note html.escape() also
        # escapes single quotes, which cgi.escape() did not.
        if tree:
            return "%s%s: %s\n" % (Token.prefix,
                                   self._token_type_string,
                                   self._value)
        elif not decorated and not html:
            return self._value
        elif not decorated and html:
            return escape(self._value, True)
        elif decorated and not html:
            return "%s%s%s" % (self.__class__.dec_open,
                               self._value,
                               self.__class__.dec_close)
        else:
            return "%s%s%s" % (self.__class__.dec_open,
                               escape(self._value, True),
                               self.__class__.dec_close)

    def get_value(self):
        """
        Returns the value of the _value attribute.
        """
        return self._value
class WhitespaceToken(Token):
    """Generic token for a run of non-newline whitespace characters."""

    def __init__(self, value):
        """Store *value* and label the token as whitespace.

        Arguments:
        value -- the string to be tokenized.
        """
        Token.__init__(self, value)
        self._token_type_string = "Whitespace"

    def read(self, decorated=False, html=False, tree=False):
        """Return the token text; whitespace is omitted from tree view."""
        if not tree:
            return Token.read(self, decorated=decorated, html=html, tree=tree)
        return ""
class NewlineToken(Token):
    """Generic token for a new line character."""

    def __init__(self, value):
        """Store *value* and label the token as a newline.

        Arguments:
        value -- the string to be tokenized.
        """
        Token.__init__(self, value)
        self._token_type_string = "Newline"

    def read(self, decorated=False, html=False, tree=False):
        """Return the token text; newlines are omitted from tree view."""
        if not tree:
            return Token.read(self, decorated=decorated, html=html, tree=tree)
        return ""
containertoken.py
"""
Provides container token classes for the GiPSy scanner.
Library Version 1.1
Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net
Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""
from gipsy.tokenlist import TokenList
from gipsy.token import Token
class OpeningToken(Token):
    """Token for the opening symbol of a matched container pair."""

    def __init__(self, value):
        """Store *value* and label the token as an opening symbol.

        Arguments:
        value -- the opening container symbol.
        """
        Token.__init__(self, value)
        self._token_type_string = "Opening"
class ClosingToken(Token):
    """Token for the closing symbol of a matched container pair."""

    def __init__(self, value):
        """Store *value* and label the token as a closing symbol.

        Arguments:
        value -- the closing container symbol.
        """
        Token.__init__(self, value)
        self._token_type_string = "Closing"
class UnmatchedToken(Token):
    """Token for a container symbol that has no matching partner."""

    def __init__(self, value):
        """Store *value* and label the token as unmatched.

        Arguments:
        value -- the unmatched opening or closing container symbol.
        """
        Token.__init__(self, value)
        self._token_type_string = "Unmatched"
class ContainerToken(Token):
    """
    Container token class.

    Allows a container (e.g. matched pair of parentheses,
    braces, brackets) to be represented by a single token
    in another list of tokens, and treated as such.

    Instance attributes:
    _tlist : a TokenList instance containing the contained tokens
    """

    def __init__(self, value, parser):
        """
        Class initializer.

        Arguments:
        value -- the string to be tokenized, including the opening and
        closing container symbols as its first and last characters.
        parser -- a reference to the parser which instantiated
        the instance, used to parse the contained tokens.
        """
        Token.__init__(self, value)
        self._token_type_string = "Container"
        self._tlist = TokenList()
        # First and last characters are the container symbols; the
        # interior is re-tokenized recursively through the parser.
        c_open, c_middle, c_close = value[0], value[1:-1], value[-1]
        self._tlist.push(OpeningToken(c_open))
        self._tlist.push(parser.list_tokenize(c_middle))
        self._tlist.push(ClosingToken(c_close))

    def read(self, decorated=False, html=False, tree=False):
        """
        Returns a string representing the unmodified or modified tokens.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """
        if not tree:
            return self._tlist.read(decorated=decorated, html=html,
                                    tree=tree)
        # Tree view: indent the contained tokens one level deeper.
        # FIX: restore the previous class-wide prefix even if a
        # contained token's read() raises (the original could leak
        # the lengthened prefix on an exception).
        old_prefix = Token.prefix
        Token.prefix += " "
        try:
            return self._tlist.read(decorated=decorated, html=html,
                                    tree=tree)
        finally:
            Token.prefix = old_prefix
tokenlist.py
"""
Provides a token list class for the GiPSy scanner.
Library Version 1.1
Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net
Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""
class TokenList:
    """
    Class to maintain a list of tokens.

    Instance attributes:
    _tlist -- a Python list containing the tokens.

    Public methods:
    __init__()
    push()
    read()
    """

    def __init__(self):
        """
        Class initializer.
        """
        self._tlist = []

    def push(self, token):
        """
        Pushes a new token onto the top of the list.

        Arguments:
        token -- the new token to push onto the list; may itself be a
        TokenList, since nested lists expose the same read() interface.
        """
        self._tlist.append(token)

    def read(self, decorated=False, html=False, tree=False):
        """
        Returns a string representing the unmodified or modified tokens.

        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """
        # str.join avoids the quadratic cost of repeated string +=.
        return "".join(token.read(decorated=decorated, html=html, tree=tree)
                       for token in self._tlist)
parser.py
"""
Provides the basic GiPSy (General Purpose Scanner).
Library Version 1.1
Copyright 2013 Paul Griffiths
Email: mail@paulgriffiths.net
Distributed under the terms of the GNU General Public License.
http://www.gnu.org/licenses/
"""
import re
from gipsy.token import Token, WhitespaceToken, NewlineToken
from gipsy.containertoken import ContainerToken, UnmatchedToken
from gipsy.tokenlist import TokenList
class TokenMatch:
    """Associates a token regular expression with its token class(es).

    Instance attributes:
    pattern : a compiled regular expression describing the token pattern
    clrefs : a Python list of class objects -- a single class when the
    whole regular expression match is one token, or one class per
    regex group when each group should become its own token.
    """

    def __init__(self, pattern, clrefs):
        """Record the compiled *pattern* and the class list *clrefs*.

        Arguments:
        pattern -- the compiled regular expression
        clrefs -- the list of class objects
        """
        self.pattern = pattern
        self.clrefs = clrefs
class Parser:
    """
    General purpose scanner class.
    """

    def __init__(self):
        """
        Class initializer.
        Subclasses should call this at the beginning of their own
        initializers, to ensure _token_matches and _containers are
        declared before appending to them.
        """
        # Top-level TokenList, populated by tokenize().
        self._tlist = []
        # Accumulator for characters that match no pattern; flushed
        # into a plain Token when a recognized token (or the end of
        # the input) is reached.
        self._unknown = ""
        self._token_matches = []
        # [^\S\n]+ : one or more whitespace characters except newline.
        self._token_matches.append(TokenMatch(re.compile(r"[^\S\n]+"),
                                              [WhitespaceToken]))
        # A single newline becomes its own token.
        self._token_matches.append(TokenMatch(re.compile(r"[\n]{1}"),
                                              [NewlineToken]))
        # (opening, closing) symbol pairs recognized as containers.
        self._containers = [('(', ')'), ('{', '}')]
        self._set_token_decoration()

    def _set_token_decoration(self):
        """
        Sets decoration for tokens.
        Subclasses should override this method and set
        token decorators, if desired.
        """
        pass

    def _get_token(self, tclass, value):
        """
        Returns a token.
        The default behavior is to return a token of the specified
        class with the specified value. Subclasses should override
        this if intervention is required. For instance, an alphanumeric
        identifier could be a variable name, or a keyword, so
        'value' could be checked for keywords here, and a 'Keyword'
        token returned if there is a match, and a plain 'Identifier'
        token returned if there is not.
        Arguments:
        tclass -- a class object representing the desired (or tentatively
        desired, if further processing is necessary) class of the new token
        value -- the value of the new token
        """
        return tclass(value)

    def tokenize(self, instr, tabs=None):
        """
        Populates the 'tlist' instance attribute with a list of tokens.
        Arguments:
        instr -- a string containing the input to tokenize.
        tabs -- replace tabs with this number of spaces.
        """
        if tabs:
            # NOTE: sets a Token *class* attribute, so tab expansion
            # affects every Token created from this point on.
            Token.tabs = tabs
        self._tlist = self.list_tokenize(instr)

    def list_tokenize(self, instr):
        """
        Returns a list of tokens, tokenized from given input.
        This method can be called by other classes (e.g. the
        ContainerToken class) when further input needs to be
        processed using the same parser.
        Arguments:
        instr -- a string containing the input to tokenize.
        """
        tlist = TokenList()
        self._unknown = ""
        while instr:
            # Containers are tried before ordinary token patterns.
            (instr, container_found) = self._parse_containers(instr, tlist)
            if not container_found:
                (instr, token_found) = self._parse_tokens(instr, tlist)
                if not token_found:
                    # Nothing matched: absorb one character into the
                    # unknown-token accumulator and continue scanning.
                    self._unknown += instr[0]
                    instr = instr[1:]
        if self._unknown:
            # Flush any trailing unrecognized characters.
            tlist.push(Token(self._unknown))
            self._unknown = ""
        return tlist

    def _parse_tokens(self, instr, tlist):
        """
        Parses the beginning of a string for a token match, and
        adds a new token to a token list if it finds one.
        Arguments:
        instr -- the string to parse
        tlist -- the token list to add to on success
        Returns a two-element tuple, (instr, flag):
        instr -- the original input with any matched token removed
        from the beginning
        flag -- True if a token was found, False if not.
        """
        for mtc in self._token_matches:
            mobj = mtc.pattern.match(instr)
            if mobj:
                if self._unknown:
                    # A recognized token ends any pending run of
                    # unknown characters, so flush it first.
                    tlist.push(Token(self._unknown))
                    self._unknown = ""
                for num, toc in enumerate(mtc.clrefs):
                    # With several classes, each regex group becomes
                    # its own token; with a single class, group 0
                    # (the whole match) becomes one token.
                    result = mobj.group(num + 1 if len(mtc.clrefs) > 1 else 0)
                    tlist.push(self._get_token(toc, result))
                    instr = instr[len(result):]
                return (instr, True)
        else:
            # for/else: the loop completed without matching, so no
            # token pattern matched at the start of instr.
            return (instr, False)

    def _parse_containers(self, instr, tlist):
        """
        Parses the beginning of a string for a container, and
        adds a new ContainerToken to a token list if it finds one.
        Also recognizes an unmatched opening or closing container
        token, and adds an UnmatchedToken to the list if it finds
        one.
        Arguments:
        instr -- the string to parse
        tlist -- the token list to add to on success
        Returns a two-element tuple, (instr, flag):
        instr -- the original input with any found container or
        unmatched container symbol removed from the beginning
        flag -- True if a container or unmatched container symbol
        was found, False if not.
        """
        if instr[0] in [c[1] for c in self._containers]:
            # A closing symbol at the start of the input cannot have
            # been opened earlier (matched pairs are consumed whole in
            # the branch below), so it is necessarily unmatched.
            if self._unknown:
                tlist.push(Token(self._unknown))
                self._unknown = ""
            tlist.push(UnmatchedToken(instr[0]))
            instr = instr[1:]
            return (instr, True)
        elif instr[0] in [c[0] for c in self._containers]:
            if self._unknown:
                tlist.push(Token(self._unknown))
                self._unknown = ""
            i = [c[0] for c in self._containers].index(instr[0])
            ctr = self._containers[i]
            # Scan forward for the matching closing symbol, tracking
            # the nesting depth (ccount) of this container type only;
            # other container types are not tracked during the scan.
            cstr, tstr, ccount = instr[0], instr[1:], 1
            while tstr and ccount:
                if tstr[0] == ctr[0]:
                    ccount += 1
                elif tstr[0] == ctr[1]:
                    ccount -= 1
                cstr += tstr[0]
                tstr = tstr[1:]
            if not ccount:
                # Balanced: hand the whole span (symbols included) to
                # a ContainerToken, which re-parses the interior via
                # this parser's list_tokenize().
                tlist.push(ContainerToken(cstr, self))
                instr = instr[len(cstr):]
                return (instr, True)
            if not tstr:
                # Input exhausted before the pair closed: consume only
                # the opening symbol itself as an unmatched token.
                tlist.push(UnmatchedToken(instr[0]))
                instr = instr[1:]
                return (instr, True)
        else:
            return (instr, False)

    def read(self, decorated=False, html=False, tree=False):
        """
        Returns a string representing the unmodified or modified tokens.
        Arguments:
        decorated -- set to True to decorate with dec_open and dec_close
        html -- set to True to HTML-escape the characters
        tree -- set to True to output in an indented tree view
        """
        return self._tlist.read(decorated=decorated, html=html, tree=tree)