"""tokenizer_format.py

A Martel format usable for tokenization.

A token is:
  - a punctuation character
  - a string of whitespace
  - a newline
  - a string of alphanumeric characters
  - an unprintable character

Groups:
TEXT
WHITESPACE
NEWLINE
PUNCTUATION

Formats:
format

"""
import string
from Martel import *

# Names of the token groups, exported for code that consumes this format.
TEXT, WHITESPACE, NEWLINE = "TEXT", "WHITESPACE", "NEWLINE"
PUNCTUATION, UNPRINTABLE = "PUNCTUATION", "UNPRINTABLE"

# Text token: a run of one or more `\w` (word) characters.
text = Group(TEXT, Re(r"\w+"))

# Get a list of all the whitespace characters, excluding newline, which is
# reported as its own token.  The str method is used here because the
# module-level string.replace() function is deprecated.
x = string.whitespace.replace("\n", "")
# Whitespace token: a run of one or more of those characters.
space = Group(WHITESPACE, Re("[%s]+" % x))

# Newline token: exactly one "\n".
newline = Group(NEWLINE, Str("\n"))

# Build one tagged, single-character expression for every character in
# string.punctuation, so each punctuation mark is matched on its own.
punctuation_expressions = list(
    map(lambda mark: Group(PUNCTUATION, Str(mark)), string.punctuation))
# A punctuation token is any one of those single-character expressions.
punctuation = Alt(*punctuation_expressions)

# An unprintable token is any character that is not in string.printable.
# It is wrapped in its own group, like every other token type, so that it
# is tagged with the UNPRINTABLE name instead of being matched untagged.
unprintable = Group(UNPRINTABLE, AnyBut(string.printable))

# The tokenizer format is any combination of text, punctuation,
# whitespace, newline, or unprintable tokens.
format = Rep(Alt(text, punctuation, space, newline, unprintable))
