"""nlpseg_format.py

A Martel format usable to parse the "seg" mode from nlp.

Formats:
format

"""
import string

from Martel import *


# Every sentence opens with a pair of header lines: the sentence as it
# originally appeared, followed by its preprocessed form.
# Original : Several different methods of transcriptional regulation of the AR
# PreProc  : Several different methods of transcriptional regulation of the >AS
# The text following each fixed prefix is captured in its own named
# group ("original" and "preproc").
original = Group("original", Re(r"[^\R]+"))
preproc = Group("preproc", Re(r"[^\R]+"))
original_line = Str("Original : ") + original + AnyEol()
preproc_line = Str("PreProc  : ") + preproc + AnyEol()


# One word per line, wrapped in square brackets.  The fields are, in
# order: the word itself, an optional "(root: ...)", the tag source in
# parentheses, and zero or more parenthesized parts of speech.
# [hCG (?)(UNK)]
# [reasons (root: reason) (MOR)(N PLURAL)]
# [is (root: be) (LEX)(COP)]
# [>PERIOD (LEX)(PUNC)]
# [brilliant (INF-LEX)(ADJ) (N)]
# [years (root: year) (MOR)(N PLURAL(TIME))]
# [and (LEX)]

# The word is everything up to the first space.
word = Group("word", Rep1(AnyBut(" ")))

# Only the text after "root: " is captured as the root.
root = Str("root: ") + Group("root", Rep1(AnyBut(" )")))

# The tag source is whatever sits inside its parentheses.
tag_source = Group("tag_source", Rep1(AnyBut(")")))

# A part of speech may carry a nested parenthesized qualifier, as in
# "N PLURAL(TIME)"; paren_pos matches that nested piece.
paren_pos = Str("(") + Rep1(AnyBut(")")) + Str(")")
part_of_speech = Group("part_of_speech",
                       Rep1(AnyBut("()") + Opt(paren_pos)))

word_line = (
    Str("[")
    + word + Str(" ")
    + Opt(Str("(") + root + Str(")") + Str(" "))
    + Str("(") + tag_source + Str(")")
    + Rep(Opt(Str(" ")) + Str("(") + part_of_speech + Str(")"))
    + Str("]")
    + AnyEol()
)


# The end of a sentence gets a line of its own.  Its simplest form is
# [<EOS (?)]
# which on its own could be matched with Str("[<EOS (?)]") + AnyEol().
#
# An EOS can also fulfill a syntactic structure, though.  For example,
# if PubMed cuts an abstract short (10740894), the fragment
# "a cellular level a" will end with a "[<EOS (INF-LEX)(ADJ) (N)]".
# So accept any run of non-"]" characters between "[<EOS" and the
# closing "]".
eos_line = (
    Str("[<EOS")
    + Re(r"[^]]+")
    + Str("]")
    + AnyEol()
)


# Parenthesized material is delimited by a pair of marker lines:
# PARENS:
# ...
# END PARENS
# parens_line matches either of the two markers.
parens_end_line = Str("END PARENS") + AnyEol()
parens_start_line = Str("PARENS:") + AnyEol()
parens_line = Alt(parens_start_line, parens_end_line)

# Bracketed material is delimited by a pair of marker lines:
# BRACKETS:
# ...
# END BRACKETS
# brackets_line matches either of the two markers.
brackets_end_line = Str("END BRACKETS") + AnyEol()
brackets_start_line = Str("BRACKETS:") + AnyEol()
brackets_line = Alt(brackets_start_line, brackets_end_line)


# Segments mark off lexical blocks of text.  When a segment has a
# syntactic role, it follows the segment name in parentheses.
# ADJ SEGMENT:
# VP SEGMENT (ACTIVE_VERB):
#
# "string" here is the standard-library module.  It is imported
# explicitly at the top of the file so that these definitions do not
# depend on "from Martel import *" happening to re-export it.

# The syntactic role is a run of ASCII letters and underscores.
synrole = Group("synrole", Rep1(Any(string.ascii_letters + "_")))

# The segment name is made of runs of ASCII letters and "-", plus
# single spaces that are not immediately followed by "(".  The
# negative lookahead keeps the " (" that introduces the optional
# synrole out of the segment name.  An equivalent single regular
# expression would be:
#   Re(r"(?:[%s-]+| (?!\())+" % string.ascii_letters)
segment = Group(
    "segment",
    Rep1(Alt(Rep1(Any(string.ascii_letters + "-")), Re(r" (?!\()"))))

segment_line = (
    segment
    + Opt(Str(" (") + synrole + Str(")"))
    + Str(":")
    + AnyEol()
)


# A prepositional phrase is preceded by a line saying what it
# attaches to.  The text after the fixed prefix is captured as
# "pp_attachment".
# Following PP attaches to: Several different methods
pp_attachment = Group("pp_attachment", Re(r"[^\R]+"))
pp_attachment_line = (
    Str("Following PP attaches to: ")
    + pp_attachment
    + AnyEol()
)

# Segments are separated by blank lines.  A blank line may contain
# spaces before its end of line.
blank_line = Rep(Str(" ")) + AnyEol()


# Lines are indented by varying amounts; the leading run of one or
# more spaces is captured as "indent".
indent = Group("indent", Rep1(Str(" ")))

# Everything that may appear inside a segment.  An indented line is
# the indent followed by exactly one of the line types listed below;
# PP attachment lines and blank lines are matched without an indent.
# NOTE(review): eos_line is listed ahead of word_line, presumably
# because an EOS line would also fit the word_line pattern -- confirm
# before reordering the alternatives.
indented_line = indent + Alt(
    eos_line,
    word_line,
    segment_line,
    parens_line,
    brackets_line)
constituent_line = Alt(
    indented_line,
    pp_attachment_line,
    blank_line)


# A clause opens with a "CLAUSE:" line and is followed by one or more
# constituent lines (blank lines count as constituents).  The whole
# thing is captured as "clause".
clause_line = Str("CLAUSE:") + AnyEol()
clause_group = Group("clause", clause_line + Rep1(constituent_line))


# A sentence is the "Original" line, then the "PreProc" line, then one
# or more clauses; it is captured as "sentence".
sentence = Group(
    "sentence",
    original_line + preproc_line + Rep1(clause_group))


# The complete format: one or more sentences.
format = Rep1(sentence)