#!/usr/bin/env python

"""This module provides functions to hyphenate words according to TeX.

If you are going to hyphenate words, it is much faster to create and
reuse a SyllableSplitter object than to repeatedly call the split
function.


Class:
SyllableSplitter         Class that can split words into syllables.

Function:
split                    Split a word into syllables.

"""
import os
import types
import operator
from xml.sax import handler

from Bio import listfns

import datafile
import ctype
from formats import ushyph_format

class SyllableSplitter:
    """Class to split words into syllables.

    The splitter loads a table of hyphenation patterns and a list of
    explicitly hyphenated words, and uses them to decide where a word
    may be broken.

    Methods:
    split   Return a list of syllables.

    """
    # Fallback cache for instances that were created without running
    # __init__.  Every normally constructed instance gets its own cache,
    # because results depend on that instance's pattern table.
    _SPLIT_CACHE = {}   # word -> syllables
    _SPLIT_CACHE_LIMIT = 1000   # stop adding entries beyond this size

    def __init__(self, name_or_handle='ushyph.tex', exceptions=None):
        """Load the hyphenation table and build the lookup structures.

        name_or_handle  Either an object with a 'read' attribute, which
                        is parsed directly and left open, or a name that
                        is resolved with datafile.find and opened (and
                        closed again) here.
        exceptions      Optional dict of word -> sequence of syllables.
                        These take precedence over the table; keys are
                        lower-cased before they are stored.

        Raises ValueError if the named file cannot be found or if a
        value in exceptions is not a sequence.

        """
        if exceptions is None:
            exceptions = {}

        # Get a handle from name_or_handle.
        if hasattr(name_or_handle, 'read'):
            handle = name_or_handle
            opened_here = False
        else:
            filename = datafile.find(name_or_handle)
            if not filename:
                raise ValueError("I couldn't find %s" % name_or_handle)
            handle = open(filename)
            opened_here = True

        try:
            # Make sure exceptions is reasonable.
            bad_values = [x for x in exceptions.values()
                          if not operator.isSequenceType(x)]
            if bad_values:
                raise ValueError(
                    "exceptions should be dict of word -> syllables")
            patterns, hyphenation = load_hyphenation_table(handle)
        finally:
            # Only close what was opened here; a caller-supplied handle
            # stays under the caller's control.
            if opened_here:
                handle.close()

        # A plain dict is used because Trie lookup takes much longer
        # than dict lookup.
        self._patterns = {}    # word -> alternating syllable, score
        for p in patterns:
            parts = _split_pattern(p)
            letters = []
            for i, part in enumerate(parts):
                if ctype.isdigit(part):
                    parts[i] = int(part)
                else:
                    letters.append(part)
            self._patterns[''.join(letters)] = parts

        lengths = [len(key) for key in self._patterns]
        if lengths:
            self._wordlen = min(lengths), max(lengths)
        else:
            # No patterns at all: an empty length range means that no
            # pattern lengths are tried when splitting.
            self._wordlen = 1, 0

        self._hyphenation = {}   # word -> list of syllables
        for h in hyphenation:
            syllables = h.split("-")
            self._hyphenation[''.join(syllables)] = syllables
        # Now add all the exception cases.
        for key, syllables in exceptions.items():
            self._hyphenation[key.lower()] = syllables

        # Since the same words are used over and over (Zipf's Law),
        # cache them.  The cache is per instance so that splitters built
        # from different tables never see each other's results.
        self._SPLIT_CACHE = {}   # word -> syllables

    def split(self, word):
        """S.split(word) -> list of syllables

        The word is stripped and lower-cased first.  Explicit
        hyphenations (from the table or the exceptions) win over the
        pattern-based split.  The returned object may be the one stored
        in the table or the cache, so callers should not modify it.

        """
        word = word.strip().lower()     # clean up the word
        if word in self._hyphenation:
            return self._hyphenation[word]

        if word in self._SPLIT_CACHE:
            return self._SPLIT_CACHE[word]
        syllables = self._split(word)
        if len(self._SPLIT_CACHE) < self._SPLIT_CACHE_LIMIT:
            self._SPLIT_CACHE[word] = syllables
        return syllables

    def _split(self, word):
        """Split word at every position whose interletter value is odd."""
        values = self._make_interletter_values(word)
        indexes = listfns.indexesof(values, _is_odd)
        if not indexes:
            return [word]

        syllables = []
        start = 0
        for i in indexes:
            syllables.append(word[start:i])
            start = i
        syllables.append(word[start:])

        # A break at the very start or end of the word leaves an empty
        # piece there; drop it.
        if not syllables[0]:
            del syllables[0]
        if not syllables[-1]:
            del syllables[-1]
        return syllables

    def _make_interletter_values(self, word):
        """Return a list of values for whether to split before each letter."""
        minlen, maxlen = self._wordlen
        return _make_interletter_values_helper(
            word, self._patterns, minlen, maxlen)

def _make_interletter_values_helper(word, patterns, minlen, maxlen):
    values = [0] * (len(word)+1)
    word = ".%s." % word
    for patlen in range(minlen, maxlen+1):
        for start in range(len(word)-patlen):
            pat = word[start:start+patlen]
            if not patterns.has_key(pat):
                continue
            i = start-1  # Subtract 1 because values is before the word.
            for score_or_part in patterns[pat]:
                if type(score_or_part) is types.IntType:
                    if score_or_part > values[i]:
                        values[i] = score_or_part
                    #values[i] = max(values[i], score_or_part)
                else:
                    i += len(score_or_part)
    return values

def _is_odd(n):
    return n & 1

def split(word, *args, **keywds):
    """split(word) -> list of syllables

    Any extra arguments are passed on to the SyllableSplitter
    constructor.  A new splitter is built on every call.

    """
    splitter = SyllableSplitter(*args, **keywds)
    return splitter.split(word)

class _SyllableExtractor(handler.ContentHandler):
    def __init__(self):
        self.parts = []
    def characters(self, content):
        self.parts.append(content)

# One parser is created at import time and reused for every pattern.
_pattern_parser = ushyph_format.pattern_format.make_parser()
def _split_pattern(pattern):
    """Parse pattern and return the character chunks the parser reports.

    The chunks are collected by a fresh _SyllableExtractor and returned
    as a list, in the order the parser delivered them.

    """
    collector = _SyllableExtractor()
    _pattern_parser.setContentHandler(collector)
    _pattern_parser.setErrorHandler(handler.ErrorHandler())
    _pattern_parser.parseString(pattern)
    return collector.parts
    

class _PatternExtractor(handler.ContentHandler):
    def __init__(self):
        self.patterns = []
        self.hyphenation = []
        self._name = None
    def startElement(self, name, attrs):
        self._name = name
    def endElement(self, name):
        self._name = None
    def characters(self, content):
        if self._name == 'patterns':
            self.patterns.append(content)
        elif self._name == 'hyphenation':
            self.hyphenation.append(content)

# One parser is created at import time and reused for every table.
_table_parser = ushyph_format.format.make_parser()
def load_hyphenation_table(handle):
    """load_hyphenation_table(handle) -> patterns, hyphenation

    Parse the table read from handle.  Both results are lists of the
    strings collected by a _PatternExtractor.

    """
    collector = _PatternExtractor()
    _table_parser.setContentHandler(collector)
    _table_parser.setErrorHandler(handler.ErrorHandler())
    _table_parser.parseFile(handle)
    return collector.patterns, collector.hyphenation

# If the optional ctexhyphen module can be imported, replace the two
# pure-Python helpers defined above with its implementations of the same
# names (presumably a faster compiled version -- confirm against that
# module).  When the import fails, the Python versions stay in place.
try:
    import ctexhyphen
except ImportError:
    pass
else:
    _make_interletter_values_helper = ctexhyphen._make_interletter_values_helper
    _is_odd = ctexhyphen._is_odd
