Source code for alex.components.nlg.tectotpl.block.a2w.cs.concatenatetokens

#!/usr/bin/env python
# coding=utf-8
#
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re

__author__ = "Ondřej Dušek"
__date__ = "2012"


[docs]class ConcatenateTokens(Block):
    """\
    Detokenize the sentence, spread whitespace correctly.
    """

    def __init__(self, scenario, args):
        "Constructor, checking the argument values"
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')

[docs]    def process_zone(self, zone):
        """\
        Detokenize the sentence and assign the result to the sentence attribute
        of the current zone.
        """
        aroot = zone.atree
        sent = ' '.join([a.form for a
                         in aroot.get_descendants(ordered=True)
                         if a.form and not re.match(r'^(#[A-Z]|[A-Z]{3}$)',
                                                    a.form)])
        # whitespace around punctuation
        sent = re.sub(r' ([“,.?:;])', r'\1', sent)
        sent = re.sub(r'(["“])\.', r'.\1', sent)
        sent = re.sub(r'„ ', r'„', sent)
        # normalizing
        sent = re.sub(r' -- ', r' – ', sent)
        sent = re.sub(r'_', r' ', sent)
        # space around parentheses
        sent = re.sub(r',?\(,? ?', r'(', sent)
        sent = re.sub(r' ?,? ?\)', r')', sent)
        if sent.startswith('('):
            sent = re.sub(r'\)\.', r'.)', sent)
        zone.sentence = sent