Source code for alex.components.nlg.tectotpl.block.a2w.cs.removerepeatedtokens

#!/usr/bin/env python
# coding=utf-8
#
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re

__author__ = "Ondřej Dušek"
__date__ = "2012"


class RemoveRepeatedTokens(Block):
    """\
    Remove two identical neighboring tokens.

    Tokens are compared case-insensitively. Only whitespace may stand
    between the two occurrences; if any other separator (e.g. a comma)
    intervenes, both tokens are kept.
    """

    # All patterns that use \W or \s are compiled with re.UNICODE so that
    # accented letters count as word characters under Python 2 as well
    # (Python 3 already treats str patterns this way).
    _SEPARATORS = re.compile(r'(\W+)', re.UNICODE)
    _ONLY_SPACES = re.compile(r'^\s+$', re.UNICODE)
    _SPACE_RUN = re.compile(r'\s+', re.UNICODE)
    _SPACE_BEFORE_FINAL_PUNCT = re.compile(r' ([.,])$')

    def __init__(self, scenario, args):
        """\
        Constructor, checking the argument values.

        Both arguments are handed unchanged to the `Block` constructor.

        Raises LoadingException if `self.language` is None afterwards.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')

    def process_zone(self, zone):
        """\
        Remove two identical neighboring tokens in the given sentence.

        Reads `zone.sentence`, drops every token that repeats the previous
        non-whitespace token (ignoring case) together with the whitespace
        that separated the two, and stores the result back into
        `zone.sentence`.
        """
        # the capturing group keeps the separators in the resulting list,
        # so joining all tokens reproduces the original sentence
        tokens = self._SEPARATORS.split(zone.sentence)
        kept = []
        prev = None
        for token in tokens:
            # pure whitespace is always copied and never becomes `prev`
            if self._ONLY_SPACES.match(token):
                kept.append(token)
                continue
            lowered = token.lower()
            if lowered != prev:
                kept.append(token)
            elif token and kept and self._ONLY_SPACES.match(kept[-1]):
                # a repeated word is dropped along with the whitespace that
                # preceded it, so no stray space is left before following
                # punctuation or at the end of the sentence
                kept.pop()
            prev = lowered
        sent = ''.join(kept)
        # normalize accidentally messed-up spaces
        sent = self._SPACE_RUN.sub(' ', sent)
        sent = self._SPACE_BEFORE_FINAL_PUNCT.sub(r'\1', sent)
        # assign back
        zone.sentence = sent