Source code for alex.components.nlg.tectotpl.block.a2w.cs.removerepeatedtokens

#!/usr/bin/env python
# coding=utf-8
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re

__author__ = "Ondřej Dušek"
__date__ = "2012"

class RemoveRepeatedTokens(Block):
    """\
    Remove two identical neighboring tokens.

    Tokens are compared case-insensitively and count as neighbors only
    when nothing but whitespace stands between them; the first occurrence
    is the one that is kept.
    """

    # re.UNICODE makes \W and \s Unicode-aware on Python 2, where they are
    # ASCII-only by default and would split words at every accented letter.
    # On Python 3 this is already the default for text patterns.
    _SEPARATOR = re.compile(r'(\W+)', re.UNICODE)
    _ONLY_SPACE = re.compile(r'^\s+$', re.UNICODE)
    _SPACE_RUN = re.compile(r'\s+', re.UNICODE)
    _SPACE_BEFORE_FINAL_PUNCT = re.compile(r' ([.,])$')

    def __init__(self, scenario, args):
        """\
        Constructor, checking the argument values.

        Both arguments are passed unchanged to Block.__init__.

        Raises LoadingException if self.language is None after the base
        constructor has run (presumably it is filled in from args --
        verify against Block).
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')

    def process_zone(self, zone):
        """\
        Remove two identical neighboring tokens in the given sentence.

        Reads zone.sentence, drops every word that repeats the previous
        word (ignoring case, with only whitespace in between), normalizes
        whitespace and writes the result back to zone.sentence.
        """
        kept = []
        prev = None
        # the capturing group keeps the separators in the token list, so
        # joining the kept tokens reproduces the untouched parts verbatim
        for token in self._SEPARATOR.split(zone.sentence):
            if not token:
                # re.split() yields empty strings at the sentence edges;
                # they carry no text and must not count as a repetition
                continue
            if self._ONLY_SPACE.match(token):
                # plain whitespace is kept and does not reset `prev`, so
                # the words on either side of it are still neighbors
                kept.append(token)
                continue
            lowered = token.lower()
            if lowered == prev:
                # drop the repeated word together with the whitespace that
                # introduced it, so no stray space is left in front of the
                # following punctuation or at the end of the sentence
                if kept and self._ONLY_SPACE.match(kept[-1]):
                    kept.pop()
                continue
            kept.append(token)
            prev = lowered
        sent = ''.join(kept)
        # normalize accidentally messed-up spaces
        sent = self._SPACE_RUN.sub(' ', sent)
        sent = self._SPACE_BEFORE_FINAL_PUNCT.sub(r'\1', sent)
        # assign back
        zone.sentence = sent