# Source code for alex.components.nlg.tectotpl.block.t2a.cs.vocalizeprepos

#!/usr/bin/env python
# coding=utf-8
#
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re

__author__ = "Ondřej Dušek"
__date__ = "2012"


class VocalizePrepos(Block):
    """\
    This block replaces the forms of prepositions 'k', 'v', 'z', 's'
    with their vocalized variants 'ke'/'ku', 've', 'ze', 'se' according
    to the following word.

    The decision is purely pattern-based: the lower-cased form of the
    node that immediately follows the preposition is matched against a
    fixed list of word beginnings (see `_RULES`).
    """

    # Lemmas of the prepositions that have a vocalized variant.
    _PREPOSITIONS = frozenset(('k', 'v', 'z', 's'))

    # Ordered (preposition lemma, pattern on the following form, vocalized
    # form) triples; the first matching rule wins, so the 'ku' rule must
    # stay in front of the 'ke' rule.
    #
    # The patterns are raw strings because they contain the regex escapes
    # \d and \D, which are invalid escape sequences in ordinary string
    # literals. They are compiled once here instead of on every call.
    #
    # NOTE(review): in the last alternative `1\d\d\D?` the `\D?` part is
    # optional, so it also matches longer numbers such as '1000' -- confirm
    # that this is intended.
    _RULES = (
        ('k', re.compile(r'^(prospěch|příklad)'), 'ku'),
        ('k', re.compile(r'^(k|g|sp|sn|zv|zm|sc|zl|sl|sk|zp|zk|šk|'
                         r'zd|zt|zb|zr|sv|mn|vš|vs|ct|sj|dv|zř|zh|'
                         r'vč|šp|lá|šť|mř|zc|št|vk|sta|vzn|stu|'
                         r'vzd|smí|stě|dnu|vzo|sti|sty|sro|dnů|'
                         r'sdr|sbl|sbí|čty|zná)'), 'ke'),
        ('v', re.compile(r'^(v|f|st|sp|čt|sk|sv|kt|fr|fi|sl|sn|fu|'
                         r'zl|fo|šv|zn|zp|šk|wa|ii|hř|dv|zd|sb|šp|'
                         r'sh|št|zb|fa|fá|rw|zk|wi|tm|jm|we|fs|fy|'
                         r'fó|žď|hv|gy|mz|žd|šl|gi|zh|sj|zt|žr|šr|'
                         r'cv|sw|sro|sml|tří|tva|srá|obž|zví|psa|'
                         r'smr|žlu|sca|zrů|sce|zvo|zme|mně$|mne$)'), 've'),
        ('s', re.compile(r'^(s|z|kt|vz|vš|mn|šk|že|čt|šv|št|ps|vs|'
                         r'šp|ži|cm|ža|ct|cv|dž|šl|še|bý|čle|jmě|'
                         r'ple|šam|lst|prs|dvě|dře|7|17$|1\d\d\D?)'), 'se'),
        ('z', re.compile(r'^(s|z|kt|dn|šk|vs|šv|vš|št|šu|dř|mz|ži|'
                         r'tm|kb|šp|pé|ša|kč|hv|nk|ši|rt|lh|ký|ža|'
                         r'lv|šl|žď|žl|hry|vzd|tří|rom|jmě|šes|'
                         r'mne|řet|hři|lan|žel|pan|wil|dou|thp|'
                         r'pak|půt|cih|brá|hrd|mik|idy|psů|mst|'
                         r'mag|vas|4|7|17|1\d\d\D?)'), 'ze'),
    )

    def __init__(self, scenario, args):
        """\
        Constructor, checking the argument values.

        The arguments are passed unchanged to the `Block` constructor.

        Raises `LoadingException` if `self.language` is None after the
        base-class initialization (presumably the base class sets it
        from `args` -- verify against `Block`).
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')

    def process_atree(self, aroot):
        """\
        Find and vocalize prepositions according to their context.

        Walks over pairs of neighbouring nodes in the ordered list of
        descendants of `aroot` and, for every node with part-of-speech
        'R' whose lemma is one of 'k', 'v', 'z', 's', overwrites its
        form with the result of `vocalize`. The last node has no
        follower and is therefore never changed.
        """
        anodes = aroot.get_descendants(ordered=True)
        for anode, anext in zip(anodes[:-1], anodes[1:]):
            if anode.morphcat_pos != 'R' or \
                    anode.lemma not in self._PREPOSITIONS:
                continue
            # Treat a missing form of the following node as empty, so the
            # preposition keeps its base form instead of raising.
            follow = (anext.form or '').lower()
            anode.form = self.vocalize(anode.lemma, follow)

    def vocalize(self, prep, follow):
        """\
        Given a preposition lemma and the form of the word following it,
        return the appropriate form (base or vocalized).

        `prep` is the preposition lemma; `follow` is the form of the
        following word, expected in lower case since the patterns are
        lower-case and matched case-sensitively. If no rule applies
        (including any lemma other than 'k', 'v', 'z', 's'), `prep` is
        returned unchanged.
        """
        for rule_prep, pattern, vocalized in self._RULES:
            if prep == rule_prep and pattern.match(follow):
                return vocalized
        return prep