Source code for alex.components.nlg.tectotpl.block.t2a.cs.capitalizesentstart

#!/usr/bin/env python
# coding=utf-8
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re
from alex.components.nlg.tectotpl.core.util import first

# Module metadata (author and date).
__author__ = "Ondřej Dušek"
__date__ = "2012"

class CapitalizeSentStart(Block):
    """\
    Capitalize the first word in the sentence (skip punctuation etc.).

    Besides the first child of the a-tree root, the first word under every
    direct speech root found in the t-tree is capitalized as well.
    """

    # Matches a token that consists solely of opening punctuation characters
    OPEN_PUNCT = r'^[({[‚„«‹|*"\']+$'

    def __init__(self, scenario, args):
        """\
        Constructor, checking the argument values.

        Raises a LoadingException if the block has no language set.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')

    def _is_open_punct(self, anode):
        """\
        Return True if the given a-node should be skipped when looking for
        the first word: its `morphcat_pos` is 'Z' (presumably the punctuation
        tag -- TODO confirm) or its form (lemma if the form is empty)
        consists only of characters matched by OPEN_PUNCT.
        """
        if anode.morphcat_pos == 'Z':
            return True
        token = anode.form or anode.lemma or ''
        return bool(re.match(self.OPEN_PUNCT, token))

    def process_zone(self, zone):
        """\
        Find the first valid word in the sentence and capitalize it.

        The form of the selected a-nodes is modified in place; nothing is
        returned.
        """
        aroot = zone.atree
        troot = zone.ttree
        # take the first non-technical root (assume other to be parsing
        # errors); the slice is always a new list, so the list returned
        # by get_children() is never modified by the extend() below
        sent_roots = aroot.get_children(ordered=True)[0:1]
        # add all direct speech roots
        sent_roots.extend([t.lex_anode for t in troot.get_descendants()
                           if t.is_dsp_root and t.lex_anode])
        # capitalize the 1st words under the selected roots
        for sent_root in sent_roots:
            # find the first word (i.e. the first node that is not punctuation)
            word1 = first(lambda n: not self._is_open_punct(n),
                          sent_root.get_descendants(ordered=True,
                                                    add_self=True))
            # skip empty sentences
            if not word1 or not word1.form:
                continue
            # compensate wrong parses in direct speech: check if the node
            # either starts the sentence or follows punctuation
            word0 = word1.get_prev_node()
            if word0 and not self._is_open_punct(word0):
                continue
            # make it uppercase (only the first character, keep the rest)
            word1.form = word1.form[0].upper() + word1.form[1:]