Source code for alex.components.nlg.tectotpl.block.t2a.cs.addsubordclausepunct

#!/usr/bin/env python
# coding=utf-8
#
# A Treex block
#
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
import re
from alex.components.nlg.tectotpl.block.t2a.cs.addclausalpunct import AddClausalPunct
from alex.components.nlg.tectotpl.tool.lexicon.cs import Lexicon

__author__ = "Ondřej Dušek"
__date__ = "2012"


[docs]class AddSubordClausePunct(AddClausalPunct): """ Add commas separating subordinate clauses. Arguments: language: the language of the target tree selector: the selector of the target tree """ def __init__(self, scenario, args): "Constructor, just checking the argument values" Block.__init__(self, scenario, args) if self.language is None: raise LoadingException('Language must be defined!') self.lexicon = Lexicon()
[docs] def process_atree(self, aroot): "Add subordinate clause punctuation to the given sentence." anodes = aroot.get_descendants(ordered=True) # examine all places between two nodes for (aleft, aright) in zip(anodes[:-1], anodes[1:]): # exclude all places where we don't want a comma # within the same clause if aleft.clause_number == aright.clause_number: continue # clause boundaries, such as brackets if aright.clause_number == 0: continue # some punctuation is here already if [an for an in (aleft, aright) if re.match(r'^[,:;.?!-]', an.lemma)]: continue # coordinating conjunctions or nodes in clauses belonging # to the same coordination if [an for an in (aleft, aright) if self.lexicon.is_coord_conj(an.lemma)]: continue if self.are_in_coord_clauses(aleft, aright): continue # left token is an opening quote or bracket if re.match(r'^[„(]', aleft.lemma): continue # right token is a closing bracket or quote followed by a period if aright.lemma == ')' or \ (aright.lemma == '“' and not aright.is_last_node() and aright.get_next_node().lemma == '.'): continue # left token is a closing quote or bracket preceded by a comma # (which has been inserted in the last step) if re.match(r'^[“)]', aleft.lemma) and not aleft.is_first_node() \ and aright.get_prev_node().lemma == ',': continue # now we know we want to insert a comma acomma = self.insert_comma_between(aleft, aright) # move the comma if the left token marks # the end of an enquoted clause if self.is_clause_in_quotes(aleft): acomma.shift_before_node(aleft) # move the comma after clausal expletives in expression "poté co" if aright.lemma == 'poté': acomma.shift_after_node(aright)
[docs] def are_in_coord_clauses(self, aleft, aright): "Check if the given nodes are in two coordinated clauses." alparent = self.get_clause_parent(aleft) arparent = self.get_clause_parent(aright) return alparent == arparent and \ not alparent.is_root and is_coord_conj(alparent.lemma)
[docs] def get_clause_parent(self, anode): """Return the parent of the clause the given node belongs to; the result may be the root of the tree.""" if anode.clause_number == 0: parent = anode else: parent = anode.get_clause_root().parent while parent.is_coap_root() and parent.is_member: parent = parent.parent return parent
[docs] def insert_comma_between(self, aleft, aright): """Insert a comma node between these two nodes, find out where to hang it.""" # find out the parent aleft_clause_root = aleft.get_clause_root() aright_clause_root = aright.get_clause_root() ahigher_clause_root = aleft_clause_root.get_depth() > \ aright_clause_root.get_depth() and \ aleft_clause_root or aright_clause_root # insert the new node acomma = ahigher_clause_root.create_child(\ data={'form': ',', 'lemma': ',', 'afun': 'AuxX', 'morphcat': {'pos': 'Z'}, 'clause_number': 0}) # shift the new node to its rightful place acomma.shift_after_node(aleft) return acomma