# Source code for alex.components.nlg.tectotpl.block.read.tectotemplates

#!/usr/bin/env python
# coding=utf-8
#
# Block for reading Tecto-Template files (partial t-tree dialog system templates)
#
from __future__ import absolute_import
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core import Document

from alex.components.nlg.tectotpl.core.exception import LoadingException
from alex.components.nlg.tectotpl.core.util import file_stream
import re
from alex.components.nlg.tectotpl.core.log import log_info

__author__ = "Ondřej Dušek"
__date__ = "2013"


class TectoTemplates(Block):
    """\
    Reader for partial t-tree dialog system templates, where treelets
    can be intermixed with linear text.

    Example template:

    Vlak přijede v [[7|adj:attr] hodina|n:4|gender:fem].

    All linear text is inserted into t-lemmas of atomic nodes, while
    treelets have their formeme and grammateme values filled in.

    A treelet is enclosed in square brackets. Its own values have the form
    ``t_lemma|formeme|name:value,name:value`` (formeme and grammatemes are
    optional); nested bracketed treelets become its children. Children
    written before the node's own values are placed before the node,
    children written after them are placed after its subtree.
    """

    # One run of node values inside a treelet: everything up to the next
    # square bracket (may consist of several words).
    _NODE_VALUES = re.compile(r'[^\]\[]+')

    def __init__(self, scenario, args):
        """\
        Constructor, checks if language is set and selects encoding according
        to args, defaults to UTF-8.

        Raises LoadingException if self.language is None after the base
        class constructor has run.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.encoding = args.get('encoding', 'UTF-8')

    def process_document(self, filename):
        """\
        Read a Tecto-Template file and return its contents as
        a Document object.

        Every line of the file yields one bundle with one zone (for this
        block's language and selector) holding one t-tree; the line is
        passed to parse_line unmodified. The input stream is closed even
        if building or parsing a tree fails.
        """
        fh = file_stream(filename, encoding=self.encoding)
        try:
            doc = Document(filename)
            for line in fh:
                bundle = doc.create_bundle()
                zone = bundle.create_zone(self.language, self.selector)
                ttree = zone.create_ttree()
                self.parse_line(line, ttree)
                log_info('Parsed a tree with %d nodes.' %
                         len(ttree.get_descendants()))
        finally:
            # do not leak the file handle when an exception propagates
            fh.close()
        return doc

    def parse_line(self, text, troot):
        """\
        Parse a template to a t-tree.

        The text is scanned left to right. Each stretch of linear text up
        to the next '[' (or up to the end of the text) becomes an atomic
        child of troot carrying that text as its t-lemma; this also happens
        for an empty stretch, e.g. when the text starts with a treelet.
        Each top-level treelet becomes a further child of troot, filled in
        by parse_treelet. The nodes are ordered as they appear in the text.
        Nothing is returned, the tree under troot is modified in place.
        """
        off = 0
        last_tnode = troot
        while off < len(text):
            # search for next treelet
            pos = text.find('[', off)
            parse_treelet = pos != -1
            if not parse_treelet:
                # no treelets, everything until the end is linear
                pos = len(text)
            # create a node with the linear part up to the next treelet
            tnode = troot.create_child(data={'t_lemma': text[off:pos],
                                             'nodetype': 'atom',
                                             'functor': '???',
                                             'formeme': 'x'})
            tnode.shift_after_subtree(last_tnode)
            last_tnode = tnode
            # no more treelets to parse, we just added everything till the end
            if not parse_treelet:
                break
            # parse the next treelet and move after it; parse_treelet returns
            # an offset relative to the text following the opening bracket
            tnode = troot.create_child()
            tnode.shift_after_node(last_tnode)
            off = pos + 1 + self.parse_treelet(text[pos + 1:], tnode)
            last_tnode = tnode

    def parse_treelet(self, text, tnode):
        """\
        Parse a treelet in the template, filling the required values.
        Returns the position in the text after the treelet.

        The text must start right after the treelet's opening bracket and
        tnode is the node to be filled in. The returned position points
        just past the matching closing bracket; if the closing bracket is
        missing, the length of the text is returned.

        Raises ValueError if a grammateme entry does not consist of exactly
        one name and one value separated by a colon.
        """
        pos = 0
        # becomes True once the node's own values have been read, so that
        # subsequent child treelets are placed to the right of the node
        right = False
        while pos < len(text):
            char = text[pos]
            # skip space
            if char.isspace():
                pos += 1
            # delve deeper
            elif char == '[':
                tchild = tnode.create_child()
                if not right:
                    tchild.shift_before_node(tnode)
                else:
                    tchild.shift_after_subtree(tnode)
                pos += 1 + self.parse_treelet(text[pos + 1:], tchild)
            # return
            elif char == ']':
                return pos + 1
            # fill in node attributes
            else:
                # may even contain multiple words; the match cannot fail
                # here since the current character is not a bracket
                values = self._NODE_VALUES.match(text, pos).group(0)
                pos += len(values)
                values = values.split('|')
                tnode.t_lemma = values[0]
                # set dummy functor
                tnode.functor = '???'
                # fill in formeme (if applicable, default to x/atom)
                if len(values) >= 2:
                    tnode.formeme = values[1]
                    tnode.nodetype = 'complex'
                else:
                    tnode.formeme = 'x'
                    tnode.nodetype = 'atom'
                # fill in grammatemes (comma-separated name:value pairs)
                if len(values) >= 3:
                    gram_dict = {}
                    for gram in values[2].split(','):
                        gram_name, gram_val = gram.split(':')
                        gram_dict[gram_name.strip()] = gram_val.strip()
                    tnode.gram = gram_dict
                right = True
        # ran out of text without finding the closing bracket
        return len(text)