# Source code for alex.applications.PublicTransportInfoEN.slu.add_to_bootstrap

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
A simple script for adding new utterances along with their semantics to
bootstrap.sem and bootstrap.trn.

Usage:

./add_to_bootstrap.py < input.tsv

The script expects input with tab-separated transcriptions + semantics (one
utterance per line). It automatically generates the dummy 'bootstrap_XXXX.wav'
identifiers and separates the transcription and semantics into two files.
"""

from __future__ import unicode_literals

import codecs
import re
import sys


# Semantics file that main() appends to (one record per new utterance).
# The path is relative, so it is resolved against the current working directory.
BOOTSTRAP_SEM = 'bootstrap.sem'
# Transcription file that main() appends to; main() also reads it first to
# find the highest utterance number already in use.
BOOTSTRAP_TRN = 'bootstrap.trn'

def main(trn_path=None, sem_path=None, in_stream=None):
    """Append new utterances and their semantics to the bootstrap files.

    Reads tab-separated ``transcription<TAB>semantics`` lines, numbers them
    consecutively after the highest number already used in the transcription
    file, and appends ``bootstrap_NNNN.wav => ...`` records to the
    transcription and semantics files. Blank input lines are skipped.

    The whole input is validated before anything is written, so a malformed
    line never leaves the two files with a partially added batch.

    :param trn_path: transcription file to scan and append to
        (defaults to ``BOOTSTRAP_TRN``)
    :param sem_path: semantics file to append to
        (defaults to ``BOOTSTRAP_SEM``)
    :param in_stream: byte stream with UTF-8 encoded input lines
        (defaults to standard input)
    :raises ValueError: if a non-blank input line does not consist of exactly
        two tab-separated fields
    """
    trn_path = BOOTSTRAP_TRN if trn_path is None else trn_path
    sem_path = BOOTSTRAP_SEM if sem_path is None else sem_path
    if in_stream is None:
        # Python 3 exposes the raw bytes of stdin as `sys.stdin.buffer`;
        # on Python 2 `sys.stdin` itself is the byte stream.
        in_stream = getattr(sys.stdin, 'buffer', sys.stdin)

    # Find the highest utterance number already used in the transcriptions.
    hi = 0
    with codecs.open(trn_path, 'r', 'UTF-8') as fh:
        for line in fh:
            # Only the identifier left of '=>' carries the number; the
            # transcription itself may contain digits and must be ignored.
            digits = re.sub(r'[^0-9]', '', line.partition('=>')[0])
            if digits:
                hi = max(hi, int(digits))

    # Parse and validate the whole input before touching the output files.
    pairs = []
    reader = codecs.getreader('UTF-8')(in_stream)
    for line_no, line in enumerate(reader, 1):
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'Input line %d: expected "transcription<TAB>semantics", '
                'got %r' % (line_no, line))
        pairs.append(fields)

    # Append to both files; `with` guarantees they are closed on any error.
    with codecs.open(sem_path, 'a', 'UTF-8') as sem_out, \
            codecs.open(trn_path, 'a', 'UTF-8') as trn_out:
        for utt_no, (trn, sem) in enumerate(pairs, hi + 1):
            trn_out.write('bootstrap_%04d.wav => %s\n' % (utt_no, trn))
            sem_out.write('bootstrap_%04d.wav => %s\n' % (utt_no, sem))


# Run the update only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()