# Source code for alex.corpustools.cued2wavaskey

#!/usr/bin/env python
# vim: set fileencoding=utf-8
# This code is PEP8-compliant. See http://www.python.org/dev/peps/pep-0008.
"""
Finds CUED XML files describing calls in the directory specified, extracts
a couple of fields from them for each turn (transcription, ASR 1-best,
semantics transcription, SLU 1-best) and outputs them to separate files in
the following format:
  {wav_filename} => {field}

An example ignore list file could contain the following three lines:

/some-path/call-logs/log_dir/some_id.wav
some_id.wav
jurcic-??[13579]*.wav

The first one is an example of an ignored path. On UNIX, it has to start with
a slash. On other platforms, an analogous convention has to be used.

The second one is an example of a literal glob.

The last one is an example of a more advanced glob. It says basically that
all odd dialogue turns should be ignored.

"""

# 2013-06
# Matěj Korvas

import argparse
import os
import os.path

if __name__ == "__main__":
    import autopath

from alex.corpustools.cued2utt_da_pairs import extract_trns_sems, write_data


# Maps the name of a field as requested on the command line to the name of
# the record attribute that is substituted into the output line template
# ('{rec.audio} => {rec.<attribute>}').
_xmlname2recname = dict(
    transcription='transcription',
    semitran='cued_da',
    semihyp='cued_dahyp',
    asrhyp='asrhyp',
    rec='audio',
)
# Maps the name of a field to the extension of the output file that the field
# is written to.  There is deliberately no entry for 'rec': the audio file
# name is the key of every output line, not a field saved on its own.
_suffixes = dict(
    transcription='trs',
    semitran='sem',
    semihyp='shyp',
    asrhyp='asr',
)


def main(args):
    """Extract the requested fields from CUED call logs and save them.

    For every requested field, one file named ``<out_basename>.<suffix>`` is
    written to ``args.outdir``, with one line per record in the format
    ``{wav_filename} => {field}``.

    Arguments:
        args -- the parsed command line arguments; the attributes read are
            ``infname``, ``outdir``, ``out_basename``, ``fields``, ``all``,
            ``dictionary``, ``ignore`` and ``verbose``

    Raises:
        KeyError -- if a requested field has no known output file suffix
            (that is, it is not a key of `_suffixes')

    """
    # Interpret the arguments.
    if args.fields is None:
        fields = ("transcription", "semitran", "semihyp", "asrhyp")
    else:
        fields = args.fields

    # Fail early on unsupported fields, before the (potentially long)
    # extraction is run.  KeyError is what the later suffix lookup would
    # raise anyway.
    unsupported = [fld for fld in fields if fld not in _suffixes]
    if unsupported:
        raise KeyError('No output suffix defined for field(s): {flds}'.format(
            flds=', '.join(unsupported)))

    # The required fields are kept in a separate list (a copy), so that
    # adding 'rec' below does not leak into the list of fields to be saved.
    req_fields = list() if args.all else list(fields)
    if 'rec' not in req_fields:
        # We require the rec fname for all the records.
        req_fields.append('rec')
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Read in the dictionary.
    if args.dictionary:
        # The `with' statement closes the file even if reading it fails.
        # Blank lines are skipped as they contain no word.
        with args.dictionary as dict_file:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
    else:
        known_words = None

    # Extract the records.
    print('Extracting semantics from the call logs...')
    recs = extract_trns_sems(args.infname, args.verbose, fields=req_fields,
                             ignore_list_file=args.ignore, normalise=True,
                             do_exclude=True, known_words=known_words)
    print('Total number of annotated user turns: {num}'.format(num=len(recs)))

    # Save all the files in the requested format.
    for fldname in fields:
        print('Saving {fld}s...'.format(fld=fldname))
        outfname = '{base}.{suf}'.format(base=args.out_basename,
                                         suf=_suffixes[fldname])
        # The doubled braces survive this first formatting, leaving a
        # template such as '{rec.audio} => {rec.cued_da}\n' for write_data.
        line_template = '{{rec.audio}} => {{rec.{recname}}}\n'.format(
            recname=_xmlname2recname[fldname])
        write_data(args.outdir, outfname, recs, line_template)

    # Print a final message.  Joining the path avoids a doubled separator when
    # the output directory was specified with a trailing one.
    print('Done.  Output written to "{path}.*".'.format(
        path=os.path.join(args.outdir, args.out_basename)))


if __name__ == "__main__":
    # Define the command line interface and run the extraction.
    # RawDescriptionHelpFormatter keeps the layout of the description below
    # (including the indented format example) as written.
    arger = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
    Finds CUED XML files describing calls in the directory specified, extracts
    a couple of fields from them for each turn (transcription, ASR 1-best,
    semantics transcription, SLU 1-best) and outputs them to separate files in
    the following format:

        {wav_filename} => {field}

    It scans for 'user-transcription.norm.xml' (or `user-transcription.xml'
    if the former is not found in the log directory) to extract the
    transcriptions and the semantics.

      """)

    arger.add_argument('-i', '--infname',
                       help="an input directory with CUED audio files and "
                            "call logs or a file listing these files' "
                            "immediate parent dirs")
    arger.add_argument('-o', '--outdir', default='./cued_data',
                       help='an output directory for files with audio and '
                            'their transcription (default: ./cued_data)')
    arger.add_argument('-b', '--out-basename', metavar='NAME',
                       default='extracted',
                       help='output files will have names NAME.EXT; here you '
                            'can specify the NAME used; EXT is chosen '
                            'automatically (default: "extracted")')
    # Only fields that have an output file suffix defined can be saved, so
    # anything else is rejected right away with a usage error instead of
    # failing later, after the extraction has been run.
    arger.add_argument('-f', '--fields', nargs='+',
                       choices=sorted(_suffixes),
                       metavar='FIELD',
                       help='fields of the XML transcription file that '
                            'should be extracted; supported fields: '
                            '{flds} (default: all of them)'.format(
                                flds=', '.join(sorted(_suffixes))))
    arger.add_argument('-a', '--all', action='store_true',
                       help='ignore missing values for required fields '
                            '(i.e., process all turns)')
    arger.add_argument('-d', '--dictionary',
                       type=argparse.FileType('r'),
                       metavar='FILE',
                       help='Path towards a phonetic dictionary constraining '
                            'what words should be allowed in transcriptions. '
                            'The dictionary is expected to contain the words '
                            'in the first whitespace-separated column.')
    arger.add_argument('-g', '--ignore',
                       type=argparse.FileType('r'),
                       metavar='FILE',
                       help='Path towards a file listing globs of CUED '
                            'call log directories that should be ignored.\n'
                            'The globs are interpreted wrt. the current '
                            'working directory. For an example, see the '
                            'source code.')
    arger.add_argument('-v', '--verbose', action="store_true",
                       help='set verbose output')
    main(arger.parse_args())