Source code for alex.corpustools.cued2wavaskey

#!/usr/bin/env python
# vim: set fileencoding=utf-8
# This code is PEP8-compliant. See http://www.python.org/dev/peps/pep-0008.
"""
Finds CUED XML files describing calls in the directory specified, extracts
a couple of fields from them for each turn (transcription, ASR 1-best,
semantics transcription, SLU 1-best) and outputs them to separate files in
the following format:
  {wav_filename} => {field}

An example ignore list file could contain the following three lines:

/some-path/call-logs/log_dir/some_id.wav
some_id.wav
jurcic-??[13579]*.wav

The first one is an example of an ignored path. On UNIX, it has to start with
a slash. On other platforms, an analogous convention has to be used.

The second one is an example of a literal glob.

The last one is an example of a more advanced glob. It says basically that
all odd dialogue turns should be ignored.

"""

# 2013-06
# Matěj Korvas

import argparse
import os
import os.path

if __name__ == "__main__":
    import autopath

from alex.corpustools.cued2utt_da_pairs import extract_trns_sems, write_data


# Translates a field name, as given on the command line, into the name of the
# record attribute that is substituted into the output line template
# ('{rec.audio} => {rec.<attribute>}').
_xmlname2recname = dict(
    transcription='transcription',
    semitran='cued_da',
    semihyp='cued_dahyp',
    asrhyp='asrhyp',
    rec='audio',
)

# Extension of the output file written for each field.  There is no entry for
# 'rec': the recording name is only used as the left-hand side of each output
# line.
_suffixes = dict(
    transcription='trs',
    semitran='sem',
    semihyp='shyp',
    asrhyp='asr',
)


def main(args):
    """Extract the requested fields from CUED call logs and write them out.

    For every output field one file named ``<out_basename>.<suffix>`` is
    written into ``args.outdir``; each of its lines has the form
    ``{wav_filename} => {field}``.

    Arguments:
        args -- the parsed command-line arguments.  The attributes read are
            `infname', `outdir', `out_basename', `fields', `all',
            `dictionary', `ignore' and `verbose'.  `args' itself is not
            modified.

    Raises:
        KeyError -- if a requested field has no entry in `_suffixes' or
            `_xmlname2recname'.

    """
    # Interpret the arguments.
    # Work on a copy of the user's field list: it is extended with 'rec'
    # below, and 'rec' must not leak into the list of files to write (it has
    # no output suffix).
    if args.fields is None:
        fields = ["transcription", "semitran", "semihyp", "asrhyp"]
    else:
        fields = list(args.fields)

    # With --all, no field is required to be present in a turn; otherwise
    # every output field is.
    req_fields = [] if args.all else list(fields)
    if 'rec' not in req_fields:
        # We require the rec fname for all the records.
        req_fields.append('rec')

    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Read in the dictionary.
    if args.dictionary:
        try:
            # The word is the first whitespace-separated column; blank lines
            # have no columns and are skipped.
            known_words = set(cols[0]
                              for cols in (line.split()
                                           for line in args.dictionary)
                              if cols)
        finally:
            args.dictionary.close()
    else:
        known_words = None

    # Extract the records.
    print('Extracting semantics from the call logs...')
    recs = extract_trns_sems(args.infname,
                             args.verbose,
                             fields=req_fields,
                             ignore_list_file=args.ignore,
                             normalise=True,
                             do_exclude=True,
                             known_words=known_words)
    print('Total number of annotated user turns: {0}'.format(len(recs)))

    # Save all the files in the requested format.
    for fldname in fields:
        print('Saving {fld}s...'.format(fld=fldname))
        outfname = '{base}.{suf}'.format(base=args.out_basename,
                                         suf=_suffixes[fldname])
        # The doubled braces survive this first `format' call as single
        # braces, leaving a template with `rec' still to be filled in.
        line_template = '{{rec.audio}} => {{rec.{recname}}}\n'.format(
            recname=_xmlname2recname[fldname])
        write_data(args.outdir, outfname, recs, line_template)

    # Print a final message.
    print('Done. Output written to "{outdir}{base}.*".'.format(
        outdir=args.outdir + os.sep, base=args.out_basename))
if __name__ == "__main__":
    # Command-line entry point: describe the options, parse them and hand the
    # result over to `main'.
    arger = argparse.ArgumentParser(
        # Keep the line breaks of the description as written below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
      Finds CUED XML files describing calls in the directory specified, extracts
      a couple of fields from them for each turn (transcription, ASR 1-best,
      semantics transcription, SLU 1-best) and outputs them to separate files in
      the following format:
        {wav_filename} => {field}

      It scans for 'user-transcription.norm.xml' (or `user-transcription.xml' if
      the former is not found in the log directory) to extract the
      transcriptions and the semantics.

      """)

    arger.add_argument('-i', '--infname',
                       help="an input directory with CUED audio files and "
                            "call logs or a file listing these files' "
                            "immediate parent dirs")
    arger.add_argument('-o', '--outdir',
                       default='./cued_data',
                       help='an output directory for files with audio and '
                            'their transcription (default: ./cued_data)')
    arger.add_argument('-b', '--out-basename',
                       metavar='NAME',
                       default='extracted',
                       help='output files will have names NAME.EXT; here you '
                            'can specify the NAME used; EXT is chosen '
                            'automatically (default: "extracted")')
    # Left at None when the option is not given.
    arger.add_argument('-f', '--fields',
                       nargs='+',
                       help='fields of the XML transcription file that '
                            'should be extracted (default: all of them)')
    arger.add_argument('-a', '--all',
                       action='store_true',
                       help='ignore missing values for required fields '
                            '(i.e., process all turns)')
    # argparse.FileType opens the named file while the arguments are parsed.
    arger.add_argument('-d', '--dictionary',
                       type=argparse.FileType('r'),
                       metavar='FILE',
                       help='Path towards a phonetic dictionary constraining '
                            'what words should be allowed in transcriptions. '
                            'The dictionary is expected to contain the words '
                            'in the first whitespace-separated column.')
    arger.add_argument('-g', '--ignore',
                       type=argparse.FileType('r'),
                       metavar='FILE',
                       help='Path towards a file listing globs of CUED '
                            'call log directories that should be ignored.\n'
                            'The globs are interpreted wrt. the current '
                            'working directory. For an example, see the '
                            'source code.')
    arger.add_argument('-v', '--verbose',
                       action="store_true",
                       help='set verbose output')

    main(arger.parse_args())