Source code for alex.corpustools.merge_uttcns

#!/usr/bin/env python
# vim: set fileencoding=utf-8
#
# Merges ASR-decoded confusion networks from several files produced by
# multiple runs of the script `get_jasr_confnets.py' into one file.
#
# 2013-06
# Matěj Korvas

from __future__ import unicode_literals

import codecs
from collections import Counter


def find_best_cn(cns):
    """Determines which one of decoded confnets seems the best.

    Confnets are handled purely as strings.  The literal string 'None' marks
    a failed decoding and is ignored whenever any other candidate exists.

    Arguments:
        cns -- an iterable of confnet string representations

    Returns the chosen confnet string, or 'None' if `cns' is empty or holds
    nothing but 'None' entries.

    """
    non_none_cns = [cn for cn in cns if cn != 'None']
    if not non_none_cns:
        return 'None'

    # Restrict the choice to those confnets that occur most often.
    counts = Counter(non_none_cns)
    highest_count = max(counts.values())
    # `items()' works on both Python 2 and 3 (`iteritems()' is Python 2 only).
    candidates = [cn for (cn, count) in counts.items()
                  if count == highest_count]

    # Choose the longest confnet (measured by its representation); ties in
    # length are broken by string ordering so that the result is
    # deterministic.
    return max(candidates, key=lambda cn: (len(cn), cn))
def merge_files(fnames, outfname):
    """Merges confnet files into one, keeping one confnet per key.

    Every non-blank input line has to have the form `KEY => CONFNET'.  All
    confnets found for the same key across the input files are collected;
    if there is more than one, `find_best_cn' picks the one to keep.  The
    output file is written in the same line format, sorted by key.

    Arguments:
        fnames -- paths of the input files (read as UTF-8)
        outfname -- path of the output file (written as UTF-8)

    Raises ValueError if a non-blank input line lacks the ' => ' separator.

    """
    separator = ' => '
    cndict = {}
    for fname in fnames:
        with codecs.open(fname, encoding='UTF-8') as cnfile:
            for line in cnfile:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. at the end of the file).
                    continue
                # Split on the first separator only, so that a confnet
                # containing ' => ' itself is kept intact.
                key, found_sep, cn = line.partition(separator)
                if not found_sep:
                    raise ValueError(
                        'Malformed line in {fname}: {line!r}'.format(
                            fname=fname, line=line))
                cndict.setdefault(key, []).append(cn)

    with codecs.open(outfname, 'w', encoding='UTF-8') as outfile:
        # `items()' works on both Python 2 and 3 (`viewitems()' is Python 2
        # only).  Keys are unique, so sorting never compares the lists.
        for key, cns in sorted(cndict.items()):
            best_cn = find_best_cn(cns) if len(cns) > 1 else cns[0]
            outfile.write('{key} => {val}\n'.format(key=key, val=best_cn))
if __name__ == "__main__":
    import sys

    # Usage: merge_uttcns.py INFILE [INFILE ...] OUTFILE
    #
    # At least one input file and the output file are required.  Without
    # this check, running the script with no arguments would make
    # sys.argv[-1] the script's own path and overwrite the script.
    if len(sys.argv) < 3:
        sys.stderr.write(
            'Usage: merge_uttcns.py INFILE [INFILE ...] OUTFILE\n')
        sys.exit(2)

    # All arguments but the last are inputs; the last one is the output.
    merge_files(sys.argv[1:-1], sys.argv[-1])