Source code for alex.corpustools.semscore

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
if __name__ == '__main__':
    import autopath

import argparse
import re
import sys
import codecs

from collections import defaultdict

from alex.utils.text import split_by

def load_semantics(file_name):
    """Load dialogue act semantics from a UTF-8 encoded text file.

    Every non-blank line is expected to have the form
    ``sem_name => sem_content``. The content is handed to ``split_by`` with
    ``&`` as the separator and ``(``, ``)`` and ``"`` as the remaining
    arguments (presumably the bracket and quote characters inside which
    ``&`` must not split -- confirm against ``alex.utils.text.split_by``).

    :param file_name: path of the file to read
    :return: a ``defaultdict(list)`` mapping each ``sem_name`` to the value
        returned by ``split_by`` for its content; a name that occurs more
        than once keeps the value from its last line
    :raises IndexError: if a non-blank line contains no ``=>`` separator
    """
    semantics = defaultdict(list)

    # The context manager guarantees the file is closed even when a
    # malformed line raises while it is being parsed.
    with codecs.open(file_name, encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            fields = line.split("=>")
            key = fields[0].strip()
            sem = fields[1].strip()

            semantics[key] = split_by(sem, '&', '(', ')', '"')

    return semantics
# Matches a quoted slot value assignment such as ``food="Chinese"``. Group 1
# is the (possibly empty) slot name that is kept by the substitution.
# A plain raw string is used instead of the Python 2 only ``ur''`` prefix: the
# module enables ``unicode_literals``, so the literal is unicode either way,
# and the ``ur`` form is a syntax error on Python 3.
_DAI_VALUE_RE = re.compile(r'([\w]+|\B)(="[\w\'!\., :\-)(]+")', flags=re.UNICODE)


def _abstract_dai(dai):
    """Return ``dai`` with every quoted slot value replaced by ``"*"``."""
    return _DAI_VALUE_RE.sub(r'\1="*"', dai)


def score_da(ref_da, test_da, daid):
    """Count matching, extra and missing items of one dialogue act.

    Computed according to http://en.wikipedia.org/wiki/Precision_and_recall

    Items are compared by exact equality. The per-item statistics are keyed
    by the item with its quoted slot values replaced by ``"*"``, so that
    items differing only in the value share one entry.

    :param ref_da: sequence of reference dialogue act item strings
    :param test_da: sequence of hypothesis dialogue act item strings
    :param daid: identifier of the dialogue act, used only in error messages
    :return: tuple ``(tp, fp, fn, statsp, epp)`` where ``tp``/``fp``/``fn``
        are float counts of true positives, false positives and false
        negatives, ``statsp`` maps each abstracted item to its own
        ``'tp'``/``'fp'``/``'fn'`` counts, and ``epp`` is a list with one
        message per extra or missing item
    """
    tp = 0.0
    fp = 0.0
    fn = 0.0
    statsp = defaultdict(lambda: defaultdict(float))
    epp = []

    # Both joined forms are only used in the error messages; build them once.
    hyp_text = '&'.join(test_da)
    ref_text = '&'.join(ref_da)

    # Hypothesis items: found in the reference -> true positive, else extra.
    for dai in test_da:
        abstract = _abstract_dai(dai)

        if dai in ref_da:
            tp += 1.0
            statsp[abstract]['tp'] += 1.0
        else:
            fp += 1.0
            statsp[abstract]['fp'] += 1.0
            epp.append("""DAid {daid} in hyp da: {hypda} is EXTRA dai: {dai} when compared with ref da: {refda}\n""".format(
                daid=daid, hypda=hyp_text, dai=dai, refda=ref_text))

    # Reference items absent from the hypothesis are false negatives.
    for dai in ref_da:
        abstract = _abstract_dai(dai)

        if dai not in test_da:
            fn += 1.0
            statsp[abstract]['fn'] += 1.0
            epp.append("""DAid {daid} in hyp da: {hypda} is MISSING dai: {dai} when compared with ref da: {refda}\n""".format(
                daid=daid, hypda=hyp_text, dai=dai, refda=ref_text))

    return tp, fp, fn, statsp, epp
def _percentage(part, whole, default):
    """Return ``100 * part / whole``, or ``default`` when ``whole`` is zero."""
    if whole == 0:
        return default
    return 100.0 * part / whole


def score_file(refsem, testsem):
    """Score all hypothesis dialogue acts against the reference ones.

    Every name in ``refsem`` is scored with ``score_da``; a name that has no
    entry in ``testsem`` is scored against an empty hypothesis. Names that
    occur only in ``testsem`` are not scored.

    :param refsem: mapping from name to the list of reference items
    :param testsem: mapping from name to the list of hypothesis items
    :return: tuple ``(precision, recall, stats, error_output)``; the total
        ``precision`` and ``recall`` are percentages (0.0 when there is
        nothing to divide by), ``stats`` maps each abstracted item to its
        ``'tp'``, ``'fp'``, ``'fn'``, ``'precision'`` and ``'recall'``
        values, and ``error_output`` is the text of all messages reported by
        ``score_da``, one group per dialogue act separated by a newline
    """
    tp = 0.0
    fp = 0.0
    fn = 0.0
    stats = defaultdict(lambda: defaultdict(float))
    error_output = []

    for name in sorted(refsem):
        # .get() works for plain dicts as well and, unlike indexing a
        # defaultdict, does not insert the missing name into ``testsem``.
        tpp, fpp, fnp, statsp, epp = score_da(refsem[name], testsem.get(name, []), name)

        tp += tpp
        fp += fpp
        fn += fnp

        if epp:
            error_output.append(''.join(epp))

        for item, counts in statsp.items():
            for count_name, count in counts.items():
                stats[item][count_name] += count

    # With an empty reference (or no hypothesis items at all) a denominator
    # is zero; report 0.0 instead of raising ZeroDivisionError.
    precision = _percentage(tp, tp + fp, 0.0)
    recall = _percentage(tp, tp + fn, 0.0)

    for item_stats in stats.values():
        item_stats['precision'] = _percentage(
            item_stats['tp'], item_stats['tp'] + item_stats['fp'], 0.001)
        item_stats['recall'] = _percentage(
            item_stats['tp'], item_stats['tp'] + item_stats['fn'], 0.001)

        # A tiny offset keeps both values strictly positive -- presumably so
        # that an F-measure computed from them never divides by zero.
        item_stats['precision'] += 0.000001
        item_stats['recall'] += 0.000001

    return precision, recall, stats, '\n'.join(error_output)
def _f_measure(precision, recall):
    """Return the harmonic mean of the two values, or 0.0 if both are zero."""
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * precision * recall / total


def score(fn_refsem, fn_testsem, item_level = False, detailed_error_output = False, outfile = sys.stdout):
    """Score a file with hypothesis semantics and write a report.

    Both files are read with ``load_semantics`` and compared with
    ``score_file``. The report always contains the total precision, recall
    and F-measure.

    :param fn_refsem: name of the file with the reference semantics
    :param fn_testsem: name of the file with the tested semantics
    :param item_level: if true, also write precision, recall and F-measure
        for every item reported by ``score_file``
    :param detailed_error_output: if true, also write the error text
        returned by ``score_file``
    :param outfile: object with a ``write`` method that receives the report
    """
    refsem = load_semantics(fn_refsem)
    testsem = load_semantics(fn_testsem)

    precision, recall, stats, error_output = score_file(refsem, testsem)

    rule = "-"*80 + "\n"

    outfile.write("Ref: {r}\n".format(r=fn_refsem))
    outfile.write("Tst: {t}\n".format(t=fn_testsem))
    outfile.write("The results are based on {num_das} DAs\n".format(num_das=len(refsem)))
    outfile.write(rule)
    outfile.write("Total precision: %6.2f" % precision)
    outfile.write("\n")
    outfile.write("Total recall: %6.2f" % recall)
    outfile.write("\n")
    # Precision and recall are both zero when nothing matched; the guarded
    # helper reports 0.00 there instead of raising ZeroDivisionError.
    outfile.write("Total F-measure: %6.2f" % (_f_measure(precision, recall), ))
    outfile.write("\n")

    if item_level:
        outfile.write(rule)
        outfile.write("%40s %10s %10s %10s " % ('Dialogue act', 'Precision', 'Recall', 'F-measure'))
        outfile.write("\n")

        for k in sorted(stats):
            outfile.write("%40s %10.2f %10.2f %10.2f " % (
                k,
                stats[k]['precision'],
                stats[k]['recall'],
                _f_measure(stats[k]['precision'], stats[k]['recall'])))
            outfile.write("\n")
        outfile.write(rule)

    if detailed_error_output:
        outfile.write(rule)
        outfile.write(error_output)
        outfile.write(rule)
if __name__ == '__main__':
    # Command-line entry point: parse the two file names and the report
    # switches, then write the report to standard output via score().
    # RawDescriptionHelpFormatter prints the description verbatim, so the
    # text is kept in the line layout that should appear in ``--help``.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
    Compute scores for semantic parser output against reference semantics.
    The scores include total item precision and recall, and slot level
    precision and recall.

    The files structures must be as follows:

      sem_name => sem_content
      ----------------------------------------
      0000001.wav => inform(food="Chinese")
      0000002.wav => request(phone)

    The semantics from the test file and the reference file is matched
    based on the sem_name.
    """)

    parser.add_argument('refsem', action="store",
                        help='a file with reference semantics')
    parser.add_argument('testsem', action="store",
                        help='a file with tested semantics')
    parser.add_argument('-i', action="store_true", default=False, dest="item_level",
                        help='print item level precision and recall')
    parser.add_argument('-d', action="store_true", default=False, dest="detailed_error_output",
                        help='print missing and extra hypothesis dialogue act items')

    args = parser.parse_args()

    score(args.refsem, args.testsem, args.item_level, args.detailed_error_output)