#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
if __name__ == '__main__':
import autopath
import argparse
import re
import sys
import codecs
from collections import defaultdict
from alex.utils.text import split_by
def load_semantics(file_name):
    """Load a semantics file into a mapping of sem_name -> list of DAIs.

    Each non-empty line must have the form ``sem_name => sem_content``;
    the content is split into dialogue act items on '&' characters that
    lie outside parentheses and double quotes (see ``split_by``).

    :param file_name: path to a UTF-8 encoded semantics file
    :return: a defaultdict(list) mapping each sem_name to its DAI list
    """
    semantics = defaultdict(list)
    # `with` guarantees the file handle is released even if a malformed
    # line (no "=>") makes the parsing below raise.
    with codecs.open(file_name, encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("=>")
            key = parts[0].strip()
            sem = parts[1].strip()
            semantics[key] = split_by(sem, '&', '(', ')', '"')
    return semantics
# Pre-compiled pattern that masks a slot value, turning e.g.
# inform(food="Chinese") into inform(food="*"), so that slot-level
# statistics aggregate over all values of the same slot.  A plain r''
# literal is already unicode under `unicode_literals` (the ur'' prefix
# it replaces is Python-2-only syntax), and compiling once hoists the
# work out of both loops below.
_SLOT_VALUE_MASK = re.compile(r'([\w]+|\B)(="[\w\'!\., :\-)(]+")', re.UNICODE)


def score_da(ref_da, test_da, daid):
    """Score a hypothesis DA against a reference DA at the DAI level.

    Computed according to http://en.wikipedia.org/wiki/Precision_and_recall

    :param ref_da: list of reference dialogue act items (DAIs)
    :param test_da: list of hypothesis DAIs
    :param daid: DA identifier, used only in the error descriptions
    :return: tuple (tp, fp, fn, statsp, epp) where tp/fp/fn are float
        counts, statsp maps each value-masked DAI to its own tp/fp/fn
        counts, and epp is a list of human-readable descriptions of the
        extra and missing DAIs
    """
    tp = 0.0
    fp = 0.0
    fn = 0.0
    statsp = defaultdict(lambda: defaultdict(float))
    epp = []
    # Hypothesis pass: every hypothesis DAI is either a hit (tp) or extra (fp).
    for dai in test_da:
        masked = _SLOT_VALUE_MASK.sub(r'\1="*"', dai)
        if dai in ref_da:
            tp += 1.0
            statsp[masked]['tp'] += 1.0
        else:
            fp += 1.0
            statsp[masked]['fp'] += 1.0
            epp.append("""DAid {daid}
in hyp da: {hypda}
is EXTRA dai: {dai}
when compared with ref da: {refda}\n""".format(daid=daid, hypda='&'.join(test_da), dai=dai, refda='&'.join(ref_da)))
    # Reference pass: every reference DAI absent from the hypothesis is a miss (fn).
    for dai in ref_da:
        masked = _SLOT_VALUE_MASK.sub(r'\1="*"', dai)
        if dai not in test_da:
            fn += 1.0
            statsp[masked]['fn'] += 1.0
            epp.append("""DAid {daid}
in hyp da: {hypda}
is MISSING dai: {dai}
when compared with ref da: {refda}\n""".format(daid=daid, hypda='&'.join(test_da), dai=dai, refda='&'.join(ref_da)))
    return tp, fp, fn, statsp, epp
def score_file(refsem, testsem):
    """Score whole files of DAs and aggregate item- and slot-level stats.

    :param refsem: mapping sem_name -> list of reference DAIs
    :param testsem: mapping sem_name -> list of hypothesis DAIs; it is
        indexed by the reference keys, so a defaultdict(list) (as
        produced by load_semantics) scores a missing hypothesis DA as
        all misses
    :return: tuple (precision, recall, stats, error_output) where the
        totals are percentages, stats maps each value-masked DAI to its
        tp/fp/fn counts plus 'precision'/'recall', and error_output is
        one string joining all per-DA error descriptions
    """
    tp = 0.0
    fp = 0.0
    fn = 0.0
    stats = defaultdict(lambda: defaultdict(float))
    error_output = []
    for key in sorted(refsem):
        tpp, fpp, fnp, statsp, epp = score_da(refsem[key], testsem[key], key)
        tp += tpp
        fp += fpp
        fn += fnp
        if epp:
            error_output.append(''.join(epp))
        for dai in statsp:
            for counter in statsp[dai]:
                stats[dai][counter] += statsp[dai][counter]
    # Guard the totals the same way the per-slot stats are guarded below;
    # without this, two empty semantics files raise ZeroDivisionError.
    try:
        precision = 100.0 * tp / (tp + fp)
    except ZeroDivisionError:
        precision = 0.001
    try:
        recall = 100.0 * tp / (tp + fn)
    except ZeroDivisionError:
        recall = 0.001
    for dai in stats:
        s = stats[dai]
        try:
            s['precision'] = 100.0 * s['tp'] / (s['tp'] + s['fp'])
        except ZeroDivisionError:
            s['precision'] = 0.001
        try:
            s['recall'] = 100.0 * s['tp'] / (s['tp'] + s['fn'])
        except ZeroDivisionError:
            s['recall'] = 0.001
        # Tiny epsilon keeps the downstream F-measure denominator non-zero.
        s['precision'] += 0.000001
        s['recall'] += 0.000001
    return precision, recall, stats, '\n'.join(error_output)
def score(fn_refsem, fn_testsem, item_level=False, detailed_error_output=False, outfile=sys.stdout):
    """Score a tested semantics file against a reference file and print a report.

    :param fn_refsem: path to the reference semantics file
    :param fn_testsem: path to the tested semantics file
    :param item_level: also print per-DAI precision/recall/F-measure
    :param detailed_error_output: also print the extra/missing DAI details
    :param outfile: writable stream the report goes to (default: stdout)
    """
    refsem = load_semantics(fn_refsem)
    testsem = load_semantics(fn_testsem)
    precision, recall, stats, error_output = score_file(refsem, testsem)

    w = outfile.write
    rule = "-" * 80

    # Header and overall scores.
    w("Ref: {r}\n".format(r=fn_refsem))
    w("Tst: {t}\n".format(t=fn_testsem))
    w("The results are based on {num_das} DAs\n".format(num_das=len(refsem)))
    w(rule)
    w("\n")
    w("Total precision: %6.2f" % precision)
    w("\n")
    w("Total recall: %6.2f" % recall)
    w("\n")
    f_total = 2 * precision * recall / (precision + recall)
    w("Total F-measure: %6.2f" % (f_total,))
    w("\n")

    if item_level:
        # Per-DAI breakdown table.
        w(rule)
        w("\n")
        w("%40s %10s %10s %10s " % ('Dialogue act', 'Precision', 'Recall', 'F-measure'))
        w("\n")
        for dai in sorted(stats):
            p = stats[dai]['precision']
            r = stats[dai]['recall']
            w("%40s %10.2f %10.2f %10.2f " % (dai, p, r, 2 * p * r / (p + r)))
            w("\n")
        w(rule)
        w("\n")

    if detailed_error_output:
        # Dump the extra/missing DAI descriptions collected by score_file.
        w(rule)
        w("\n")
        w(error_output)
        w(rule)
        w("\n")
if __name__ == '__main__':
    # Command-line entry point: build the CLI and run the scorer.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
Compute scores for semantic parser output against reference semantics.
The scores include total item precision and recall, and slot level
precision and recall.
The files structures must be as follows:
sem_name => sem_content
----------------------------------------
0000001.wav => inform(food="Chinese")
0000002.wav => request(phone)
The semantics from the test file and the reference file is matched
based on the sem_name.
""")
    cli.add_argument('refsem', action="store", help='a file with reference semantics')
    cli.add_argument('testsem', action="store", help='a file with tested semantics')
    cli.add_argument('-i', action="store_true", default=False, dest="item_level",
                     help='print item level precision and recall')
    cli.add_argument('-d', action="store_true", default=False, dest="detailed_error_output",
                     help='print missing and extra hypothesis dialogue act items')
    options = cli.parse_args()
    score(options.refsem, options.testsem, options.item_level, options.detailed_error_output)