Source code for alex.corpustools.text_norm_es

#!/usr/bin/env python
# vim: set fileencoding=utf-8 fdm=marker :
"""
This module provides tools for **SPANISH** normalisation of transcriptions, mainly for
those obtained from human transcribers.
"""

from __future__ import unicode_literals

import re

# Public API of this module.  Every name listed here must be defined in the
# module, otherwise ``from <module> import *`` fails with AttributeError; the
# previous list named a non-existent ``exclude`` instead of the three
# ``exclude_*`` predicates below.
__all__ = ['normalise_text', 'exclude_asr', 'exclude_lm', 'exclude_slu',
           'exclude_by_dict']

# Underscored placeholders used for non-speech events.  Each entry is a pair
# (pattern, replacement): the pattern matches a run of one or more occurrences
# of the placeholder, and the replacement is the single placeholder padded
# with one space on each side.  Used to collapse repeated events.
_nonspeech_events = [
    (re.compile(r'((\b|\s){pat}(\b|\s))+'.format(pat=event)),
     ' ' + event + ' ')
    for event in ('_SIL_', '_INHALE_', '_LAUGH_', '_EHM_HMM_', '_NOISE_',
                  '_EXCLUDE_')
]

# nonspeech event transcriptions {{{
# Maps each underscored placeholder to the transcriber spellings (in
# parentheses or angle brackets) that should be rewritten to it.
_nonspeech_map = {
    '_SIL_': (
        '(SIL)',
        '(SILENCE)',
        '(QUIET)',
        '(CLEARING)',
        '<SILENCE>',
    ),
    '_INHALE_': (
        '(INHALE)',
        '(BREATH)',
        '(BREATHING)',
        '(SNIFFING)',
        '<INHALE>',
    ),
    '_LAUGH_': (
        '(LAUGH)',
        '(LAUGHING)',
        '<LAUGH>',
    ),
    '_EHM_HMM_': (
        '(EHM_HMM)',
        '(HESITATION)',
        '(HUM)',
        '<COUGH>',
        '<MOUTH>',
        '<EHM A>',
        '<EHM N>',
        '<EHM >',
        '<EHM>',
    ),
    '_NOISE_': (
        '(NOISE)',
        '(NOISES)',
        '(COUCHING)',
        '(COUGH)',
        '(COUGHING)',
        '(LIPSMACK)',
        '(POUNDING)',
        '(RING)',
        '(RINGING)',
        '(INTERFERENCE)',
        '(KNOCKING)',
        '(BANG)',
        '(BANGING)',
        '(BACKGROUNDNOISE)',
        '(BABY)',
        '(BARK)',
        '(BARKING)',
        '(STATIC)',
        '(SCRAPE)',
        '(SQUEAK)',
        '(TVNOISE)',
        '<NOISE>',
    ),
    '_EXCLUDE_': (
        '(EXCLUDE)',
        '(PERSONAL)',
        '(VULGARISM)',
        '(UNINTELLIGIBLE)',
        '(UNINT)',
    )
}
#}}}
# Inverted lookup: transcriber spelling -> underscored placeholder.
# ``items()`` (rather than the Python-2-only ``iteritems()``) keeps this
# working on both Python 2 and Python 3.
_nonspeech_trl = dict()
for uscored, forms in _nonspeech_map.items():
    for form in forms:
        _nonspeech_trl[form] = uscored

# substitutions {{{
# (misspelt or unaccented form, corrected form).  Forms are upper-case whole
# words; add new corrections as further pairs.
_subst = [
          ('_EXCLUDE_', '_EXCLUDE_'),
          ('ACUESTATE', 'ACUÉSTATE'),
          ('ALÓ', 'HALÓ'),
          ('AYUDAME', 'AYÚDAME'),
          ('BIOLOGIA', 'BIOLOGÍA'),
          ('CIENTIFICOS', 'CIENTÍFICOS'),
          ('DEMAS', 'DEMÁS'),
          ('FISCALIA', 'FISCALÍA'),
          ('GANACIA', 'GANANCIA'),
          ('GARABOA', 'GARAGOA'),
          ('INJUSTSICIA', 'INJUSTICIA'),
          ('INMANULADA', 'INMACULADA'),
          ('UDSTED', 'USTED'),
           ]
#}}}
# Compile every pair into (pattern, space-padded replacement).
#
# The word is delimited with lookarounds instead of consuming the surrounding
# whitespace: a pattern that swallows the trailing space cannot match a second
# occurrence that immediately follows the first one (e.g. "UDSTED UDSTED").
# The extra spaces this leaves behind are harmless because the replacement is
# space-padded anyway.  ``re.escape`` keeps the keys literal.
_subst = [
    (re.compile(r'(?<!\S){pat}(?!\S)'.format(pat=re.escape(pat))),
     ' ' + sub + ' ')
    for pat, sub in _subst
]

# hesitation expressions {{{
# Upper-case tokens treated as hesitation sounds.  Each spelling is listed
# once; keep a comma after every entry, because adjacent string literals are
# silently concatenated into one (bogus) token.
_hesitation = ['AAAA', 'AAA', 'AA', 'AAH', 'A-', "-AH-", "AH-", "AH.", "AH",
               "AHA", "AHH", "AHHH", "AHMA", "AHM", "ANH", "ARA", "-AR",
               "AR-", "ARRH", "AW", "EA-", "-EAR", "-EECH", "\"EECH\"",
               "-EEP", "-E", "E-", "EH", "EM", "--", "ER", "ERM", "ERR",
               "ERRM", "EX-", "F-", "HM", "HMM", "HMMM", "-HO", "HUH", "HU",
               "HUM", "HUMM", "HUMN", "HUMPH", "HUP", "HUU", "-",
               "MM", "MMHMM", "MMM", "NAH", "OHH", "OH", "SH", "UHHH", "EMMM",
               "UHH", "UHM", "UH'", "UH", "UHUH", "UHUM", "UMH", "UMM", "UMN",
               "UM", "URM", "URUH", "UUH", "MMMM", "OHM", "UMMM", "MHMM",
               "EMPH", "HMPH", "UGH", "UMMMMM", "SHH", "OOH", ]
# }}}
# Compile each token into a whole-token pattern.
#
# * ``re.escape`` makes the token literal; unescaped, "AH." would match any
#   three-character token starting with "AH" (e.g. "AHÍ").
# * Lookarounds delimit the token without consuming the neighbouring
#   whitespace, so consecutive hesitations ("UH UH") are all matched in a
#   single pass and no entry has to be listed twice.
_hesitation = [re.compile(r'(?<!\S){word}(?!\S)'.format(word=re.escape(word)))
               for word in _hesitation]

# Runs of two or more whitespace characters; normalise_text collapses each
# run to a single space.
_more_spaces = re.compile(r'\s{2,}')
# Characters that normalise_text replaces with a space before any other
# processing.  Note that '_' is in the class, so underscores present in the
# raw input never reach the later substitution steps.
# NOTE(review): the Spanish opening marks (inverted '?' and '!') are not
# listed -- confirm whether transcriptions can contain them.
_sure_punct_rx = re.compile(r'[.?!",_\n]')
# One or more '(' followed by any characters other than ')' and then one or
# more ')'.  Group 1 captures the text between the parentheses.
_parenthesized_rx = re.compile(r'\(+([^)]*)\)+')


def normalise_text(text):
    """
    Normalises the transcription.  This is the main function of this module.

    The steps are applied in this order:

    1. characters matched by `_sure_punct_rx' are replaced with spaces, the
       text is stripped and upper-cased;
    2. the dictionary substitutions from `_subst' are applied;
    3. every hesitation token from `_hesitation' becomes ``(HESITATION)``;
    4. parenthesised / angle-bracketed non-speech events are separated from
       neighbouring words and rewritten to their underscored placeholders;
    5. repeated placeholders are collapsed to one;
    6. ``^`` characters are removed.

    :param text: the transcription to normalise
    :returns: the normalised transcription, with single spaces between
        tokens and no leading or trailing whitespace
    """
    text = _sure_punct_rx.sub(' ', text)
    text = text.strip().upper()

    # Do dictionary substitutions.
    for pat, sub in _subst:
        text = pat.sub(sub, text)
    for word in _hesitation:
        text = word.sub(' (HESITATION) ', text)
    text = _more_spaces.sub(' ', text).strip()

    # Handle non-speech events (separate them from words they might be
    # agglutinated to, remove doubled parentheses, and substitute the known
    # non-speech events with the forms with underscores).
    #
    # This step can incur superfluous whitespace.
    if '(' in text or '<' in text:
        text = _parenthesized_rx.sub(r' (\1) ', text)
        # items() instead of the Python-2-only iteritems(): identical result,
        # but also works on Python 3.
        for parenized, uscored in _nonspeech_trl.items():
            text = text.replace(parenized, uscored)
        text = _more_spaces.sub(' ', text.strip())

    # Remove duplicate non-speech events.
    for pat, sub in _nonspeech_events:
        text = pat.sub(sub, text)
    text = _more_spaces.sub(' ', text).strip()

    # Drop the caret marker wherever it occurs.
    text = text.replace('^', '')

    return text
# Characters whose presence makes a transcription unusable: newline, a few
# symbols, all bracket kinds and the ASCII digits.
_excluded_characters = set('\n=-*+~()[]{}<>0123456789')


def exclude_asr(text):
    """
    Decide whether the transcription must be left out of ASR training data.

    Returns True (exclude) when `text':

    - contains ``_EXCLUDE_`` or is exactly ``_SIL_``;
    - after removing ``_NOISE_``, ``_INHALE_`` and ``_LAUGH_``, contains any
      of `_excluded_characters', still contains an underscore, or is shorter
      than two characters.

    Returns False (keep) when `text' is exactly one of ``_NOISE_``,
    ``_EHM_HMM_``, ``_INHALE_`` or ``_LAUGH_``, or when none of the above
    conditions holds.
    """
    if '_EXCLUDE_' in text or text == '_SIL_':
        return True

    if text in ('_NOISE_', '_EHM_HMM_', '_INHALE_', '_LAUGH_'):
        return False

    # These non-speech events are tolerated when mixed with real words, so
    # they are removed before the remaining checks.
    remainder = text
    for event in ('_NOISE_', '_INHALE_', '_LAUGH_'):
        remainder = remainder.replace(event, '')

    has_bad_char = any(ch in remainder for ch in _excluded_characters)
    return has_bad_char or '_' in remainder or len(remainder) < 2


def exclude_lm(text):
    """
    Decide whether the transcription must be left out of language-model
    training data.

    Returns True (exclude) when `text' contains ``_EXCLUDE_`` or any of
    `_excluded_characters'; otherwise returns False.
    """
    if '_EXCLUDE_' in text:
        return True
    return any(ch in text for ch in _excluded_characters)


def exclude_slu(text):
    """
    Decide whether the transcription must be left out of Spoken Language
    Understanding training data.  Applies the same rule as `exclude_lm'.
    """
    return exclude_lm(text)
def exclude_by_dict(text, known_words):
    """
    Tell whether `text' should be excluded because of unknown words.

    The text is split on whitespace; it is excluded (True is returned) as
    soon as one of the resulting words is not contained in `known_words'.
    A text with no words at all is not excluded.

    :param text: the transcription to check
    :param known_words: collection supporting the ``in`` operator
    :returns: True if at least one word is unknown, False otherwise
    """
    return any(word not in known_words for word in text.split())