Source code for alex.utils.text

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import re

def findall(text, char, start=0, end=-1):
    """
    Returns the indices of all occurrences of `char' in `text'.

    text - the string to search in
    char - the string to look for (passed on to `text.find')
    start - index where the search starts (default: 0)
    end - index where the search stops, exclusive; the value -1 stands for
          the end of the text (default: -1)

    Returns a list of indices in ascending order; the list is empty if there
    is no occurrence.  The search for the next occurrence resumes one
    position after the start of the previous one.

    """
    # -1 is a sentinel for "until the end of the text", not a negative index.
    stop = len(text) if end == -1 else end

    positions = []
    found_at = text.find(char, start, stop)
    while found_at != -1:
        positions.append(found_at)
        found_at = text.find(char, found_at + 1, stop)
    return positions
def split_by_comma(text):
    """
    Splits the text at each comma that is not enclosed in round parentheses.

    text - the input text string

    Returns the list of segments with the surrounding whitespace stripped.
    The remainder after the last top-level comma is always included, so the
    result has at least one item (an empty string for an empty text).

    Raises ValueError if a closing parenthesis has no matching opening one,
    or if a segment starts directly with a top-level comma.  Parentheses
    that are left open at the end of the text are not reported.

    """
    depth = 0
    segments = []
    segment_start = 0
    for idx, char in enumerate(text):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth < 0:
                raise ValueError("Missing a left parenthesis.")
        elif char == ',' and depth == 0:
            if segment_start == idx:
                raise ValueError(
                    "Split segment must not start with a comma.")
            segments.append(text[segment_start:idx].strip())
            segment_start = idx + 1
    # Whatever follows the last top-level comma forms the final segment.
    segments.append(text[segment_start:].strip())
    return segments
def split_by(text, splitter,
             opening_parentheses='',
             closing_parentheses='',
             quotes="'\""):
    """
    Splits the input text at each occurrence of the splitter only if it is not
    enclosed in parentheses or quotes.

    text - the input text string
    splitter - multi-character string which is used to determine the position
               of splitting of the text; must not be empty
    opening_parentheses - an iterable of opening parentheses that has to be
                          respected when splitting, e.g. "{(" (default: '')
    closing_parentheses - an iterable of closing parentheses that has to be
                          respected when splitting, e.g. "})" (default: '');
                          paired with `opening_parentheses' by position
    quotes - an iterable of quotes that have to come in pairs, e.g. '"'

    Returns the list of segments with the surrounding whitespace stripped.
    The remainder after the last splitter is always included, so the result
    has at least one item.

    Raises ValueError if the splitter is empty or if a closing parenthesis
    has no matching opening one.  Parentheses and quotes are tested before
    the splitter, so a splitter starting with one of them never matches.

    """
    if not splitter:
        # An empty splitter matches everywhere without consuming any text.
        raise ValueError("The splitter must not be empty.")

    # One counter per opening parenthesis (nesting depth) and per quote
    # character (0 = closed, 1 = open).
    open_counts = dict.fromkeys(opening_parentheses + quotes, 0)
    closing_to_opening = dict(zip(closing_parentheses, opening_parentheses))

    split_list = []
    segment_start = 0
    position = 0
    text_len = len(text)
    splitter_len = len(splitter)
    while position < text_len:
        cur_char = text[position]
        if cur_char in opening_parentheses:
            open_counts[cur_char] += 1
        elif cur_char in closing_parentheses:
            opening = closing_to_opening[cur_char]
            open_counts[opening] -= 1
            if open_counts[opening] < 0:
                raise ValueError(("Missing an opening parenthesis for: {par} "
                                  "in the text: {text}").format(par=cur_char,
                                                                text=text))
        elif cur_char in quotes:
            open_counts[cur_char] = (open_counts[cur_char] + 1) % 2
        elif (text.startswith(splitter, position)
              and not any(open_counts.values())):
            # Split only when all parentheses and quotes are closed.
            split_list.append(text[segment_start:position].strip())
            position += splitter_len
            segment_start = position
            # The character right after the splitter has not been inspected
            # yet, so do not step over it.
            continue
        position += 1

    split_list.append(text[segment_start:].strip())
    return split_list
def parse_command(command):
    """Parse the command name(var1="val1",...) into a dictionary structure:

      E.g. call(destination="1245",opt="X") will be parsed into:

        { "__name__":    "call",
          "destination": "1245",
          "opt":         "X"}

    The name is everything before the first opening parenthesis.  The last
    character of the command is dropped without being checked; it is
    presumably the closing parenthesis.  Likewise, the first and the last
    character of every value are dropped; they are presumably the quotes.

    Return the parsed command in a dictionary.

    Raises Exception if the opening parenthesis is missing or if a slot does
    not have the form name=value.

    """
    try:
        paren_idx = command.index('(')
    except ValueError:
        raise Exception(
            "Parsing error in: %s. Missing opening parenthesis." % command)

    parsed = {"__name__": command[:paren_idx]}

    # The text between the opening parenthesis and the last character.
    arguments = command[paren_idx + 1:len(command) - 1]
    if not arguments:
        # There are no parameters.
        return parsed

    for slot in split_by(arguments, ',', '', '', '"'):
        parts = split_by(slot, '=', '', '', '"')
        n_parts = len(parts)
        if n_parts == 2:
            # There is a slot name and a value.
            slot_name, quoted_value = parts
            parsed[slot_name] = quoted_value[1:-1]
        elif n_parts == 1:
            raise Exception(("Parsing error in: {cmd}: {slot}. There is only "
                             "variable name")
                            .format(cmd=command, slot=unicode(parts)))
        else:
            raise Exception(
                "Parsing error in: %s: %s" % (command, str(parts)))

    return parsed
def min_edit_dist(target, source):
    """
    Computes the min edit distance from target to source.

    An insertion and a deletion cost 1 each, a substitution costs 2 and a
    match is free.

    :param target: a target sequence
    :param source: a source sequence
    :return: the minimal total cost as a float; 0.0 for two empty sequences

    """
    n = len(target)
    m = len(source)

    # previous[j] holds the distance between the already processed prefix of
    # the target and source[:j].  Index 0 stands for the empty prefix, which
    # is what lets the first elements of both sequences be compared as well.
    previous = [float(j) for j in range(m + 1)]
    for i in range(1, n + 1):
        current = [float(i)]
        for j in range(1, m + 1):
            substitution = 0 if target[i - 1] == source[j - 1] else 2
            current.append(min(previous[j] + 1,
                               current[j - 1] + 1,
                               previous[j - 1] + substitution))
        previous = current

    return previous[m]
def min_edit_ops(target, source,
                 cost=lambda insertions, deletions, substitutions:
                 insertions + deletions + 2.0 * substitutions):
    """
    Computes the min edit operations from target to source.

    An element of the target without a counterpart in the source is counted
    as an insertion; an element of the source without a counterpart in the
    target is counted as a deletion.  When several operations are equally
    cheap, a substitution (or a match) is preferred over an insertion, and an
    insertion over a deletion.

    :param target: a target sequence
    :param source: a source sequence
    :param cost: an expression for computing cost of the edit operations;
                 called with the numbers of insertions, deletions and
                 substitutions
    :return: a tuple of (insertions, deletions, substitutions)

    """
    n = len(target)
    m = len(source)

    # ops[i][j] holds the operation counts for target[:i] versus source[:j].
    ops = [[(0, 0, 0)] * (m + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        ins, dels, subs = ops[i - 1][0]
        ops[i][0] = (ins + 1, dels, subs)

    for j in range(1, m + 1):
        ins, dels, subs = ops[0][j - 1]
        ops[0][j] = (ins, dels + 1, subs)

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            above = ops[i - 1][j]
            left = ops[i][j - 1]
            diagonal = ops[i - 1][j - 1]

            via_insertion = (above[0] + 1, above[1], above[2])
            via_deletion = (left[0], left[1] + 1, left[2])
            if source[j - 1] != target[i - 1]:
                via_substitution = (diagonal[0], diagonal[1],
                                    diagonal[2] + 1)
            else:
                # The elements match, no operation is needed.
                via_substitution = diagonal

            insertion = cost(*via_insertion)
            deletion = cost(*via_deletion)
            substitution = cost(*via_substitution)

            if substitution <= insertion and substitution <= deletion:
                ops[i][j] = via_substitution
            elif insertion <= deletion and insertion <= substitution:
                ops[i][j] = via_insertion
            elif deletion <= insertion and deletion <= substitution:
                ops[i][j] = via_deletion
            else:
                # Reachable only if the costs cannot be ordered (e.g. NaN).
                raise Exception("min_edit_ops unexpected state")

    return ops[n][m]
class Escaper(object):
    """
    Creates a customised escaper for strings.  The characters that need
    escaping, as well as the one used for escaping can be specified.

    Escaping puts the escaper in front of every character to escape,
    including the escaper itself; unescaping removes it again.

    """
    # TODO Write tests.

    # Constants for types of characters in a text that has been escaped.
    ESCAPER = 0
    ESCAPED = 1
    NORMAL = 2

    def __init__(self, chars="'\"", escaper='\\', re_flags=0):
        """Constructs an escaper for escaping the specified characters.

        Arguments:
            chars -- a collection of characters to escape (default: "'\"")
            escaper -- the character used as the escaper (default: '\\')
            re_flags -- any regex flags (as defined in the built-in `re'
                module) to use when building the escaping regexp
                (default: 0); they are not applied to the unescaping regexp

        """
        # The escaper itself has to be escaped too, otherwise the escaped
        # text could not be unescaped unambiguously.
        self.rx = re.compile(Escaper.re_literal_list(chars + escaper),
                             re_flags)
        # In a replacement template only the backslash is special.  The
        # regex-escaped form of the escaper must not be used here: an escape
        # such as `\%' would be copied to the output with its backslash.
        self.sub = escaper.replace('\\', '\\\\') + '\\g<0>'
        # The escaper followed by any single character; `.' does not match a
        # newline because no flags are used here.
        self.unrx = re.compile(Escaper.re_literal(escaper) + '(.)')
        self.unsub = '\\1'

    # Characters that have a special meaning inside of a [] group.
    _re_br_spec_chars_rx = re.compile('[]\\\\^-]')

    @staticmethod
    def re_literal_list(chars):
        """
        Builds a [] group for a regular expression that matches exactly the
        characters specified.

        """
        return '[{esced}]'.format(
            esced=Escaper._re_br_spec_chars_rx.sub('\\\\\\g<0>', chars))

    # Characters that would form a special sequence when preceded by
    # a backslash in a regexp, and hence are used unescaped.
    _re_combining_chars = '1234567890AbBdDsSwWZafnrtvx'

    @staticmethod
    def re_literal(char):
        """
        Escapes the character so that when it is used in a regexp, it matches
        itself.

        """
        return char if (char in Escaper._re_combining_chars) else ('\\' + char)

    def escape(self, text):
        """Escapes the text using the parameters defined in the constructor."""
        return self.rx.sub(self.sub, text)

    def unescape(self, text):
        """
        Unescapes the text using the parameters defined in the constructor.

        Every escaper that is followed by another character (other than
        a newline) is removed and the following character is kept.

        """
        # TODO Test whether this picks disjunct matches (yes, it should).
        return self.unrx.sub(self.unsub, text)

    def annotate(self, esced):
        """
        Annotates each character of a text that has been escaped whether:

            Escaper.ESCAPER - it is the escape character
            Escaper.ESCAPED - it is a character that was escaped
            Escaper.NORMAL  - otherwise.

        It is expected that only parts of the text may have actually been
        escaped.

        Returns a list with the annotation values, co-indexed with characters
        of the input text.

        """
        annion = [Escaper.NORMAL] * len(esced)
        for match in self.unrx.finditer(esced):
            # Each match is exactly two characters long: the escaper and the
            # character it escapes.
            first = match.start()
            annion[first] = Escaper.ESCAPER
            annion[first + 1] = Escaper.ESCAPED
        return annion
def escape_special_characters_shell(text, characters="'\""):
    """
    Puts a backslash in front of every occurrence of each of the characters.

    This is a simple, best-effort helper meant for escaping quotes.  It is
    not guaranteed to produce the correct result!  The characters are handled
    one after another, and existing backslashes are not escaped unless the
    backslash is listed in `characters'.  If a reliable result is needed, use
    the `Escaper' class.

    text - the text to escape
    characters - an iterable of the characters to escape (default: "'\"")

    Returns the text with the backslashes added.

    """
    escaped = text
    for special in characters:
        escaped = escaped.replace(special, '\\' + special)
    return escaped