1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
"""This module implements the WordSub class, modelled after a recipe
in "Python Cookbook" (Recipe 3.14, "Replacing Multiple Patterns in a
Single Pass" by Xavier Defrang).
Usage:
Use this class like a dictionary to add before/after pairs:
> subber = WordSub()
> subber["before"] = "after"
> subber["begin"] = "end"
Use the sub() method to perform the substitution:
> print subber.sub("before we begin")
after we end
All matching is intelligently case-insensitive:
> print subber.sub("Before we BEGIN")
After we END
The 'before' words must be complete words -- no prefixes.
The following example illustrates this point:
> subber["he"] = "she"
> print subber.sub("he says he'd like to help her")
she says she'd like to help her
Note that "he" and "he'd" were replaced, but "help" and "her" were
not.
"""
# 'dict' objects weren't available to subclass from until version 2.2.
# Get around this by importing UserDict.UserDict if the built-in dict
# object isn't available.
try: dict
except: from UserDict import UserDict as dict
import ConfigParser
import re
import string
class WordSub(dict):
    """All-in-one multiple-string-substitution class.

    Behaves like a dictionary mapping "before" words to "after" words.
    Every pair that is assigned is stored in three case variants
    (lower case, Capitalized Words and UPPER CASE), and sub() replaces
    whole-word occurrences of any stored key with its value in a
    single pass over the text.
    """

    def _wordToRegex(self, word):
        """Return regex source text which matches word as a whole word."""
        return r"\b%s\b" % re.escape(word)

    def _update_regex(self):
        """Rebuild the compiled regex from the keys of the current
        dictionary and mark it as up to date.
        """
        # Alternation picks the first branch that matches, so try longer
        # keys first; otherwise a short key such as "he" could shadow a
        # longer key such as "he'd" depending on dictionary order.
        words = sorted(self.keys(), key=len, reverse=True)
        self._regex = re.compile("|".join(map(self._wordToRegex, words)))
        self._regexIsDirty = False

    def __init__(self, defaults=None):
        """Initialize the object, and populate it with the entries in
        the defaults dictionary (if one is given).
        """
        super(WordSub, self).__init__()
        self._regex = None
        self._regexIsDirty = True
        # None instead of a shared mutable {} default argument.
        if defaults is not None:
            for k, v in defaults.items():
                self[k] = v

    def __call__(self, match):
        """Handler invoked for each regex match; returns the replacement
        stored under the exact text that was matched.
        """
        return self[match.group(0)]

    def __setitem__(self, i, y):
        """Store the pair i -> y in lower, Capitalized and UPPER case."""
        self._regexIsDirty = True
        # Name the class explicitly: super(type(self), self) recurses
        # forever as soon as WordSub is subclassed.
        store = super(WordSub, self).__setitem__
        # for each entry the user adds, we actually add three entries:
        store(i.lower(), y.lower())  # key = value
        store(string.capwords(i), string.capwords(y))  # Key = Value
        store(i.upper(), y.upper())  # KEY = VALUE

    def sub(self, text):
        """Translate text, returns the modified text."""
        # With no entries the pattern would be empty, match the empty
        # string everywhere and fail on the lookup; nothing to replace.
        if not self:
            return text
        if self._regexIsDirty:
            self._update_regex()
        return self._regex.sub(self, text)
# self-test: run this module directly to exercise WordSub.
if __name__ == "__main__":
    subber = WordSub()
    # Populate the substitution table in a fixed order.
    for before, after in [("apple", "banana"),
                          ("orange", "pear"),
                          ("banana", "apple"),
                          ("he", "she"),
                          ("I'd", "I would")]:
        subber[before] = after

    # Test 1: each stored case variant is replaced by the matching
    # case variant of its value.
    inStr = "I'd like one apple, one Orange and one BANANA."
    outStr = "I Would like one banana, one Pear and one APPLE."
    result = subber.sub(inStr)
    # A parenthesised single argument prints the same text whether
    # print is a statement or a function.
    if result != outStr:
        print("Test #1 FAILED: '%s'" % result)
    else:
        print("Test #1 PASSED")

    # Test 2: "he" is replaced on its own and inside "he'd".
    inStr = "He said he'd like to go with me"
    outStr = "She said she'd like to go with me"
    result = subber.sub(inStr)
    if result != outStr:
        print("Test #2 FAILED: '%s'" % result)
    else:
        print("Test #2 PASSED")
|