理解完正则学会一半:
import re


def test_patterns(text, patterns=None):
    """Print every match of each pattern found in the source text.

    A two-line column ruler (tens digit over units digit) and the text
    itself are printed first, so match offsets can be read off visually.
    Then, for each pattern, every non-overlapping match reported by
    ``re.finditer`` is printed as ``start : inclusive_end = "matched text"``.

    Args:
        text: The string to search.
        patterns: Iterable of regular-expression strings to try against
            ``text``. Defaults to no patterns, in which case only the
            ruler and the text are printed.

    Returns:
        None. All results are written to stdout.
    """
    # A None sentinel replaces the original mutable ``[]`` default.
    if patterns is None:
        patterns = []

    # ``print('')`` emits a blank line on both Python 2 and Python 3.
    print('')
    # Tens row of the ruler: floor division keeps the digits integral on
    # Python 3, and ``or ' '`` leaves columns 0-9 blank instead of "0".
    print(''.join(str(i // 10 or ' ') for i in range(len(text))))
    # Units row of the ruler.
    print(''.join(str(i % 10) for i in range(len(text))))
    print(text)

    # Look for each pattern in the text and print the results.
    for pattern in patterns:
        print('')
        print('Matching "%s"' % pattern)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            # match.end() is exclusive; e - 1 is the last matched index.
            print(' %2d : %2d = "%s"' % (s, e - 1, text[s:e]))


# The "test_" prefix would make pytest collect this helper as a test case
# (and then fail looking for a "text" fixture); opt it out explicitly.
test_patterns.__test__ = False


if __name__ == '__main__':
    print("*" * 50)
    # Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])

    print("*" * 50)
    # Repetition
    test_patterns('abbaaabbbbaaaaa', [
        'ab*',      # a followed by zero or more b
        'ab+',      # a followed by one or more b
        'ab?',      # a followed by zero or one b
        'ab{3}',    # a followed by three b
        'ab{2,3}',  # a followed by two to three b
    ])

    print("*" * 50)
    # Character Sets
    test_patterns('abbaaabbbbaaaaa', [
        '[ab]',     # either a or b
        'a[ab]+',   # a followed by one or more a or b
        'a[ab]+?',  # a followed by one or more a or b, not greedy
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[^-. ]+',  # sequences without -, ., or space
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[a-z]+',       # sequences of lower case letters
        '[A-Z]+',       # sequences of upper case letters
        '[a-zA-Z]+',    # sequences of lower or upper case letters
        '[A-Z][a-z]+',  # one upper case letter followed by lower case letters
    ])

    print("*" * 50)
    test_patterns('abbaaabbbbaaaaa', [
        'a.',     # a followed by any one character
        'b.',     # b followed by any one character
        'a.*b',   # a followed by anything, ending in b
        'a.*?b',  # a followed by anything, ending in b (not greedy)
    ])

    print("*" * 50)
    # Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!', [
        r'\d+',  # sequence of digits
        r'\D+',  # sequence of non-digits
        r'\s+',  # sequence of whitespace
        r'\S+',  # sequence of non-whitespace
        r'\w+',  # alphanumeric characters
        r'\W+',  # non-alphanumeric
    ])

    print("*" * 50)
    # Match the escape codes themselves literally by escaping the
    # backslash and the plus sign.
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+', [
        r'\\d\+',
        r'\\D\+',
        r'\\s\+',
        r'\\S\+',
        r'\\w\+',
        r'\\W\+',
    ])

    # Anchoring
    # Code  Meaning
    # ^     start of string, or line
    # $     end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        r'^\w+',      # word at start of string
        r'\A\w+',     # word at start of string
        r'\w+\S*$',   # word at end of string, with optional punctuation
        r'\w+\S*\Z',  # word at end of string, with optional punctuation
        r'\w*t\w*',   # word containing 't'
        r'\bt\w+',    # 't' at start of word
        r'\w+t\b',    # 't' at end of word
        r'\Bt\B',     # 't', not start or end of word
    ])

# Output of running this script:
************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "ab" 0 : 1 = "ab" 5 : 6 = "ab" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "ab*" 0 : 2 = "abb" 3 : 3 = "a" 4 : 4 = "a" 5 : 9 = "abbbb" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "ab+" 0 : 2 = "abb" 5 : 9 = "abbbb" Matching "ab?" 0 : 1 = "ab" 3 : 3 = "a" 4 : 4 = "a" 5 : 6 = "ab" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "ab{3}" 5 : 8 = "abbb" Matching "ab{2,3}" 0 : 2 = "abb" 5 : 8 = "abbb" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "[ab]" 0 : 0 = "a" 1 : 1 = "b" 2 : 2 = "b" 3 : 3 = "a" 4 : 4 = "a" 5 : 5 = "a" 6 : 6 = "b" 7 : 7 = "b" 8 : 8 = "b" 9 : 9 = "b" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "a[ab]+" 0 : 14 = "abbaaabbbbaaaaa" Matching "a[ab]+?" 0 : 1 = "ab" 3 : 4 = "aa" 5 : 6 = "ab" 10 : 11 = "aa" 12 : 13 = "aa" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "[^-. ]+" 0 : 3 = "This" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "[a-z]+" 1 : 3 = "his" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "[A-Z]+" 0 : 0 = "T" Matching "[a-zA-Z]+" 0 : 3 = "This" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "[A-Z][a-z]+" 0 : 3 = "This" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "a." 0 : 1 = "ab" 3 : 4 = "aa" 5 : 6 = "ab" 10 : 11 = "aa" 12 : 13 = "aa" Matching "b." 
1 : 2 = "bb" 6 : 7 = "bb" 8 : 9 = "bb" Matching "a.*b" 0 : 9 = "abbaaabbbb" Matching "a.*?b" 0 : 1 = "ab" 3 : 6 = "aaab" ************************************************** 11111111112222222 012345678901234567890123456 This is a prime #1 example! Matching "\d+" 17 : 17 = "1" Matching "\D+" 0 : 16 = "This is a prime #" 18 : 26 = " example!" Matching "\s+" 4 : 4 = " " 7 : 7 = " " 9 : 9 = " " 15 : 15 = " " 18 : 18 = " " Matching "\S+" 0 : 3 = "This" 5 : 6 = "is" 8 : 8 = "a" 10 : 14 = "prime" 16 : 17 = "#1" 19 : 26 = "example!" Matching "\w+" 0 : 3 = "This" 5 : 6 = "is" 8 : 8 = "a" 10 : 14 = "prime" 17 : 17 = "1" 19 : 25 = "example" Matching "\W+" 4 : 4 = " " 7 : 7 = " " 9 : 9 = " " 15 : 16 = " #" 18 : 18 = " " 26 : 26 = "!" ************************************************** 1111111111222 01234567890123456789012 \d+ \D+ \s+ \S+ \w+ \W+ Matching "\\d\+" 0 : 2 = "\d+" Matching "\\D\+" 4 : 6 = "\D+" Matching "\\s\+" 8 : 10 = "\s+" Matching "\\S\+" 12 : 14 = "\S+" Matching "\\w\+" 16 : 18 = "\w+" Matching "\\W\+" 20 : 22 = "\W+" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "^\w+" 0 : 3 = "This" Matching "\A\w+" 0 : 3 = "This" Matching "\w+\S*$" 26 : 37 = "punctuation." Matching "\w+\S*\Z" 26 : 37 = "punctuation." Matching "\w*t\w*" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "\bt\w+" 13 : 16 = "text" Matching "\w+t\b" 13 : 16 = "text" Matching "\Bt\B" 23 : 23 = "t" 30 : 30 = "t" 33 : 33 = "t"待续...