python正则表达式

时间：2016-06-30 10:59:25 阅读：269 评论：0 收藏：0 [点我收藏+]

五、python正则表达式

标签： python 正则表达式

1. 正则表达式的使用

>>> p = re.compile("abc")
>>> m= p.match("abcdf")
>>> type(m)            
<type ‘_sre.SRE_Match‘>
>>> print m.group()
abc
>>> m= p.match("acbcdf")
>>> print m.group()     
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘group‘
>>> type(m)             #无匹配模式打印m为NoneType值
<type ‘NoneType‘>

2. re.compile()

3. 正则表达式介绍

元字符

. 匹配除换行符以外的任意字符（DOTALL 模式下也匹配换行符）
\ 转义字符
[…] 字符集合
\d 数字：[0-9]
\D 非数字[^0-9]
\s 空白字符[<空格>\t\r\n\f\v]
\S 非空白字符[^\s]
\w 单词字符[A-Za-z0-9_]
\W 非单词字符[^\w]

>>> p = re.compile(".")
>>> m1 = p.match(‘dfg‘)
>>> print m1.group()
d
>>> m1 = p.match(‘\n‘) 
>>> print m1.group()  
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘group‘
>>> p = re.compile("\.")
>>> m2 = p.match(‘abc.ef‘)
>>> print m2.group()      
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘group‘
>>> m2 = p.match(‘.‘)     
>>> print m2.group() 
.
>>> p = re.compile("\.")      
>>> m = p.findall("abc.ef.gh")   #findall查找所有
>>> print m
[‘.‘, ‘.‘]
>>> p = re.compile(‘[abc]‘)    #子集
>>> m= p.findall(‘abcdef‘)
>>> print m
[‘a‘, ‘b‘, ‘c‘]
>>> m1 = p.match(‘abcdef‘)
>>> print m1.group()
a

数量词

* 匹配前一个字符0次或者多次
+ 匹配前一个字符1次或者多次
? 匹配前一个字符0次或者1次
{m} 匹配前一个字符m次
{m,n} 匹配前一个字符m至n次
数量词? 变成非贪婪模式

>>> p = re.compile(‘[abc]*‘)
>>> print p.findall(‘abcdef‘)
[‘abc‘, ‘‘, ‘‘, ‘‘, ‘‘]
>>> p = re.compile(‘[abc]+‘) 
>>> print p.findall(‘abcdef‘)
[‘abc‘]
>>> p = re.compile(‘[abc]*?‘)       #非贪婪模式
>>> print p.findall(‘abcdef‘)
[‘‘, ‘‘, ‘‘, ‘‘, ‘‘, ‘‘, ‘‘]
>>> p = re.compile(‘[abc]+?‘)
>>> print p.findall(‘abcdef‘)
[‘a‘, ‘b‘, ‘c‘]
>>> p = re.compile(‘[abc]{2}‘)
>>> print p.findall(‘abcdef‘) 
[‘ab‘]
>>> print p.findall(‘ababab‘)
[‘ab‘, ‘ab‘, ‘ab‘]
>>> p = re.compile(‘[abc]{2,3}‘)
>>> print p.findall(‘abcabc‘)   
[‘abc‘, ‘abc‘]
>>> p = re.compile(‘[abc]{2,3}?‘)   #非贪婪模式
>>> print p.findall(‘abcabc‘)    
[‘ab‘, ‘ca‘, ‘bc‘]
>>>

边界符
^ 匹配字符串开头，多行匹配每一行开头
$ 匹配字符串末尾，多行匹配每一行末尾
\A 仅匹配字符串开头
\Z 仅匹配字符串末尾
\b 匹配单词边界，即 \w 和 \W 之间的位置（不消耗字符）

>>> p=re.compile(‘^[abc]*‘)
>>> p.findall("abcdef")
[‘abc‘]
>>> p.findall("bcdef")     
[‘bc‘]
>>> p.findall("def")  
[‘‘]
>>> p.findall("defabc")
[‘‘]
>>> p=re.compile(‘[abc]*‘)
>>> p.findall("defabc")   
[‘‘, ‘‘, ‘‘, ‘abc‘, ‘‘]
>>> p=re.compile(‘[^abc]*‘)    #^边界符放在[]内表示非abc字符的查找
>>> p.findall("defabc")    
[‘def‘, ‘‘, ‘‘, ‘‘, ‘‘]
>>> p=re.compile(‘^[abc]*e$‘)
>>> p.findall("defabc")      
[]
>>> p.findall("abcde") 
[]
>>> p.findall("abce") 
[‘abce‘]
>>> p.findall("bce") 
[‘bce‘]
>>> p.findall("ce") 
[‘ce‘]
>>> p=re.compile(‘^[abc]*?e$‘)
>>> p.findall("abce")         
[‘abce‘]

逻辑分组

|       左右表达式任意匹配一个
    先匹配左边一旦成功则跳过匹配右边
    如果|没有包含在()中，则它的作用范围是整个正则表达式
(…)     分组匹配，从左到右，每遇到一个 ( 编号+1 
    分组后面可加数量词
(?P<name>…) 除了分组序号外，指定一个 name的别名
\<number> 引用编号为<number>的分组匹配到的字符串
(?P=name) 引用别名为<name>的分组匹配到的串

>>> p=re.compile(‘abc‘)
>>> m =p.match(‘abcdef‘)
>>> print m.group()
abc
>>> dir(m)
[‘__class__‘, ‘__copy__‘, ‘__deepcopy__‘, ‘__delattr__‘, ‘__doc__‘, ‘__format__‘, ‘__getattribute__‘, ‘__hash__‘, ‘__init__‘, ‘__new__‘, ‘__reduce__‘, ‘__reduce_ex__‘, ‘__repr__‘, ‘__setattr__‘, ‘__sizeof__‘, ‘__str__‘, ‘__subclasshook__‘, ‘end‘, ‘endpos‘, ‘expand‘, ‘group‘, ‘groupdict‘, ‘groups‘, ‘lastgroup‘, ‘lastindex‘, ‘pos‘, ‘re‘, ‘regs‘, ‘span‘, ‘start‘, ‘string‘]
>>> type(m.groups())
<type ‘tuple‘>
>>> m.groupdict()
{}
>>> p=re.compile(‘(a)(b)(c)‘)
>>> m =p.match(‘abcdef‘)     
>>> m.group()
‘abc‘
>>> m.groups()
(‘a‘, ‘b‘, ‘c‘)
>>> m =p.match(‘abdef‘) 
>>> m.groups()         
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘groups‘
>>> m =p.match(‘abcdef‘)
>>> m.groups()          
(‘a‘, ‘b‘, ‘c‘)
>>> m.groupdict()       
{}
>>> p=re.compile(‘(a)b(c)‘)  
>>> m =p.match(‘abcdef‘)   
>>> m.groups()             
(‘a‘, ‘c‘)
>>> m =p.match(‘adcdef‘)   
>>> m.groups()          
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘groups‘
>>> p=re.compile(‘(?P<name>a)b(c)‘)  #命名分组与匿名分组混合使用
>>> m=p.match(‘abcdef‘) 
>>> m.groups()
(‘a‘, ‘c‘)
>>> m.groupdict()
{‘name‘: ‘a‘}
>>> m.group()    
‘abc‘
>>> p=re.compile(‘(?P<name>a)b(c)(?P=name)‘)
>>> p.findall(‘abcd‘)
[]
>>> p.findall(‘abca‘)
[(‘a‘, ‘c‘)]
>>> p.findall(‘abcaa‘)                      
[(‘a‘, ‘c‘)]
>>> p=re.compile(‘(?P<name>a)b(c)(?P=name)\1‘)
>>> p.findall(‘abcaa‘)                        
[]
>>> p=re.compile(r‘(?P<name>a)b(c)(?P=name)\1‘) #防止上句\1变成转义字符，r为原生字符。\1为编号的索引位置
>>> p.findall(‘abcaa‘)                         
[(‘a‘, ‘c‘)]
>>> p.findall(‘abcac‘)
[]
>>> p=re.compile(r‘(?P<name>a)b(c)(?P=name)\2‘) 
>>> p.findall(‘abcac‘)                         
[(‘a‘, ‘c‘)]
>>> m =p.match(‘abcac‘)
>>> m.group()
‘abcac‘
>>> m.groups()         
(‘a‘, ‘c‘)
>>> m.groupdict()
{‘name‘: ‘a‘}
>>> m.group(2)   
‘c‘

特殊构造

(?:…) (…)不分组版本，用于使用 | 或者后接数量词
(?iLmsux) iLmsux的每个字符代表一个匹配模式，只能用在正
则表达式的开头，可选多个
(?#...) #号后的内容将作为注释
(?=…) 之后的字符串内容需要匹配表达式才能成功匹配
(?!...) 之后的字符串不匹配表达式才能成功
(?<=…) 之前的字符串需要匹配表达式才能成功
(?<!...) 之前的字符串需要不匹配表达式才能成功
(?(id/name)yes|no) 如果编号为id/名字为name的组匹配到字符串，则需要匹配yes，否则匹配no，no可以省略

>>> p=re.compile(r‘(?:abc){2}‘)   #不分组模式
>>> p.findall(‘abc‘)
[]
>>> p.findall(‘abcabc‘)
[‘abcabc‘]
>>> p.findall(‘abcabcdef‘)
[‘abcabc‘]
>>> m=p.match("abcabc")
>>> m.groups()              #不是分组，只是格式串
()
>>> p=re.compile(r‘a(?=\d)‘)   
>>> p.findall(‘a1a2a3‘)
[‘a‘, ‘a‘, ‘a‘]
>>> p=re.compile(r‘\w(?=\d)‘)
>>> p.findall(‘word1 wor2 wo3‘)   
[‘d‘, ‘r‘, ‘o‘]
>>> p=re.compile(r‘\w+(?=\d)‘) 
>>> p.findall(‘word1 wor2 wo3‘)
[‘word‘, ‘wor‘, ‘wo‘]
>>> p=re.compile(r‘\w+?(?=\d)‘)     #非贪婪模式
>>> p.findall(‘word1 wor2 wo3‘)
[‘word‘, ‘wor‘, ‘wo‘]
>>> p=re.compile(r‘a(?!\d)‘)        #不匹配表达式
>>> p.findall(‘word1 wor2 wo3‘)
[]
>>> p.findall(‘word1 wor2 awo3‘)
[‘a‘]
>>> p=re.compile(r‘(?<=\d)a‘)   
>>> p.findall(‘word1 wor2 awo3‘)
[]
>>> p.findall(‘word1 wor2 awo3a‘)
[‘a‘]
>>> p=re.compile(r‘(?<!\d)a‘)     
>>> p.findall(‘word1 wora2 awo3a‘)
[‘a‘, ‘a‘]
>>> p=re.compile(r‘(\d)?abc(?(1)\b|abc)‘) #逻辑匹配
>>> p.findall(‘1abc4‘)                   
[]
>>> p.findall(‘1abcabc‘)
[‘‘]
>>> p.findall(‘abcabc‘) 
[‘‘]
>>> m=p.match (‘abcabc‘)
>>> m.group()
‘abcabc‘

iLmsux 正则表达式使用开关

i re.I 忽略大小写
L re.L 使用预定字符类 \w \W \b \B \s \S 取决当前区域设定
m re.M 多行模式改变^ 和 $ 的行为
s re.S 点任意匹配模式，使 . 也能匹配换行符
u re.U 使用预定字符类 \w \W \b \B \s \S \d \D 取决unicode定义的字符属性
x re.X 详细模式，可以多行，忽略空白字符，并且可以加入注释

>>> 
>>> p=re.compile(r‘(?i)abc‘)             
>>> p.findall(‘abcabc‘)     
[‘abc‘, ‘abc‘]
>>> p.findall(‘Abcabc‘)
[‘Abc‘, ‘abc‘]
>>> p=re.compile(r‘abc‘,re.I)   #常见用法，忽略大小写
>>> p.findall(‘Abcabc‘)      
[‘Abc‘, ‘abc‘]

4. 贪婪模式和非贪婪模式

正则表达式通常用于在文本中查找匹配的字符串。Python
里数量词默认是贪婪的（在少数语言里也可能是默认非贪
婪），总是尝试匹配尽可能多的字符；非贪婪的则相反，
总是尝试匹配尽可能少的字符。例如：正则表达式"ab*"如
果用于查找"abbbc"，将找到"abbb"。而如果使用非贪婪
的数量词"ab*?"，将找到"a"。

5. re模块的其他工具

re.compile(strPattern[, flag]) #函数返回pattern模式对象
pattern 对象的常用方法：
match 从文本开头开始匹配，匹配成功返回结果对象，开头不匹配则返回None
search 返回查找的结果对象，在整个目标文本内查找

>>> m=p.match(‘cdabcaaa‘)
>>> m.group()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: ‘NoneType‘ object has no attribute ‘group‘
>>> m=p.search(‘cdabcaaa‘)  
>>> m.group()
‘abc‘

split 字符串分拆
findall 查找目标串中所有满足条件的pattern的字符串，返回一个列表
finditer 返回一个迭代器，逐个产生每次匹配的结果对象

>>> p=re.compile(‘a‘)
>>> m=p.finditer(‘abcaaadefaa‘)
>>> type(m)
<type ‘callable-iterator‘>
>>> m.next()
<_sre.SRE_Match object at 0x7fc971e3a5e0>
>>> dir(m.next())
[‘__class__‘, ‘__copy__‘, ‘__deepcopy__‘, ‘__delattr__‘, ‘__doc__‘, ‘__format__‘, ‘__getattribute__‘, ‘__hash__‘, ‘__init__‘, ‘__new__‘, ‘__reduce__‘, ‘__reduce_ex__‘, ‘__repr__‘, ‘__setattr__‘, ‘__sizeof__‘, ‘__str__‘, ‘__subclasshook__‘, ‘end‘, ‘endpos‘, ‘expand‘, ‘group‘, ‘groupdict‘, ‘groups‘, ‘lastgroup‘, ‘lastindex‘, ‘pos‘, ‘re‘, ‘regs‘, ‘span‘, ‘start‘, ‘string‘]
>>> m.next().group()
‘a‘
>>> m.next().group()
‘a‘
>>> m.next().group()
‘a‘
>>> m.next().group()
‘a‘
>>> m.next().group()      #目标串中共6个a，全部取完后再调用会抛出StopIteration
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
StopIteration

sub 用替换串（或替换函数的返回值）替换目标串中所有匹配的子串，返回替换后的新字符串

>>> import re
>>> p=re.compile(r‘(\w+) (\w+)‘)  #使用效果函数进行位置替换
>>> s=‘hi you ,good boy‘
>>> print p.sub(r‘\2 \1‘,s)
you hi ,boy good

python正则表达式

原文：http://blog.csdn.net/refuil/article/details/51787334

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)