Python正则表达式详细剖析
正则表达式
正则表达式(或 RE)指定与其匹配的一组字符串;该模块中的函数可让您检查特定字符串是否与给定的正则表达式匹配(或者给定的正则表达式是否与特定字符串匹配,这归结为同一件事)。下面将举例并详细解析。
下表提供了可在正则表达式中使用的特殊模式匹配字符的列表和说明。
比较 HTML 标签:
标签类型 | 格式 | 例子 |
---|---|---|
开放标签 | <[^/>][^>]*> | <html>, <table> |
关闭标签 | </[^>]+> | </html>, </table> |
自我关闭 | <[^/>]+/> | <br/> |
所有标签 | <[^>]+> | <html>, <br/> |
# open tag
>>> import re
>>> re.search('<[^/>][^>]*>', '<table>') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '<a href="#">') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '<br>') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '</table>') != None
False
# close tag
>>> import re
>>> re.search('</[^>]+>', '</table>') != None
True
# self close
>>> import re
>>> re.search('<[^/>]+/>', '<br/>') != None
True
re.findall() 匹配字符串:
# split all string
>>> import re
>>> source = "split all string"
>>> re.findall('[\w]+', source)
['split', 'all', 'string']
# parsing python.org website
>>> import urllib
>>> import re
>>> x = urllib.urlopen('https://www.w3resource.org')
>>> html = x.read()
>>> x.close()
>>> print("open tags")
open tags
>>> re.findall('<[^/>][^>]*>', html)[0:2]
['<!DOCTYPE html>', '<html>']
>>> print("close tags")
close tags
>>> re.findall(']+>', html)[0:2]
['</title>', '</head>']
>>> print("self-closing tags")
组间比较:
# (...) group a regular expression
>>> import re
>>> mon = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2018-09-01')
>>> mon
<_sre.SRE_Match object at 0x019A72F0>
>>> mon.groups()
('2018', '09', '01')
>>> mon.group()
'2018-09-01'
>>> mon.group(1)
'2018'
>>> mon.group(2)
'09'
>>> mon.group(3)
'01'
# Nesting groups
>>> import re
>>> mon = re.search(r'(((\d{4})-\d{2})-\d{2})', '2018-09-01')
>>> mon.groups()
('2018-09-01', '2018-09', '2018')
>>> mon.group()
'2018-09-01'
>>> mon.group(1)
'2018-09-01'
>>> mon.group(2)
'2018-09'
>>> mon.group(3)
'2018'
非捕获组:
# non capturing group
>>> import re
>>> url = 'http://w3resource.com/'
>>> mon = re.search('(?:http|ftp)://([^/\r\n]+)(/[^\r\n]*)?', url)
>>> mon.groups()
('w3resource.com', '/')
# capturing group
>>> import re
>>> mon = re.search('(http|ftp)://([^/\r\n]+)(/[^\r\n]*)?', url)
>>> mon.groups()
('http', 'w3resource.com', '/')
回溯参考:
# compare 'xx', 'yy'
>>> import re
>>> re.search(r'([a-z])\1$','xx') != None
True
>>> import re
>>> re.search(r'([a-z])\1$','yy') != None
True
>>> import re
>>> re.search(r'([a-z])\1$','xy') != None
False
# compare open tag and close tag
>>> import re
>>> pattern = r'<([^>]+)>[\s\S]*?</\1>'
>>> re.search(pattern, '<bold> test </bold>') != None
True
>>> re.search(pattern, '<h1>\ntest\n</h1>') != None
True
>>> re.search(pattern, '<bold> test </strong>') != None
False
命名分组 (?P<name>) :
# group reference ``(?P<name>...)``
>>> import re
>>> pattern = '(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
>>> mon = re.search(pattern, '2018-09-01')
>>> mon.group('year')
'2018'
>>> mon.group('month')
'09'
>>> mon.group('day')
'01'
# back reference ``(?P=name)``
>>> import re
>>> re.search('^(?P<char>[a-z])(?P=char)','aa')
<_sre.SRE_Match object at 0x01A08660>
替换字符串:
# basic substitute
>>> import re
>>> res = "4x5y6z"
>>> re.sub(r'[a-z]',' ', res)
'4 5 6 '
# substitute with group reference
>>> import re
>>> date = r'2018-09-01'
>>> re.sub(r'(\d{4})-(\d{2})-(\d{2})',r'\2/\3/\1/',date)
'09/01/2018/'
# camelcase to underscore
>>> def convert(s):
... res = re.sub(r'(.)([A-Z][a-z]+)',r'\1_\2', s)
... return re.sub(r'([a-z])([A-Z])',r'\1_\2', res).lower()
...
>>> convert('SentenceCase')
'sentence_case'
>>> convert('SentenceSentenceCase')
'sentence_sentence_case'
>>> convert('SampleExampleHTTPServer')
'sample_example_http_server'
环视四周 :符号 | 比较方向 |
---|---|
(?<=...) | 右到左 |
(?=...) | 左到右 |
(?<!...) | 右到左 |
(?!...) | 左到右 |
# basic
>>> import re
>>> re.sub('(?=\d{3})', ' ', '56789')
' 5 6 789'
>>> re.sub('(?!\d{3})', ' ', '56789')
'567 8 9 '
>>> re.sub('(?<=\d{3})', ' ', '56789')
'567 8 9 '
>>> re.sub('(?<!\d{3})', ' ', '56789')
' 5 6 789'
匹配常用用户名或密码:
>>> import re
>>> re.match('^[a-zA-Z0-9-_]{3,16}$', 'Foo') is not None
True
>>> re.match('^\w|[-_]{3,16}$', 'Foo') is not None
True
匹配十六进制颜色值:
>>> import re
>>> re.match('^#?([a-f0-9]{6}|[a-f0-9]{3})$', '#ff0000')
<_sre.SRE_Match object at 0x019E7720>
>>> re.match('^#?([a-f0-9]{6}|[a-f0-9]{3})$', '#000000')
<_sre.SRE_Match object at 0x019E77A0>
匹配电子邮件:
>>> import re
>>> re.match('^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$',
'citi.world@example.com')
<_sre.SRE_Match object; span=(0, 22), match='citi.world@example.com'>
# or
>>> import re
>>> example = re.compile(r'''^([a-zA-Z0-9._%-]+@
[a-zA-Z0-9.-]+
\.[a-zA-Z]{2,4})*$''', re.X)
>>> example.match('citi.world@example.citi.com')
<_sre.SRE_Match object; span=(0, 27), match='citi.world@example.citi.com'>
>>> example.match('citi%world@example.citi.com')
<_sre.SRE_Match object; span=(0, 27), match='citi%world@example.citi.com'>
匹配网址:
>>> import re
>>> example = re.compile(r'''^(https?:\/\/)? # match http or https
... ([\da-z\.-]+) # match domain
... \.([a-z\.]{2,6}) # match domain
... ([\/\w \.-]*)\/?$ # match api or file
... ''', re.X)
>>> example.match('www.yahoo.com')
<_sre.SRE_Match object; span=(0, 13), match='www.yahoo.com'>
>>> example.match('http://www.example')
<_sre.SRE_Match object; span=(0, 18), match='http://www.example'>
>>> example.match('http://www.example/w3r.html')
<_sre.SRE_Match object; span=(0, 27), match='http://www.example/w3r.html'>
>>> example.match('http://www.example/w3r!.html')
>>> example
re.compile('^(https?:\\/\\/)?\n([\\da-z\\.-]+)\n\\.([a-z\\.]{2,6})\n([\\/\\w \\.-]*)\\/?$\n', re.VERBOSE)
匹配IP地址:
符号 | 描述 |
---|---|
[1]?[0-9][0-9] | 匹配 0-199 模式 |
2[0-4][0-9] | 匹配 200-249 模式 |
25[0-5] | 匹配 251-255 模式 |
(?:...) | 不抓组 |
>>> import re
>>> example = re.compile(r'''^(?:(?:25[0-5]
... |2[0-4][0-9]
... |[1]?[0-9][0-9]?)\.){3}
... (?:25[0-5]
... |2[0-4][0-9]
... |[1]?[0-9][0-9]?)$''', re.X)
>>> example.match('192.168.1.1')
<_sre.SRE_Match object at 0x0134A608>
>>> example.match('255.255.255.0')
<_sre.SRE_Match object at 0x01938678>
>>> example.match('172.17.0.5')
<_sre.SRE_Match object at 0x0134A608>
>>> example.match('256.0.0.0') is None
True
匹配Mac地址:
>>> import random
>>> mac = [random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b)]
>>> mac = ':'.join(map(lambda x: "%02x" % x, mac))
>>> mac
'05:38:64:60:55:63'
>>> import re
>>> example = re.compile(r'''[0-9a-f]{2}([:])
... [0-9a-f]{2}
... (\1[0-9a-f]{2}){4}
... ''', re.X)
>>> example.match(mac) is not None
True
词法分析器:
>>> import re
>>> from collections import namedtuple
>>> tokens = [r'(?P<NUMBER>\d+)',
r'(?P<PLUS>\+)',
r'(?P<MINUS>-)',
r'(?P<TIMES>\*)',
r'(?P<DIVIDE>/)',
r'(?P<WS>\s+)']
>>> lex = re.compile('|'.join(tokens))
>>> Token = namedtuple('Token', ['type', 'value'])
>>> def tokenize(text):
scan = lex.scanner(text)
return (Token(m.lastgroup, m.group())
for m in iter(scan.match, None) if m.lastgroup != 'WS')
>>> for _t in tokenize('9 + 5 * 2 - 7'):
print(_t)
Token(type='NUMBER', value='9')
Token(type='PLUS', value='+')
Token(type='NUMBER', value='5')
Token(type='TIMES', value='*')
Token(type='NUMBER', value='2')
Token(type='MINUS', value='-')
Token(type='NUMBER', value='7')
>>> tokens
['(?P<NUMBER>\\d+)', '(?P<PLUS>\\+)', '(?P<MINUS>-)', '(?P<TIMES>\\*)', '(?P<DIVIDE>/)', '(?P<WS>\\s+)']
版权所属:SO JSON在线解析
原文地址:https://www.sojson.com/blog/512.html
转载时必须以链接形式注明原始出处及本声明。
本文主题:
如果本文对你有帮助,那么请你赞助我,让我更有激情的写下去,帮助更多的人。