2022年10月4日星期二

python3 regex .* vs .*? : match more or less, match, search, findall, sub

1.  .* vs  .*? :

import re

input='helloworld, 123abc,test'
pattern1=r'.*(\d+)'    # raw string, so \d is passed to the regex engine unchanged
pattern2=r'.*?(\d+)'

result1=re.match(pattern1, input)
result2=re.match(pattern2, input)

print(result1.group(1)) 
=> 3    # .* is greedy: it consumes as much as possible, leaving only the last digit (3) for \d+

print(result2.group(1)) 
=> 123 # .*? is lazy (non-greedy): it consumes as little as possible, so \d+ matches 123. This is usually what you want.

2. match, search, findall
pattern='test'
input2=' a test, b test '
m=re.match(pattern, input2)
print(m)
=>None
m=re.search(pattern, input2)
print(m)
=><_sre.SRE_Match object; span=(3, 7), match='test'>
print(m.group())
=>test
m=re.findall(pattern, input2)
print(m)
=>['test', 'test']

3. replace by sub() 
html = '''    <ul id="list" class="list-group">
        <li data-view="5"><a href="null">test1</a></li>
        <li data-view="6"><a href="null">test2</a></li>
    </ul>
    '''

html = re.sub('<a.*?>|</a>', '', html)
print(html)
=>    <ul id="list" class="list-group">
        <li data-view="5">test1</li>
        <li data-view="6">test2</li>
 </ul>

pattern = '<li.*?>(.*?)</li>'
results = re.findall(pattern, html, re.S)  # re.S (DOTALL): '.' also matches newlines
print(results)=> ['test1', 'test2']
pattern = '<li.*>(.*)</li>'
results = re.findall(pattern, html, re.S)  # re.S (DOTALL): '.' also matches newlines
print(results)=> ['test2']    # greedy .* spans from the first <li to the last one

4. get match group by name 
pattern = '<li.*?>(?P<text>.*?)</li>'
results = re.search(pattern, html, re.S)  # re.S (DOTALL): '.' also matches newlines
print(results.group(1))=>test1
print(results.group("text"))=>test1

5. match lookahead assertion. (?=, ?! )
input1='hello world!'
input2='hello goodbye!'

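pattern1='hello (?=world)'  # (?=world): positive lookahead, matches only if followed by "world"
pattern2='hello (?!world)'  # (?!world): negative lookahead, matches only if NOT followed by "world"

r1=re.match(pattern1, input1)
print(r1.group())=>hello     # the lookahead text itself is not part of the match

r2=re.match(pattern2, input1)
print(r2)=>None              # no match, so r2 is None (calling r2.group() would raise AttributeError)
r3=re.match(pattern2, input2)
print(r3.group())=>hello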

6. match lookbehind assertion. (?<=,?<! )
>>> m = re.search('(?<=abc)def', 'abcdef')  #?<=lookbehind match
>>> m.group(0)
'def'

>>> m = re.search(r'(?<=-)\w+', 'spam-egg')     # \w+ must be preceded by a hyphen
>>> m.group(0)
'egg'



没有评论: