9  正则

9.1 正则表达式

9.1.1 字母数字字符(Word character)

`\w` 匹配字母、数字和下划线(对 `str` 模式默认还包括汉字等 Unicode 文字字符),不包含标点符号。

import re

# Sample sentence mixing CJK characters, ASCII letters, digits and two punctuation marks.
text = '这个单词是由-EabcdxyxG共11个字符组成的吗?'

# \w matches exactly one word character per hit, so findall returns one-character strings.
pattern = r'\w'
print(re.findall(pattern, text))
['这', '个', '单', '词', '是', '由', 'E', 'a', 'b', 'c', 'd', 'x', 'y', 'x', 'G', '共', '1', '1', '个', '字', '符', '组', '成', '的', '吗']
# \w+ matches runs of one or more consecutive word characters.

pattern = r'\w+'
print(re.findall(pattern, text))
['这个单词是由', 'EabcdxyxG共11个字符组成的吗']

9.1.2 非单词字符

# \W matches every character that is NOT a word character, e.g. punctuation marks.
pattern = r'\W'
print(re.findall(pattern, text))
['-', '?']

9.2 分割

# Text to split.
text = "Hello,world!How are you?"

# Split on any single comma, period, exclamation mark or question mark.
# The text ends with a delimiter, so the result ends with an empty string.
result = re.split(r'[,.!?]', text)

# Show the pieces.
print(result)

# `maxsplit` limits the number of splits; the rest of the string is returned unsplit as the last item.
result = re.split(r'[,.!?]', text, maxsplit=1)

print(result)


# maxsplit=0 is the default and means "split as many times as possible".
result = re.split(r'[,.!?]', text, maxsplit=0)

print(result)
['Hello', 'world', 'How are you', '']
['Hello', 'world!How are you?']
['Hello', 'world', 'How are you', '']

9.2.1 保留分割符

text = "Hello|World@How|Are|You"

# Split right after each "|" or "@" so the delimiter stays attached to the
# piece in front of it. The look-behind (?<=...) is zero-width: it consumes
# nothing, so no character is dropped. Splitting on an empty match requires
# Python 3.7 or newer.
# A plain raw string is enough: the pattern has nothing to interpolate, so the
# f-prefix was dropped.
result = re.split(r"(?<=[@|])", text)

print(result)
['Hello|', 'World@', 'How|', 'Are|', 'You']
?re.split

9.3 替换

import re

# Input string.
text = "apple banana cherry"
# re.sub() replaces every match of the pattern "banana" with "orange".
new_text = re.sub(r"banana", "orange", text)
print(new_text)  # prints: apple orange cherry
apple orange cherry

9.3.1 支持替换函数

def to_uppercase(match):
    """Replacement callback for re.sub(): return the matched text upper-cased.

    `match` is the match object handed over for each hit; the whole match
    (group 0) is converted, not an individual capture group.
    """
    whole_match = match.group()
    return whole_match.upper()

# Input string.
text = "apple banana cherry"
# The replacement may be a function: re.sub() calls it with each match of a
# lowercase word and inserts the string it returns.
new_text = re.sub(r"\b[a-z]+\b", to_uppercase, text)
print(new_text)  # prints: APPLE BANANA CHERRY
APPLE BANANA CHERRY

9.4 待分类入库

9.5 quiz 1

Check if a string contains the word `word` in it (case insensitive). If you have no idea, I guess you could try `/word/`.

import re
# \b on both sides restricts the match to the standalone word, so "words" is not matched.
ptn = r'\bword\b'
txt = "I have no word to say, because i always have words with him using Word."
re.findall(ptn, txt)  # case-sensitive: only the lowercase "word" matches
re.findall(re.compile(ptn, re.I), txt)  # case-insensitive; re.I is short for re.IGNORECASE
['word', 'Word']

9.6 quiz 2

Use substitution to replace every occurrence of the word i with the word I (uppercase, I as in me). E.g.: i'm replacing it. am i not? -> I'm replacing it. am I not?

# Sample input containing the lowercase pronoun "i" several times.
txt = '''i''ll use it on input to fix my lazy spelling! i mean... i just wanted to check if you understood how it worked.'''
# \b...\b matches "i" only as a standalone word, so the "i" inside "it" or "input" is left alone.
ptn =  r'\bi\b'
re.sub(ptn, "I", txt)
"I''ll use it on input to fix my lazy spelling! I mean... I just wanted to check if you understood how it worked."

10 字母

10.1 quiz 3

With regex you can count the number of matches. Can you make it return the number of uppercase consonants (B,C,D,F,..,X,Y,Z) in a given string? E.g.: it should return 3 with the text ABcDeFO!. Note: Only ASCII. We consider Y to be a consonant!

txt = 'You should use the WHO guide to treat COVID-19.'

# Uppercase ASCII letters minus the vowels A, E, I, O, U (Y counts as a consonant).
ptn = r'[B-DF-HJ-NP-TV-Z]'

# findall returns the matched letters; wrap the call in len() to get the count the quiz asks for.
re.findall(ptn, txt)
['Y', 'W', 'H', 'C', 'V', 'D']

10.2 quiz 4

Oh no! It seems my friends spilled beer all over my keyboard last night and my keys are super sticky now. Some of the time whennn I press a key, I get two duplicates.

Can you ppplease help me fix thhhis?

txt = '''Oh no! It seems my friends spilled beer all over my keyboard last night and my keys are super sticky now. Some of the time whennn I press a key, I get two duplicates.

Can you ppplease help me fix thhhis?'''

# A sticky key shows up as the SAME letter three times in a row, so the repeat
# has to be written with a backreference: ([A-Za-z]) captures one letter and
# \1{2} demands two more copies of that very letter. The earlier form
# ([nph]){3} matched any three characters from the set (e.g. "php") and
# collapsed them into one.
ptn = r'([A-Za-z])\1{2}'

# r"\1" in the replacement is capture group 1, i.e. the repeated letter (some
# other languages write it as $1). count=0, the default, replaces every match;
# it is passed by keyword because positional count/flags are deprecated since
# Python 3.13. No flag is needed: the pattern uses neither ^ nor $.
re.sub(ptn, r"\1", txt, count=0)
'Oh no! It seems my friends spilled beer all over my keyboard last night and my keys are super sticky now. Some of the time when I press a key, I get two duplicates.\n\nCan you please help me fix this?'

10.3 quiz 5

Match positive integers less than or equal to 255

import re

# One alternative per value range, wrapped in \b...\b so only whole numbers match:
#   25[0-5]       250-255
#   2[0-4][0-9]   200-249
#   1[0-9][0-9]   100-199
#   [1-9]?[0-9]   0-99 without a leading zero
# The earlier alternative 2[0-5][0-5] wrongly rejected values whose last digit
# is above 5, such as 206 or 249.
# NOTE(review): 0 is still accepted although the task says "positive"; switch
# the last alternative to [1-9][0-9]? if 0 must be excluded.
ptn = r"\b(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\b"

txt = "11 123 501 9 10 31 30 2530 0 255 -1 256 -125 100 199"

# Drop negative integers first so their digits are not picked up as positive numbers.
txt = re.sub(r'-\d+', '', txt)

re.findall(ptn, txt)
['11', '123', '9', '10', '31', '30', '0', '255', '100', '199']

11 quiz 6

Validate IPv4 address

import re


# Strict pattern: an octet may not carry a leading zero ("01" is rejected).
# \Z anchors at the very end of the string; "$" would also accept an address
# followed by a trailing newline.
ptn1 = r'^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\Z'
# Lenient pattern: octets with leading zeros ("01", "023") are accepted as well.
# The \b around the anchors was redundant (the pattern starts and ends with a digit).
ptn2 = r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.)){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\Z'




txt1 = "1.1.23.269"
txt2 = "1.1.23.169"
txt3 = "01.1.023.169"

for txt in [txt1, txt2, txt3]:
    # Evaluate each pattern once and reuse the verdicts for all three reports.
    ok1 = re.search(ptn1, txt) is not None
    ok2 = re.search(ptn2, txt) is not None

    if ok1:
        print("{} : Valid IPv4 address using ptn1.".format(txt))
    else:
        print("{} : Invalid IPv4 address using ptn1.".format(txt))

    if ok2:
        print("{} : Valid IPv4 address using ptn2.".format(txt))
    else:
        print("{} : Invalid IPv4 address using ptn2.".format(txt))

    # Extra summary line when neither pattern accepts the input.
    if not ok1 and not ok2:
        print("{} : Invalid IPv4 address using ptn1 and ptn2.".format(txt))
1.1.23.269 : Invalid IPv4 address using ptn1.
1.1.23.269 : Invalid IPv4 address using ptn2.
1.1.23.269 : Invalid IPv4 address using ptn1 and ptn2.
1.1.23.169 : Valid IPv4 address using ptn1.
1.1.23.169 : Valid IPv4 address using ptn2.
01.1.023.169 : Invalid IPv4 address using ptn1.
01.1.023.169 : Valid IPv4 address using ptn2.

12 quiz 7

Find IPv4 address

import re

# Unanchored patterns for locating addresses inside longer text.
# The digit look-arounds (?<!\d) / (?!\d) stop a match from starting or ending
# in the middle of a longer digit run; without them the invalid "1.1.23.269"
# yields the bogus hit "1.1.23.26". \b cannot do this job here because the CJK
# characters next to the addresses count as word characters.
ptn1 = r'(?<!\d)((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(?!\d)'  # octets with a leading zero are rejected
ptn2 = r'(?<!\d)((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.)){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?!\d)'  # octets with a leading zero are accepted


txt1 = "1.1.23.269是IP地址"
txt2 = "判断10.1.23.169是IP地址,203.245.25.67也是IP地址"
txt3 = "01.1.023.169"

re.search(ptn2, txt2)  # search() returns only the first match

def regexMatchAny(pattern: str, rawText) -> list:
    """Return the full text of every non-overlapping match of `pattern` in `rawText`.

    Unlike re.findall(), which returns the capture groups when the pattern
    contains any, each item here is the whole match (group 0).

    Args:
        pattern: Regular expression to search for.
        rawText: String to scan.

    Returns:
        The matched substrings in left-to-right order; an empty list when
        nothing matches.
    """
    # finditer resumes exactly where the previous match ended. The earlier
    # manual loop restarted at end + 1 and therefore lost any match that
    # began right after the previous one (e.g. r'\d' on "123" gave ['1', '3']).
    return [hit.group(0) for hit in re.finditer(pattern, rawText)]

# Collect the full text of every IPv4-looking match in txt2, not just the first one.
regexMatchAny(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.)){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', txt2)
['10.1.23.169', '203.245.25.67']