python regex

时间：2018-05-13 23:02:01 阅读：356 评论：0 收藏：0 [点我收藏+]

标签：coop

正则--> why what how

why :

从大量文本中查找规则字符串，比字符串各种查找都迅速，利用c语言的匹配引擎，广泛应用于各种搜索，查找，爬虫

what :

正则 - - > 代数，变量替换（用一些规定好的符号去匹配有规则的文本）

在线工具：

https://www.regexpal.com/

http://tool.oschina.net/regex

语法说明：

re模块使用说明：

正则语法： 1.表达式 2.转义符号

模块内容： 1.函数2. 常量 3.异常

两个对象 1.re对象：regex 2.match对象：match（match object）

import re

import sys
sys.argv?

# 函数
# re.compile(pattern, flags=0)  --regex   返回的对象是regex
# re.search(pattern, string, flags=0)  --match    返回的对象是match
# re.match(pattern,string, flags=0) --match
# re.fullmatch(pattern, string, flags=0)  --match
# re.findall(pattern, string, flags=0)  --list
# re.finditer(pattern, string, flags=0)  --iterator
# re.sub(pattern, repl, string, count=0, flags=0)  --str
# re.subn(pattern, repl, string, count=0, flags=0) --tuple
# re.escape(pattern)  --characters
# re.purge()

# 常量
# re.A/re.ASCII让\w,\W,\b,\B,\d,\D,\s和\S只执行ASCII匹配
# re.DEBUG 显示debug信息
# re.I  ignore忽略大小写
# re.L  locale 表示特殊字符集  \w\W \b,\B,\s,\S依赖于当前环境
# re.M  multiline 多行模式
# re.S  dotall 即为 . 匹配包括换行符在内的任意字符
# re.X  verbose  为了增加可读性，忽略空格和‘#’后面的注释

# import re 
# re.compile() 创建一个正则对象regex，赋给一个变量后可多次使用
# regex = re.compile(pattern) # 使用regex对象，推荐，应用更灵活
# result = regex.match(string)
# ==
# match = re.match(pattern, string)
# 3, 使用regex查找一个字符串，返回被匹配的对象
# 4， 调用匹配对象的group方法，返回实际匹配的文本

import re

# 使用match对象
re.search(‘coop‘,‘coop is a hero‘)

<_sre.SRE_Match object; span=(0, 4), match=‘coop‘>

match = re.search(‘coop‘,‘who is coop?‘)

print(match.group())

coop

match

<_sre.SRE_Match object; span=(7, 11), match=‘coop‘>

match = re.search(‘coop‘,‘how are you‘)

print(match.group())

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-26-75394e24da25> in <module>()
----> 1 print(match.group())

AttributeError: ‘NoneType‘ object has no attribute ‘group‘

import re
s = """
1234-1243-443
110-1231-1234
coop@126.com
110@qq.com
coop@qq.com
html.lee@google.com
http://baidu.com
https://github.com
http://taobao.com
"""
#targ = ‘\d{4}‘
targ = ‘\d{3,4}-\d{4}-\d{3,4}‘  # 如何匹配到前两行号码？
targ = ‘\d+-\d+-\d+‘
match = re.search(targ,s)
if match:
    print(‘here:‘,match.group())
else:
    print(‘there in no such character‘)

here: 1234-1243-443

aa = re.match(‘\d{3}‘,‘coop123‘)

print(aa)

None

aa = re.match(‘\d{3}‘,‘12coop‘)

print(aa)

None

aa = re.match(‘\d{3}‘,‘123456coop‘)

print(‘here:‘,aa.group())  # match 只匹配从头开始的字符，如果没有就不匹配

here: 123

re.match(‘f‘,‘fffffather‘)

<_sre.SRE_Match object; span=(0, 1), match=‘f‘>

‘\n1234-1243-443\n110-1231-1234\ncoop@126.com\n110@qq.com\ncoop@qq.com\nhtml.lee@google.com\nhttp://baidu.com\nhttps://github.com\nhttp://taobao.com\n‘

# 使用compile生成regex对象
regex = re.compile(‘\n1234‘)  # 先生成正则表达式，然后下面直接调用该方法
match  = regex.match(s)

print(match)
if match:
    print(‘here:‘,match.group(),end=‘‘)  # \n1234  \n的作用
else:
    print(‘sorry ,there is no such character‘)

<_sre.SRE_Match object; span=(0, 5), match=‘\n1234‘>
here: 
1234

# 使用compile生成regex对象
regex = re.compile(‘1234‘)  # 先生成正则表达式，然后下面直接调用该方法
match  = regex.match(s)

print(match)
if match:
    print(‘here:‘,match.group(),end=‘‘)  # \n1234  \n的作用
else:
    print(‘sorry ,there is no such character‘)

None
sorry ,there is no such character

print(s)

1234-1243-443
110-1231-1234
coop@126.com
110@qq.com
coop@qq.com
html.lee@google.com
http://baidu.com
https://github.com
http://taobao.com

# 使用compile生成regex对象
regex = re.compile(‘1234‘)  # 先生成正则表达式，然后下面直接调用该方法
match  = regex.match(s)

print(match)
if match:
    print(‘here:‘,match.group(),end=‘‘)  # \n1234  \n的作用
else:
    print(‘sorry ,there is no such character‘)

None
sorry ,there is no such character

正则表达式的分组匹配

 # （） ：括号分组
# | ：管道符号匹配多个分组
# ？ ：选择出现0次或1次
# re.X ：换行，注释

findall 返回列表

有分组，返回元组列表（只有一个分组时，返回该分组内容的字符串列表）

无分组，返回匹配到的字符串列表

regex = re.compile(r‘coop‘)  # r raw  ,原始的

regex.findall

<function SRE_Pattern.findall(string=None, pos=0, endpos=9223372036854775807, *, source=None)>

regex.findall(‘adsfhasdlfcoop124235sdfcoop‘)

[‘coop‘, ‘coop‘]

##############################

regex = re.compile(r‘(coop)‘)  # () 分组
regex.findall(‘adsfhasdlfcoop124235sdfcoop‘)

[‘coop‘, ‘coop‘]

regex.split(‘adsfhasdlfcoop124235sdfcoop‘)

[‘adsfhasdlf‘, ‘coop‘, ‘124235sdf‘, ‘coop‘, ‘‘]

###########################

regex = re.compile(r‘coop‘)
regex.split(‘adsfhasdlfcoop124235sdfcoop‘)  #返回的列表，最后有一个空的

[‘adsfhasdlf‘, ‘124235sdf‘, ‘‘]

regex = re.compile(r‘coop‘)
regex.split(‘adsfhasdlfcoop124235sdfcoop‘,maxsplit=1)

[‘adsfhasdlf‘, ‘124235sdfcoop‘]

regex = re.compile(r‘coop‘)
regex.split(‘adsfhasdlfcoop124235sdfcoopsdfasd‘)  # 更改后，空内容消失

[‘adsfhasdlf‘, ‘124235sdf‘, ‘sdfasd‘]

############################  提取IP地址的正则表达式

regex = re.compile(r"((2[0-4]\d|25[0-5]|[0-1]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[0-1]?\d\d?)")
re_ip = regex.match(‘192.168.1.1‘)
re_ip = re_ip.group()

print(re_ip)

192.168.1.1

##############################

regex = re.compile(r"""((2[0-4]\d|25[0-5]|[0-1]?\d\d?)\.)  # ip的第一组数字，包含后面的点
                   {3}                                  # 表示三组数字
                   (2[0-4]\d|25[0-5]|[0-1]?\d\d?)"""    #  最后一组数字
                   ,re.X)  #  正则可以换行，可以注释
re_ip = regex.match(‘192.168.1.1‘)
print(re_ip.group())

192.168.1.1

regex = re.compile(r"""((2[0-4]\d|25[0-5]|[0-1]?\d\d?)\.)  # ip的第一组数字，包含后面的点
                   {3}                                  # 表示三组数字
                   (2[0-4]\d|25[0-5]|[0-1]?\d\d?)"""    #  最后一组数字
                   ,re.X)  #  正则可以换行，可以注释
re_ip = regex.match(‘192.168.1.1‘)
print(re_ip)

<_sre.SRE_Match object; span=(0, 11), match=‘192.168.1.1‘>

import re
regex = re.compile(r"""((2[0-4]\d|25[0-5]|[0-1]?\d?\d?)\.)  # ip的第一组数字，包含后面的点
                   {3}                                  # 表示三组数字
                   (2[0-4]\d|25[0-5]|[0-1]?\d?\d?)"""    #  最后一组数字
                   ,re.X)  #  正则可以换行，可以注释
re_ip = regex.match(‘192.168.1.1‘)
print(re_ip)

<_sre.SRE_Match object; span=(0, 11), match=‘192.168.1.1‘>

import re
regex = re.compile(r"""((2[0-4]\d|25[0-5]|[0-1]?\d?\d)\.)  # ip的第一组数字，包含后面的点
                   {3}                                  # 表示三组数字
                   (2[0-4]\d|25[0-5]|[0-1]?\d?\d)"""    #  最后一组数字
                   ,re.X)  #  正则可以换行，可以注释
re_ip = regex.match(‘192.168.1.1‘)
print(re_ip)

<_sre.SRE_Match object; span=(0, 11), match=‘192.168.1.1‘>

贪婪匹配与不贪婪匹配

{m,n}? 对于前一个字符重复m到n次，并且取尽可能少的情况，

如字符串‘aaaaaa’，a{2,4}会匹配4个a，但a{2,4}?只会匹配2个a

.* 与 ? 的搭配，默认贪婪匹配。? 给加上限制，表示非贪婪匹配，? 把匹配到的字符限制到最少

r‘‘‘(.*)‘‘‘ 与 r‘‘‘(.*?)‘‘‘

"""

*? 重复任意次，但尽可能少的重复。
+? 重复1次或更多次，但尽可能少的重复。
?? 重复0次或1次，但尽可能少的重复。
{n,}? 重复n次以上，但尽可能少的重复。

"""

# 匹配双引号及其中的内容，分组只捕获引号内的文本（findall 的结果不含引号）
re_quoto = re.compile(r‘"(.*)"‘)
text1 = ‘Computer says "no."‘
find1 = re_quoto.findall(text1)
print(find1)
text2 = ‘Computer says "no",Phone says "yes." ‘
find2 = re_quoto.findall(text2)
print(find2)

[‘no.‘]
[‘no",Phone says "yes.‘]

re_quoto = re.compile(r‘"(.*?)"‘)
text1 = ‘Computer says "no."‘
find1 = re_quoto.findall(text1)
print(find1)

text2 = ‘Computer says "no",Phone says "yes,"‘  # 非贪婪匹配，匹配的是"no",和"yes",有双引号
find2 = re_quoto.findall(text2)
print(find2)

[‘no.‘]
[‘no‘, ‘yes,‘]

re_quoto = re.compile(r‘"(.*?)"‘)
text1 = ‘Computer says "no."‘
find1 = re_quoto.findall(text1)
print(find1)

text2 = ‘Computer says "no,Phone says "yes,"‘  # 非贪婪匹配，匹配的是"no",和"yes",有双引号
find2 = re_quoto.findall(text2)
print(find2)

[‘no.‘]
[‘no,Phone says ‘]

技术分享图片

python regex

标签：coop

原文地址：http://blog.51cto.com/13118411/2115728

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

python regex

why :

从大量文本中查找规则字符串，比字符串各种查找都迅速，利用c语言的匹配引擎，广泛应用于各种搜索，查找，爬虫

what :

正则 - - > 代数，变量替换（用一些规定好的符号去匹配有规则的文本）

在线工具：

https://www.regexpal.com/

http://tool.oschina.net/regex

语法说明：

re模块使用说明：

正则语法： 1.表达式 2.转义符号

模块内容： 1.函数2. 常量 3.异常

两个对象 1.re对象：regex 2.match对象：match（match object）

正则表达式的分组匹配

findall 返回列表

有分组，返回元组列表（只有一个分组时，返回该分组内容的字符串列表）

无分组，返回匹配到的字符串列表

贪婪匹配与不贪婪匹配

{m,n}? 对于前一个字符重复m到n次，并且取尽可能少的情况，

如字符串‘aaaaaa’，a{2,4}会匹配4个a，但a{2,4}?只会匹配2个a

.* 与 ? 的搭配，默认贪婪匹配。? 给加上限制，表示非贪婪匹配，? 把匹配到的字符限制到最少

r‘‘‘(.*)‘‘‘ 与 r‘‘‘(.*?)‘‘‘

"""

"""

.* 与 ? 的搭配，默认贪婪匹配。? 给加上限制，表示非贪婪匹配，? 把匹配到的字符限制到最少