# Tags: down, pes, maxent, rate, conversion, pch, imp, pen, entity
import nltk
def ie_proprocess(document):
    """Run the basic information-extraction preprocessing pipeline.

    The raw text is split into sentences, every sentence is split into
    word tokens, and every token list is part-of-speech tagged.

    Args:
        document: Raw text to preprocess.

    Returns:
        A list with one entry per sentence; each entry is whatever
        ``nltk.pos_tag`` returns for that sentence's tokens.
    """
    sentences = nltk.sent_tokenize(document)  # sentence segmenter
    tokenized = [nltk.word_tokenize(sent) for sent in sentences]  # word tokenizer
    # The tagged sentences used to be computed and then thrown away;
    # returning them makes the pipeline usable by callers.
    return [nltk.pos_tag(tokens) for tokens in tokenized]  # POS tagger
# Example of a regular-expression based NP chunker.
sentence = [
    ("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
    ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN"),
]
# Angle brackets mark the boundaries of each tag pattern. The rule reads:
# an optional determiner, any number of adjectives, then a noun.
grammer = "NP:{<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)
result = cp.parse(sentence)
print(result)
result.draw()
# Printed result:
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))
# Chunking with regular expressions: an NP grammar made of two rules.
# The second rule must use the proper-noun tag NNP (the original typo "NPP"
# never matched, so "Rapunzel/NNP" was left unchunked).
grammer = r'''
NP: {<DT|PP\$>?<JJ>*<NN>}   # optional determiner or possessive pronoun, adjectives, then a noun
    {<NNP>+}                # one or more proper nouns
'''
cp = nltk.RegexpParser(grammer)
sentence = [
    ("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"),
    ("long", "JJ"), ("golden", "JJ"), ("hair", "NN"),
]
print(cp.parse(sentence))
sentence = [
    ("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"),
    ("long", "JJ"), ("golden", "JJ"), ("hair", "NN"),
]
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
# A rule that matches two consecutive nouns, applied to a text with three
# consecutive nouns, chunks only the first two.
grammar = "NP: {<NN><NN>}"
cp = nltk.RegexpParser(grammar)
# Prints: (S (NP money/NN market/NN) fund/NN)
print(cp.parse(nouns))
# Exploring text corpora:
# a chunker can extract phrases matching a particular sequence of
# part-of-speech tags from a tagged corpus.
cp = nltk.RegexpParser('CHUNK : {<V.*><TO><V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)
# Sample output (previously pasted as bare text, which is not valid Python):
# (CHUNK combined / VBN to / TO achieve / VB)
# (CHUNK continue / VB to / TO place / VB)
def find_chunks(chunk, label='CHUNK'):
    """Print every matching chunk found in the tagged Brown corpus.

    Args:
        chunk: Chunk grammar passed to ``nltk.RegexpParser``, for example
            ``'CHUNK : {<V.*><TO><V.*>}'``.
        label: Label of the subtrees to print. Defaults to ``'CHUNK'``,
            which is the only label the function used to recognise, so
            existing calls behave as before.
    """
    cp = nltk.RegexpParser(chunk)
    brown = nltk.corpus.brown
    for sent in brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == label:
                print(subtree)
# Chinking: the first rule chunks every token, the second rule ("}...{")
# then removes runs of VBD/IN tokens from inside those chunks.
grammar = r"""
NP:
{<.*>+}
}<VBD|IN>+{"""
sentence = [
    ("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
    ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN"),
]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
# Printed result:
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))
from nltk.corpus import conll2000
# Sentence 99 of the CoNLL-2000 training data with every chunk type.
print(conll2000.chunked_sents('train.txt')[99])
# (S
#   (PP Over/IN)
#   (NP a/DT cup/NN)
#   (PP of/IN)
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   (VP told/VBD)
#   (NP his/PRP$ story/NN)
#   ./.)

# The corpus contains three chunk types: NP, VP and PP chunks.
# chunk_types selects only the NP chunks.
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
# (S
#   Over/IN
#   (NP a/DT cup/NN)
#   of/IN
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   told/VBD
#   (NP his/PRP$ story/NN)
#   ./.)
# Baseline: an empty grammar creates no chunks at all.
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
# Recorded output (previously pasted as bare text, which is not valid Python):
# ChunkParse score:
#     IOB Accuracy: 43.4%%
#     Precision: 0.0%%
#     Recall: 0.0%%
#     F-Measure: 0.0%%
# Naive regex chunker: chunk every run of tags starting with C, D, J, N or P.
grammer = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammer)
# The grammar only produces NP chunks, so it has to be scored against an
# NP-only gold standard. Loading the test set without chunk_types also puts
# VP and PP chunks into the gold data, which this grammar can never find.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
# Output recorded BEFORE the chunk_types fix, i.e. scored against all chunk
# types. NOTE(review): recall and IOB accuracy should come out higher with
# the NP-only gold standard; re-run to refresh these figures.
# ChunkParse score:
#     IOB Accuracy: 62.5%%
#     Precision: 70.6%%
#     Recall: 38.5%%
#     F-Measure: 49.8%%
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that assigns an IOB chunk tag to each POS tag with a unigram tagger."""

    def __init__(self, train_sents):
        """Train the unigram tagger on (pos_tag, chunk_tag) pairs.

        Args:
            train_sents: Chunk trees; each one is flattened with
                ``nltk.chunk.tree2conlltags`` and the words are dropped.
        """
        training_pairs = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            training_pairs.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.UnigramTagger(training_pairs)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        pos_sequence = [pos for (_word, pos) in sentence]
        iob_tags = [iob for (_pos, iob) in self.tagger.tag(pos_sequence)]
        # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
        conlltags = [
            (word, pos, iob)
            for (word, pos), iob in zip(sentence, iob_tags)
        ]
        # Convert the (word, pos, chunk_tag) triples into a chunk tree.
        return nltk.chunk.conlltags2tree(conlltags)
# Train and evaluate the unigram chunker on the NP chunks of CoNLL-2000.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
# ChunkParse score:
#     IOB Accuracy: 92.9%%
#     Precision: 79.9%%
#     Recall: 86.8%%
#     F-Measure: 83.2%%

# Show which chunk tag the unigram tagger assigns to every POS tag seen in training.
postags = sorted({pos for sent in train_sents for (word, pos) in sent.leaves()})
print(postags)
# ['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
print(unigram_chunker.tagger.tag(postags))
# [('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'),...]
# Build a bigram chunker.
class BigramChunker(nltk.ChunkParserI):
    """Chunker that assigns IOB chunk tags to POS tags with a bigram tagger."""

    def __init__(self, train_sents):
        """Train the bigram tagger on (pos_tag, chunk_tag) pairs.

        Args:
            train_sents: Chunk trees; each one is flattened with
                ``nltk.chunk.tree2conlltags`` and the words are dropped.
        """
        training_pairs = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            training_pairs.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.BigramTagger(training_pairs)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        pos_sequence = [pos for (_word, pos) in sentence]
        iob_tags = [iob for (_pos, iob) in self.tagger.tag(pos_sequence)]
        # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
        conlltags = [
            (word, pos, iob)
            for (word, pos), iob in zip(sentence, iob_tags)
        ]
        # Convert the (word, pos, chunk_tag) triples into a chunk tree.
        return nltk.chunk.conlltags2tree(conlltags)
# Train and evaluate the bigram chunker on the same train/test split.
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
# Recorded output (the header line was previously bare text, which is not valid Python):
# ChunkParse score:
#     IOB Accuracy: 93.3%%
#     Precision: 82.3%%
#     Recall: 86.8%%
#     F-Measure: 84.5%%
# Training a classifier-based chunker:
# noun phrases are chunked with a consecutive classifier.
# maxent = maximum entropy
def npchunk_features(sentence, i, history):
    """Return the feature set describing the token at position ``i``.

    Only the part-of-speech tag of the current token is provided.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Index of the token to describe.
        history: Accepted so the signature matches how the function is
            called by the tagger; it is not used by this feature set.

    Returns:
        A dict with the single key ``"pos"`` holding the token's POS tag.
    """
    _word, pos = sentence[i]
    return {"pos": pos}
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Tagger that assigns a chunk tag to each token with a maxent classifier."""

    def __init__(self, train_sents):
        """Train the maximum-entropy classifier.

        Args:
            train_sents: Sentences given as lists of ``(token, chunk_tag)``
                pairs, where ``token`` is itself a ``(word, pos)`` pair
                (the shape ``ConsecutiveNPChunker`` builds).
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # NOTE(review): algorithm='megam' presumably needs the external megam
        # binary to be installed and configured for NLTK - confirm in the
        # target environment.
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0
        )  # maximum entropy

    def tag(self, sentence):
        """Tag every token of ``sentence`` with a predicted chunk tag.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            A list of ``(token, chunk_tag)`` pairs. A list is returned
            instead of a bare ``zip`` object so the result can be indexed
            and iterated more than once.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Convert chunk trees to tagger training data and train the tagger.

        Args:
            train_sents: Chunk trees; each is flattened into
                ``((word, pos), chunk_tag)`` pairs.
        """
        tagged_sents = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            tagged_sents.append([((w, t), c) for (w, t, c) in triples])
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence and return the resulting tree.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.
        """
        conlltags = [(w, t, c) for ((w, t), c) in self.tagger.tag(sentence)]
        return nltk.chunk.conlltags2tree(conlltags)
# chunker = ConsecutiveNPChunker(train_sents)
# print(chunker.evaluate(test_sents))
# Sometimes part-of-speech tags alone are not enough to decide how a
# sentence should be chunked; this grammar stacks four chunking stages.
grammar = r"""
NP: {<DT|JJ|NN.*>+}
PP: {<IN><NP>}
VP: {<VB.*><NP|PP|CLAUSE>+$}
CLAUSE: {<NP><VP>}
"""
cp = nltk.RegexpParser(grammar)
sentence = [
    ("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN"),
]
print(cp.parse(sentence))
# Single pass over the stages:
# (S
#   (NP Mary/NN)
#   saw/VBD                      <- the VP starting at "saw" is not recognised
#   (CLAUSE
#     (NP the/DT cat/NN)
#     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))

# Same grammar, but with the stages applied twice (loop=2).
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
# (S
#   (CLAUSE
#     (NP Mary/NN)
#     (VP
#       saw/VBD
#       (CLAUSE
#         (NP the/DT cat/NN)
#         (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
# Build a small parse tree by hand: each Tree takes a label and a child list.
tree1 = nltk.Tree('NP', ['Alice'])
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
# print(tree4[1])
tree4.draw()
def traverse(t):
    """Print a tree depth-first in bracketed form, without a trailing newline.

    A node is anything with a ``label()`` method that can be iterated to get
    its children; anything else is treated as a leaf and printed as-is.

    Args:
        t: Tree node or leaf to print.
    """
    try:
        label = t.label()
    except AttributeError:
        # No label() method: t is a leaf token.
        print(t, end="")
        return
    # t has a label, so it is a subtree: print it and recurse into children.
    print("(", label, end='')
    for child in t:
        traverse(child)
    print(")", end='')
import nltk
# The Tree constructor expects a label plus a child list; passing only a
# bracketed string raises TypeError, so the string is parsed with
# Tree.fromstring() instead.
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)
# Named-entity recognition on one tagged sentence from the treebank sample.
sent = nltk.corpus.treebank.tagged_sents()[22]
# With binary=True named entities are only labelled NE; otherwise the
# classifier also adds a category label to each entity.
print(nltk.ne_chunk(sent, binary=True))
print(nltk.ne_chunk(sent))
import re
# Relation extraction: look for ORG ... "in" ... LOC patterns in the ieer corpus.
# The negative lookahead rejects matches where the word "in" is later
# followed by text ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))
from nltk.corpus import conll2002
# Verbose regular expression for Dutch "<verb> ... van" relations; the text
# after each "#" inside the pattern is a regex comment (re.VERBOSE).
vnv = """
(
is/V| #3rdsing present and
was/V| #past forms of the verb zijn (‘be‘)
werd/V| #and also present
wordt/V #pastof worden(‘become‘)
)
.* #followed byanything
van/Prep #followed byvan(‘of‘)
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    # The corpus name must be 'conll2002' (letters "ll"); the original
    # 'con112002' (digits "11") is not a corpus name extract_rels accepts.
    for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        print(nltk.sem.clause(r, relsym="VAN"))
# Tags: down, pes, maxent, rate, conversion, pch, imp, pen, entity
# Original article: https://www.cnblogs.com/nxf-rabbit75/p/9565176.html