# Tags (from the original blog post): down pes maxent rate conversion pch imp pen entity
import nltk
def ie_proprocess(document):
    """Split raw text into sentences, tokenize each sentence and POS-tag it.

    Args:
        document: Raw text handed to ``nltk.sent_tokenize``.

    Returns:
        A list with one entry per sentence; each entry is the
        ``nltk.pos_tag`` result for that sentence's word tokens.
    """
    sentences = nltk.sent_tokenize(document)  # sentence segmenter
    tokenized = [nltk.word_tokenize(sent) for sent in sentences]  # word tokenizer
    # The original computed the tagged sentences and then dropped them;
    # returning them makes the function usable by callers.
    return [nltk.pos_tag(tokens) for tokens in tokenized]  # POS tagger
# --- Example 1: a single-rule NP chunker --------------------------------
sentence = [("the","DT"),("little","JJ"),("yellow","JJ"),("dog","NN"),("barked","VBD"),("at","IN"),("the","DT"),("cat","NN")]
# Each <...> in the grammar holds one tag pattern.
grammer = "NP:{<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)
result = cp.parse(sentence)
# Chunk tree recorded in the original notes for this parse:
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))

# --- Example 2: two rules in one NP stage -------------------------------
# Rule 1: optional determiner or possessive pronoun, adjectives, then a noun.
# Rule 2: one or more proper nouns. The tag was misspelled "NPP" in the
# original, which can never match the "NNP" tag used in the sentence below.
# (The text after '#' inside the string is a grammar comment kept verbatim.)
grammer = r'''NP:{<DT|PP\$>?<JJ>*<NN>} #匹配一个可选的限定词或所有格代名词
{<NNP>+} '''
cp = nltk.RegexpParser(grammer)
sentence = [("Rapunzel","NNP"),("let","VBD"),("down", "RP"),("her","PP$"),("long","JJ"),("golden","JJ"),("hair","NN")]

# --- Example 3: chunk matches do not overlap ----------------------------
nouns = [("money","NN"),("market","NN"),("fund","NN")]
# A rule for two consecutive nouns applied to three consecutive nouns only
# chunks the first two.
grammar = "NP: {<NN><NN>}"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))  # (S (NP money/NN market/NN) fund/NN)
# Search the tagged Brown corpus for "verb to verb" sequences and print each
# matching CHUNK subtree.
cp = nltk.RegexpParser('CHUNK : {<V.*><TO><V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            # The original `if` had no body; printing the subtree matches the
            # sample output below and the find_chunks helper in this file.
            print(subtree)
# Sample output lines recorded in the original notes:
# (CHUNK combined/VBN to/TO achieve/VB)
# (CHUNK continue/VB to/TO place/VB)
def find_chunks(chunk):
    """Print every ``CHUNK`` subtree that grammar ``chunk`` finds in Brown.

    Args:
        chunk: Grammar string for ``nltk.RegexpParser``, for example
            ``'CHUNK : {<V.*><TO><V.*>}'``. Only subtrees labelled
            ``CHUNK`` are printed.
    """
    cp = nltk.RegexpParser(chunk)
    for sent in nltk.corpus.brown.tagged_sents():
        for subtree in cp.parse(sent).subtrees():
            if subtree.label() == 'CHUNK':
                print(subtree)
# NOTE(review): the rule lines of this grammar and its closing quotes were
# missing from the original notes, leaving the string literal unterminated.
# The "chunk everything, then chink VBD/IN" rules below are reconstructed
# from the NLTK book's chinking example and agree with the expected parse
# listed underneath -- confirm against the intended source.
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
# Chunk tree recorded in the original notes for cp.parse(sentence):
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))
from nltk.corpus import conll2000
# Chunk tree recorded in the original notes, presumably sentence 99 of
# 'train.txt' with every chunk type kept:
# (S
#   (PP Over/IN)
#   (NP a/DT cup/NN)
#   (PP of/IN)
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   (VP told/VBD)
#   (NP his/PRP$ story/NN)
#   ./.)
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])  # keep NP chunks only
# Output recorded for the NP-only view:
# (S
#   Over/IN
#   (NP a/DT cup/NN)
#   of/IN
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   told/VBD
#   (NP his/PRP$ story/NN)
#   ./.)
# Baseline: an empty grammar creates no chunks at all.
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
# Scores recorded in the original notes for this baseline:
# ChunkParse score:
#     IOB Accuracy:  43.4%
#     Precision:      0.0%
#     Recall:         0.0%
#     F-Measure:      0.0%

# Naive regexp chunker: any run of tags starting with C, D, J, N or P.
grammer = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammer)
# NOTE(review): unlike the baseline above, this loads the test set without
# chunk_types=['NP'], so the gold data keeps every chunk type. The scores
# below were recorded with this setting -- confirm it is intentional.
test_sents = conll2000.chunked_sents('test.txt')
# ChunkParse score:
#     IOB Accuracy:  62.5%
#     Precision:     70.6%
#     Recall:        38.5%
#     F-Measure:     49.8%
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that picks a chunk tag for each POS tag with a unigram tagger."""

    def __init__(self, train_sents):
        """Train the underlying ``nltk.UnigramTagger``.

        Args:
            train_sents: Chunk trees; each is converted with
                ``nltk.chunk.tree2conlltags`` and reduced to
                ``(pos, chunktag)`` pairs for training.
        """
        train_data = [
            [(pos, chunktag) for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted ``(word, pos, chunktag)`` triples.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger treats POS tags as its "words" and chunk tags as its "tags".
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_tags)]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree
# Train the unigram chunker on the NP-only view of CoNLL-2000.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
# Scores recorded in the original notes for the unigram chunker:
# ChunkParse score:
#     IOB Accuracy:  92.9%
#     Precision:     79.9%
#     Recall:        86.8%
#     F-Measure:     83.2%

# Every distinct POS tag that occurs in the training trees, sorted.
postags = sorted({pos for sent in train_sents for (_word, pos) in sent.leaves()})
# Value of postags recorded in the notes:
# ['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW',
#  'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS',
#  'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD',
#  'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
# Chunk tag per POS tag recorded in the notes (presumably the result of
# unigram_chunker.tagger.tag(postags)):
# [('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'),
#  (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ...]
class BigramChunker(nltk.ChunkParserI):
    """Chunker that picks a chunk tag for each POS tag with a bigram tagger."""

    def __init__(self, train_sents):
        """Train the underlying ``nltk.BigramTagger``.

        Args:
            train_sents: Chunk trees; each is converted with
                ``nltk.chunk.tree2conlltags`` and reduced to
                ``(pos, chunktag)`` pairs for training.
        """
        train_data = [
            [(pos, chunktag) for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted ``(word, pos, chunktag)`` triples.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger treats POS tags as its "words" and chunk tags as its "tags".
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_tags)]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree
bigram_chunker = BigramChunker(train_sents)
# Scores recorded in the original notes for the bigram chunker:
# ChunkParse score:
#     IOB Accuracy:  93.3%
#     Precision:     82.3%
#     Recall:        86.8%
#     F-Measure:     84.5%
# Feature extractor for the maximum-entropy (maxent) chunker.
def npchunk_features(sentence, i, history):
    """Return the feature dict for token ``i`` of ``sentence``.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Index of the token to describe.
        history: Chunk tags already assigned to earlier tokens; accepted for
            the tagger's calling convention but not used here.

    Returns:
        ``{"pos": pos}`` -- only the POS tag of the current token.
    """
    _word, pos = sentence[i]
    return {"pos": pos}
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Left-to-right chunk tagger backed by a maximum-entropy classifier."""

    def __init__(self, train_sents):
        """Build the training set and train the classifier.

        Args:
            train_sents: Sentences given as sequences of
                ``((word, pos), chunktag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # NOTE(review): algorithm='megam' presumably needs the external MEGAM
        # tool to be configured for NLTK -- confirm it is available.
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        """Assign a chunk tag to every token of ``sentence``.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            A list of ``((word, pos), chunktag)`` pairs. A list is returned
            rather than a bare ``zip`` object so the result can be indexed
            and iterated more than once.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that delegates tagging to ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Convert the training trees and train the tagger.

        Args:
            train_sents: Chunk trees; each is converted with
                ``nltk.chunk.tree2conlltags`` and regrouped into
                ``((word, pos), chunktag)`` pairs.
        """
        tagged_sents = [
            [((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        tagged_sent = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sent]
        return nltk.chunk.conlltags2tree(conlltags)
# chunker = ConsecutiveNPChunker(train_sents)
# print(chunker.evaluate(test_sents))

# Cascaded chunker: later stages may use chunks built by earlier ones.
# NOTE(review): the original notes never closed this string literal and had
# no CLAUSE rule even though the VP rule refers to CLAUSE. The CLAUSE rule
# and closing quotes are reconstructed from the NLTK book's cascaded-chunker
# example -- confirm against the intended source.
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
sentence = [("Mary","NN"), ("saw","VBD"),("the","DT"),("cat","NN"),("sit","VB"),("on","IN"),("the","DT"),("mat","NN")]
# Result recorded in the original notes for a single pass (the parenthesis
# count suggests a line was lost when the notes were pasted):
# (S
#   (NP Mary/NN)
#   saw/VBD                  <- the VP headed by "saw" is not recognised
#   (NP the/DT cat/NN)
#   (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))

# loop=2 runs the stages twice, giving the VP rule a second chance.
cp = nltk.RegexpParser(grammar, loop=2)
# Result recorded in the original notes with loop=2 (same caveat as above):
# (S
#   (NP Mary/NN)
#   (VP
#     saw/VBD
#     (NP the/DT cat/NN)
#     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
# Build a small parse tree by hand: (S (NP Alice) (VP chased (NP the rabbit))).
tree1 = nltk.Tree('NP', ['Alice'])
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
# print(tree4[1])
def traverse(t):
    """Print ``t`` in bracketed form, recursing through its children.

    Args:
        t: A tree node (an object with a ``label()`` method whose iteration
            yields its children) or a leaf. Anything without ``label`` is
            treated as a leaf and printed as-is.
    """
    try:
        t.label()
    except AttributeError:
        # No label() method, so this is a leaf.
        print(t, end="")
    else:
        # Now we know that t.label() is defined, i.e. t is a tree node.
        print("(", t.label(), end='')
        for child in t:
            traverse(child)
        print(")", end='')
import nltk
# nltk.Tree's constructor takes a label and a list of children, so a
# bracketed string has to go through Tree.fromstring. The original also
# called nltk.Tree(...) with the string alone (the pre-3.0 API) immediately
# before this line; that call is dropped.
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
sent = nltk.corpus.treebank.tagged_sents()[22]
import re
# Matches text containing the word "in" that is not later followed by a
# word ending in "ing" (negative lookahead).
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        # The original loop had no body; printing each relation tuple follows
        # the NLTK book example this snippet is based on.
        print(nltk.sem.rtuple(rel))
from nltk.corpus import conll2002
# Verbose regex: one of four Dutch verb forms, then anything, then "van".
# NOTE(review): the grouping parentheses and the closing quotes were missing
# from the original notes. Without the group the alternation would also
# swallow ".* van/Prep" into its last branch. Restored per the NLTK book
# example -- confirm against the intended source.
vnv = """
(
is/V| #3rdsing present and
was/V| #past forms of the verb zijn (‘be‘)
werd/V| #and also present
wordt/V #pastof worden(‘become‘)
)
.* #followed byanything
van/Prep #followed byvan(‘of‘)
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    # corpus must name a corpus extract_rels knows; the original value
    # 'con112002' (digit ones instead of the letters "ll") was a typo.
    for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        print(nltk.sem.clause(r, relsym="VAN"))
