PyTorch在NLP任务中使用预训练词向量

时间：2020-07-25 09:23:34 阅读：111 评论：0 收藏：0 [点我收藏+]

1. 例子

import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets

import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets

# Load the IMDB sentiment dataset through torchtext and attach pretrained
# GloVe vectors to the text vocabulary. torchtext fetches the dataset itself,
# so no manual dataset download is needed.
# NOTE(review): Field / LabelField / datasets.IMDB.splits look like the legacy
# torchtext API (moved under torchtext.legacy in later releases) -- confirm
# against the installed torchtext version.

# Set up fields: two Field objects define how each column is processed
# (the text field and the label field).
TEXT = data.Field(tokenize='spacy')  # tokenize the text with spaCy
LABEL = data.LabelField(dtype=torch.float)

# Make splits for data.
# IMDB has 50000 movie reviews in two classes (positive / negative). The data
# is processed by the fields above and split, as (TEXT, LABEL), into a
# training set and a test set.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print('len of train data:', len(train_data))        # 25000
print('len of test data:', len(test_data))          # 25000

# torchtext.data.Example represents one sample: data + label.
print(train_data.examples[15].text)                 # text: list of the sentence's tokens
print(train_data.examples[15].label)                # label: positive
# ['The', 'movie', 'is', 'a', 'bit', '"', 'thin', '"', 'after', 'reading',
#  'the', 'book', ',', 'but', 'it', "'s", 'still', 'one', 'of', 'the',
#  'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is',
#  'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '"', 'vonneguty', '"',
#  '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from',
#  'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage',
#  'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother',
#  'Night', 'does', '.']
# pos

# Build the vocabulary.
# With vectors given by name, glove.6B.zip is downloaded by default and
# unpacked into four files: glove.6B.50d.txt, glove.6B.100d.txt,
# glove.6B.200d.txt and glove.6B.300d.txt.
# To avoid the download, glove.6B.zip or glove.6B.100d.txt can be placed in
# the current folder beforehand.
# Equivalent form: TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)
print(len(TEXT.vocab))             # 10002
print(TEXT.vocab.itos[:12])        # ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']
print(TEXT.vocab.stoi['and'])      # 5
print(LABEL.vocab.stoi)            # defaultdict(None, {'neg': 0, 'pos': 1})

标签：样本 nump oca 任务数据词向量测试 elf cab

原文地址：https://www.cnblogs.com/douzujun/p/13375250.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行