import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
# Load the IMDB sentiment dataset through torchtext; the library fetches the
# data itself, so nothing has to be downloaded by hand.
# NOTE(review): `data.Field`, `data.LabelField` and `datasets.IMDB.splits`
# belong to the classic torchtext API; later releases moved it under
# `torchtext.legacy` and then removed it -- confirm the installed version.

# Set up fields.
# The two Field objects define how each column is processed
# (the text column and the label column).
TEXT = data.Field(tokenize='spacy')  # tokenize the text with spaCy
LABEL = data.LabelField(dtype=torch.float)

# Make splits for data.
# IMDB holds 50000 movie reviews in two classes (positive and negative).
# The raw data is processed by the fields above and split, as
# (TEXT, LABEL) pairs, into a training set and a test set.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print('len of train data:', len(train_data))  # 25000
print('len of test data:', len(test_data))  # 25000

# Each entry of `examples` is a torchtext.data.Example: one sample, i.e. the
# data together with its label.
print(train_data.examples[15].text)  # text: the review as a list of tokens
print(train_data.examples[15].label)  # label: positive
# Sample output recorded by the original author:
# ['The', 'movie', 'is', 'a', 'bit', '"', 'thin', '"', 'after', 'reading',
#  'the', 'book', ',', 'but', 'it', "'s", 'still', 'one', 'of', 'the',
#  'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is',
#  'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '"', 'vonneguty',
#  '"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings',
#  'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films',
#  'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as',
#  'Mother', 'Night', 'does', '.']
# pos

# Build the vocabulary.
# With vectors='glove.6B.100d', glove.6B.zip is downloaded by default and
# unpacked into four files: glove.6B.50d.txt, glove.6B.100d.txt,
# glove.6B.200d.txt and glove.6B.300d.txt.
# To avoid the download, glove.6B.zip or glove.6B.100d.txt can be put in place
# beforehand. The original note says "the current folder"; presumably
# torchtext actually looks in its vector cache directory -- verify.
# The string form is shorthand for vectors=GloVe(name='6B', dim=100).
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
print(len(TEXT.vocab))  # 10002: the 10000 kept tokens plus '<unk>' and '<pad>'
print(TEXT.vocab.itos[:12])  # ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']
print(TEXT.vocab.stoi['and'])  # 5
print(LABEL.vocab.stoi)  # defaultdict(None, {'neg': 0, 'pos': 1})
