To consolidate the use of word embeddings for text vectorization, this post works through a text sentiment classification case study in PyTorch. It uses the IMDB dataset, which contains 50,000 reviews of popular movies, split evenly into 25,000 training and 25,000 test examples. Each review carries a rating (reviews rated 4 or lower are negative, 7 or higher are positive; neutral reviews are excluded), and the review text is vectorized with word embeddings so that the model can predict its sentiment.
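For reference, the aclImdb archive unpacks into the layout below (only the directories used here are shown); each review is a separate .txt file whose name encodes a review id and its rating:

```
aclImdb/
├── train/
│   ├── pos/   # e.g. 0_9.txt  -> rating 9
│   └── neg/   # e.g. 0_3.txt  -> rating 3
└── test/
    ├── pos/
    └── neg/
```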
Text sentiment classification is framed here as a multi-class problem over the ten rating classes (it could also be treated as regression). The full pipeline is:

1. Build a Dataset and DataLoader that read the raw review files, tokenize them, and batch them.
2. Build a vocabulary (Word2Sequence) that maps words to indices and turns every review into a fixed-length index sequence.
3. Use a simple multi-layer perceptron (MLP) as the classifier, consisting of a word embedding layer and a fully connected layer.
4. Train the model and, finally, compute the average loss and accuracy on the test set to evaluate its performance.
First, the dataset and DataLoader (build_dataset.py, the module name that the later imports refer to):

```python
import os
import pickle
import re

import torch
from torch.utils.data import Dataset, DataLoader

data_base_path = r'../data/aclImdb/'

MAX_LEN = 200  # fixed sequence length fed to the model; the original post does not state a value, 200 is assumed

# ws is the fitted Word2Sequence vocabulary, built and pickled in the vocabulary step below
# (the save path is an assumption)
try:
    with open("./model/ws.pkl", "rb") as f:
        ws = pickle.load(f)
except FileNotFoundError:
    ws = None  # vocabulary not fitted yet


class ImdbDataset(Dataset):
    def __init__(self, mode):
        super().__init__()
        if mode == "train":
            text_path = ["train/neg", "train/pos"]
        else:
            text_path = ["test/neg", "test/pos"]
        self.total_file_path_list = []
        for path in text_path:
            full_path = os.path.join(data_base_path, path)
            self.total_file_path_list.extend(
                [os.path.join(full_path, file) for file in os.listdir(full_path)]
            )

    def __getitem__(self, item):
        cur_path = self.total_file_path_list[item]
        cur_filename = os.path.basename(cur_path)
        # file names look like id_rating.txt; ratings 1-10 become class labels 0-9
        label = int(cur_filename.split("_")[-1].split(".")[0]) - 1
        text = tokenize(open(cur_path, encoding="utf-8").read().strip())
        return label, text

    def __len__(self):
        return len(self.total_file_path_list)


def tokenize(text):
    filters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/',
               ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
               '{', '|', '}', '~', '\t', '\n', '\x97', '\x96', '”', '“']
    text = re.sub(r'<.*?>', ' ', text, flags=re.S)                     # drop HTML tags such as <br />
    text = re.sub('|'.join(re.escape(f) for f in filters), ' ', text)  # drop punctuation and control characters
    return [word.strip() for word in text.split()]


def collate_fn(batch):
    labels, texts = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.long)
    # each review becomes a fixed-length sequence of word indices via the fitted vocabulary
    texts = torch.tensor([ws.transform(text, max_len=MAX_LEN) for text in texts], dtype=torch.long)
    return labels, texts


def get_dataloader(mode, batch_size=32):
    dataset = ImdbDataset(mode)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
```
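As a quick sanity check (not part of the original post), a single raw sample can be inspected before any vocabulary work is done; the printed label is the rating minus one and the text is already tokenized:

```python
from build_dataset import ImdbDataset

dataset = ImdbDataset("train")
label, tokens = dataset[0]
print(label)        # e.g. 2 for a review file named 0_3.txt
print(tokens[:10])  # the first ten tokens of that review
```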
Next, the vocabulary class that maps words to indices and pads or truncates each review to a fixed length (stored as word_sequence.py here; the file name is an assumption used by the imports below):

```python
import numpy as np


class Word2Sequence:
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD}
        self.fited = False
        self.inversed_dict = {}

    def to_index(self, word):
        assert self.fited, "fit must be called first"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        assert self.fited, "fit must be called first"
        return self.inversed_dict.get(index, self.UNK_TAG)

    def __len__(self):
        return len(self.dict)

    def fit(self, sentences, min_count=1, max_count=None, max_feature=None):
        # count word frequencies over all tokenized sentences
        count = {}
        for sentence in sentences:
            for word in sentence:
                count[word] = count.get(word, 0) + 1
        # filter by frequency and optionally keep only the max_feature most frequent words
        if min_count is not None:
            count = {k: v for k, v in count.items() if v >= min_count}
        if max_count is not None:
            count = {k: v for k, v in count.items() if v <= max_count}
        if isinstance(max_feature, int):
            count = sorted(count.items(), key=lambda x: x[1])[-max_feature:]
        else:
            count = sorted(count.items())
        for word, _ in count:
            self.dict[word] = len(self.dict)
        self.fited = True
        self.inversed_dict = {v: k for k, v in self.dict.items()}

    def transform(self, sentence, max_len=None):
        # truncate long sentences and pad short ones with PAD so every sequence has length max_len
        if max_len is not None and len(sentence) > max_len:
            sentence = sentence[:max_len]
        padding = [self.PAD] * (max_len - len(sentence)) if max_len is not None else []
        indexed = [self.to_index(word) for word in sentence] + padding
        return np.array(indexed, dtype=np.int64)
```

The classifier itself is a plain MLP: an embedding layer turns each word index into a 300-dimensional vector, the vectors of one review are flattened, and a single fully connected layer maps them to the 10 rating classes:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

from build_dataset import get_dataloader, ws, MAX_LEN


class IMDBModel(nn.Module):
    def __init__(self, max_len):
        super().__init__()
        self.embedding = nn.Embedding(len(ws), 300, padding_idx=ws.PAD)
        self.fc = nn.Linear(max_len * 300, 10)

    def forward(self, x):
        embed = self.embedding(x)          # [batch_size, max_len, 300]
        embed = embed.view(x.size(0), -1)  # [batch_size, max_len * 300]
        out = self.fc(embed)               # [batch_size, 10]
        return F.log_softmax(out, dim=-1)
```
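The listings above use a fitted vocabulary `ws` loaded from ./model/ws.pkl, but the original post does not show the code that builds it. Below is a minimal sketch of that step under the file-layout assumptions already noted (word_sequence.py and build_dataset.py); the min_count and max_feature thresholds are also assumptions, not values from the post:

```python
import os
import pickle

from word_sequence import Word2Sequence
from build_dataset import ImdbDataset

if __name__ == "__main__":
    train_data = ImdbDataset("train")
    # collect the tokenized reviews of the training set only
    sentences = [train_data[i][1] for i in range(len(train_data))]

    ws = Word2Sequence()
    ws.fit(sentences, min_count=10, max_feature=10000)

    os.makedirs("model", exist_ok=True)
    with open("./model/ws.pkl", "wb") as f:
        pickle.dump(ws, f)
    print(len(ws))  # vocabulary size, including UNK and PAD
```

Once this script has run, build_dataset.py can load the pickle, and len(ws) determines the size of the embedding table in IMDBModel.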
Training and evaluation (this continues the script above, so F, get_dataloader, MAX_LEN, and IMDBModel are already in scope):

```python
import torch
import torch.optim as optim

train_batch_size = 128
test_batch_size = 1000

imdb_model = IMDBModel(MAX_LEN)
optimizer = optim.Adam(imdb_model.parameters())


def train():
    max_epochs = 10
    for epoch in range(max_epochs):
        print(f"Train Epoch: {epoch + 1}")
        imdb_model.train()
        train_loader = get_dataloader("train", train_batch_size)
        for idx, (targets, inputs) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = imdb_model(inputs)
            # the model returns log-probabilities, so nll_loss is equivalent to cross entropy
            loss = F.nll_loss(outputs, targets)
            loss.backward()
            optimizer.step()
            if idx % 10 == 0:
                print(f"Train {idx * train_batch_size}/{len(train_loader.dataset)} "
                      f"({100. * idx / len(train_loader):.0f}%)\tLoss: {loss.item():.6f}")
                torch.save(imdb_model.state_dict(), "model/imdb_net.pkl")
                torch.save(optimizer.state_dict(), "model/imdb_optimizer.pkl")


def evaluate():
    imdb_model.eval()
    test_loader = get_dataloader("test", test_batch_size)
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for targets, inputs in test_loader:
            outputs = imdb_model(inputs)
            # sum per-sample losses so the division below yields the true average loss
            total_loss += F.nll_loss(outputs, targets, reduction="sum").item()
            preds = outputs.argmax(dim=-1)
            total_correct += (preds == targets).sum().item()
    avg_loss = total_loss / len(test_loader.dataset)
    accuracy = 100. * total_correct / len(test_loader.dataset)
    print(f"Test set: Avg. loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


if __name__ == "__main__":
    train()
    evaluate()
```
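As a usage example (a sketch, not code from the original post), the trained weights and fitted vocabulary can score a single raw review; this assumes it is appended to the training script above, so that imdb_model is already constructed:

```python
from build_dataset import tokenize, ws, MAX_LEN

def predict(review_text):
    imdb_model.load_state_dict(torch.load("model/imdb_net.pkl"))  # weights saved by train()
    imdb_model.eval()
    tokens = tokenize(review_text)
    # fixed-length index sequence with a batch dimension of 1
    x = torch.tensor(ws.transform(tokens, max_len=MAX_LEN), dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        log_probs = imdb_model(x)
    rating = log_probs.argmax(dim=-1).item() + 1  # map the class index back to the 1-10 rating
    return "positive" if rating >= 7 else "negative"

print(predict("This movie was a wonderful surprise, I loved every minute of it."))
```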
Through this project I worked through the key techniques for handling text data and building a sentiment classification model in PyTorch, mainly:

- cleaning and tokenizing the raw reviews and wrapping them in a Dataset/DataLoader with a custom collate_fn;
- building a vocabulary with Word2Sequence and turning each review into a fixed-length sequence of word indices;
- using nn.Embedding to map word indices to dense vectors and feeding the flattened vectors into a fully connected classifier;
- writing the training loop and evaluating the model on the test set with average loss and accuracy.

This process gave me a deeper understanding of natural language processing tasks and lays a foundation for future projects.