当前位置：首页 > news >正文

做大型网站需要多少钱单页网站案例分析

news 2025/11/18 15:01:31

# Page keywords (scraped metadata, kept verbatim):
# 做大型网站需要多少钱,单页网站案例分析,微信公众号网站开发 2016,eclipse用来做网站前端
#
# Contents
#   1    Chinese text classification
#   1-1  Inference with the pretrained model
#   1-2  Downstream task built on the pretrained model
#   2    Chinese fill-in-the-blank (masked-token prediction)
#   3    Chinese sentence-relationship inference
#
# NOTE: every numbered section below is an independent, standalone script.
# They reuse the same names (Dataset, Model, main, ...), so save and run each
# section as its own file rather than importing one from another.


# =============================================================================
# 1    Chinese text classification
# 1-1  Inference with the pretrained model -- code example
# =============================================================================
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel

# Global tokenizer, shared by the collate function below.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


class Dataset(torch.utils.data.Dataset):
    """One split of the ChnSentiCorp dataset exposed as (text, label) pairs."""

    def __init__(self, split):
        # Load the requested split of the dataset.
        self.dataset = load_dataset(path="lansinuote/ChnSentiCorp", split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        sample = self.dataset[i]
        return sample["text"], sample["label"]


def my_collate_fn(data):
    """Turn a list of (text, label) samples into padded model inputs.

    Args:
        data: list of items as returned by ``Dataset.__getitem__``, i.e.
            ``(text, label)`` pairs.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``; the
        first three come from the tokenizer (padded/truncated to 500 tokens),
        ``labels`` is a ``torch.LongTensor``.
    """
    sents = [item[0] for item in data]
    labels = [item[1] for item in data]

    # Encode the whole batch with the global tokenizer.
    encoded = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        truncation=True,
        padding="max_length",
        max_length=500,
        return_tensors="pt",
        return_length=True,
    )

    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    token_type_ids = encoded["token_type_ids"]
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


def main():
    """Load one training batch and run it through the frozen BERT model."""
    dataset = Dataset("train")  # training split
    # print(len(dataset), dataset[0])

    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=16,
        collate_fn=my_collate_fn,
        shuffle=True,
        drop_last=True,
    )

    # Only the first batch is needed for this demonstration.
    input_ids, attention_mask, token_type_ids, labels = next(iter(loader))

    print(len(loader))
    # Print the shapes (and labels) of the one batch that was loaded.
    print(input_ids.shape, attention_mask.shape, token_type_ids.shape, labels)

    # Load the pretrained model and disable gradient tracking on its weights.
    model = BertModel.from_pretrained("bert-base-chinese")
    for param in model.parameters():
        param.requires_grad_(False)

    # Run the pretrained model on the batch.
    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    # Shape of the last hidden layer's output features.
    print(output.last_hidden_state.shape)


if __name__ == "__main__":
    main()
    print("All done!")

# Output:
#   # one batch from the dataloader
#   torch.Size([16, 500]) torch.Size([16, 500]) torch.Size([16, 500]) tensor([1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1])
#   # output features of the last hidden layer:
#   torch.Size([16, 500, 768])


# =============================================================================
# 1-2  Downstream task built on the pretrained model
#
# Chinese classification using the [CLS]-token feature of the last hidden
# layer of the pretrained BERT model.
# =============================================================================
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel

# Use the GPU when one is present, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Global tokenizer, shared by the collate function below.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


class Dataset(torch.utils.data.Dataset):
    """One split of the ChnSentiCorp dataset exposed as (text, label) pairs."""

    def __init__(self, split):
        # Load the requested split of the dataset.
        self.dataset = load_dataset(path="lansinuote/ChnSentiCorp", split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        sample = self.dataset[i]
        return sample["text"], sample["label"]


def my_collate_fn(data):
    """Turn a list of (text, label) samples into padded model inputs.

    Args:
        data: list of items as returned by ``Dataset.__getitem__``, i.e.
            ``(text, label)`` pairs.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``; the
        first three come from the tokenizer (padded/truncated to 500 tokens),
        ``labels`` is a ``torch.LongTensor``.
    """
    sents = [item[0] for item in data]
    labels = [item[1] for item in data]

    # Encode the whole batch with the global tokenizer.
    encoded = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        truncation=True,
        padding="max_length",
        max_length=500,
        return_tensors="pt",
        return_length=True,
    )

    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    token_type_ids = encoded["token_type_ids"]
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


class Model(torch.nn.Module):
    """Frozen BERT encoder followed by a trainable 2-way linear classifier."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder.
        self.pretrained_model = BertModel.from_pretrained("bert-base-chinese")
        self.fc = torch.nn.Linear(768, 2)

        # Freeze the pretrained encoder; only ``fc`` is trained.
        for param in self.pretrained_model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalized class scores (logits), one row per sample."""
        with torch.no_grad():
            output = self.pretrained_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        # Classify from the [CLS] token (position 0) of the last hidden layer.
        # The raw logits are returned on purpose: CrossEntropyLoss applies
        # log-softmax itself, so a softmax here would normalize twice and
        # flatten the gradients. argmax over logits gives the same prediction.
        return self.fc(output.last_hidden_state[:, 0])


def test(model, dataset):
    """Print the accuracy of ``model`` over the first 5 batches of ``dataset``."""
    model.eval()
    correct = 0
    total = 0

    # Dataloader for the evaluation set.
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=32,
        collate_fn=my_collate_fn,
        shuffle=True,
        drop_last=True,
    )

    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if idx == 5:  # evaluate on 5 batches only
            break
        print(idx)
        with torch.no_grad():
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            token_type_ids = token_type_ids.to(DEVICE)
            labels = labels.to(DEVICE)
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        output = output.argmax(dim=1)
        correct += (output == labels).sum().item()
        total += len(labels)
    print("Acc: ", correct / total)  # overall accuracy over the 5 batches


def main():
    """Train the classifier head for 300 batches, then evaluate it."""
    dataset = Dataset("train")  # training split
    # print(len(dataset), dataset[0])

    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=16,
        num_workers=8,
        collate_fn=my_collate_fn,
        shuffle=True,
        drop_last=True,
    )

    model = Model().to(DEVICE)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 keeps the no-decay behavior the original relied on.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss().to(DEVICE)

    model.train()
    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if idx % 5 == 0:  # report loss and batch accuracy every 5 batches
            output = output.argmax(dim=1)
            accuracy = (output == labels).sum().item() / len(labels)
            print(idx, loss.item(), accuracy)

        if idx == 300:  # train on 300 batches
            break

    test(model, Dataset("validation"))


if __name__ == "__main__":
    main()

# Partial output (recorded with the article's original code, which applied a
# softmax before the loss; loss values from the version above will differ):
#   ...
#   260 0.5995925664901733 0.75
#   265 0.3791050910949707 1.0
#   270 0.42692136764526367 0.9375
#   275 0.4765201210975647 0.875
#   280 0.4071955382823944 0.9375
#   285 0.4194560945034027 0.875
#   290 0.449373722076416 0.9375
#   295 0.38813596963882446 1.0
#   300 0.5164415240287781 0.875
#   Acc:  0.89375


# =============================================================================
# 2    Chinese fill-in-the-blank
#
# The token at position 15 of every training sentence is replaced by [MASK]
# and has to be predicted. BERT extracts the features; the feature of token 15
# in the last hidden layer is classified by a single linear layer of shape
# (768, token.vocab_size). token.vocab_size is 21128, i.e. the layer predicts
# a score for each of the 21128 vocabulary entries, and the loss is computed
# against the true token id.
# =============================================================================
import torch
from datasets import load_dataset, load_from_disk
from transformers import BertTokenizer, BertModel

# Use the GPU when one is present, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Global tokenizer.
token = BertTokenizer.from_pretrained("bert-base-chinese")


class Dataset(torch.utils.data.Dataset):
    """ChnSentiCorp texts longer than 30 characters."""

    def __init__(self, split):
        dataset = load_dataset(path="lansinuote/ChnSentiCorp", split=split)
        # Offline alternative used by the article:
        #   load_from_disk("./data/ChnSentiCorp")[split]

        def f(data):
            return len(data["text"]) > 30

        self.dataset = dataset.filter(f)  # keep only sufficiently long texts

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return self.dataset[i]["text"]


def collate_fn(data):
    """Encode a batch of texts and mask the token at position 15.

    Args:
        data: list of text strings.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)`` where
        ``labels`` holds the original token ids at position 15 and
        ``input_ids[:, 15]`` has been overwritten with the [MASK] id.
    """
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=data,
        truncation=True,
        padding="max_length",
        max_length=30,  # pad/truncate to 30 tokens
        return_tensors="pt",  # return PyTorch tensors
        return_length=True,
    )

    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    token_type_ids = encoded["token_type_ids"]

    # Remember the true token, then replace position 15 with [MASK].
    labels = input_ids[:, 15].reshape(-1).clone()
    # mask_token_id avoids rebuilding the whole vocabulary dict per batch.
    input_ids[:, 15] = token.mask_token_id

    return input_ids, attention_mask, token_type_ids, labels


class Model(torch.nn.Module):
    """Frozen BERT encoder plus a trainable vocabulary-sized decoder."""

    def __init__(self):
        super().__init__()
        # token.vocab_size is 21128: one score per vocabulary entry.
        self.decoder = torch.nn.Linear(768, token.vocab_size, bias=False)
        self.bias = torch.nn.Parameter(torch.zeros(token.vocab_size))
        self.decoder.bias = self.bias
        self.pretrained = BertModel.from_pretrained("bert-base-chinese")

        # Freeze the pretrained encoder.
        for param in self.pretrained.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return vocabulary logits for the token at position 15."""
        # Feature extraction with BERT.
        with torch.no_grad():
            output = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        return self.decoder(output.last_hidden_state[:, 15])


def test(model):
    """Print sample predictions and accuracy over 15 test batches."""
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset("test"),
        batch_size=32,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if idx == 15:  # evaluate on 15 batches only
            break

        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        with torch.no_grad():
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        output = output.argmax(dim=1)
        correct += (output == labels).sum().item()
        total += len(labels)

        print(token.decode(input_ids[0]))  # first sample of the batch
        # Show the model's prediction (output[0]); the original printed
        # labels[0] twice, so the "predicted" label always looked correct.
        print("真实标签: ", token.decode(labels[0]), "预测标签: ", token.decode(output[0]))
    print("Acc: ", correct / total)


def main():
    """Train the decoder for 5 epochs, then evaluate it."""
    dataset = Dataset("train")  # training split

    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=16,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    model = Model().to(DEVICE)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 keeps the no-decay behavior the original relied on.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss().to(DEVICE)

    model.train()
    for epoch in range(5):
        for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            token_type_ids = token_type_ids.to(DEVICE)
            labels = labels.to(DEVICE)

            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if idx % 50 == 0:
                output = output.argmax(dim=1)
                accuracy = (output == labels).sum().item() / len(labels)
                print(epoch, idx, loss.item(), accuracy)

    test(model)


if __name__ == "__main__":
    main()

# Partial output (recorded with the article's original code, in which the
# "predicted label" column was a second copy of the true label):
#   4 300 0.633719801902771 1.0
#   4 350 0.8078413605690002 0.75
#   4 400 0.7607380747795105 0.75
#   4 450 1.2219955921173096 0.875
#   4 500 0.7912384867668152 0.8125
#   4 550 0.4526982307434082 0.875
#   Filter: 100%|██████████████████████████████████████████████████| 1200/1200 [00:00<00:00, 152215.71 examples/s]
#   [CLS] 1. 有急事出去要们童叫出租 [MASK] 他们就叫酒店里的黑车价 [SEP]
#   真实标签: 车 预测标签: 车
#   [CLS] 酒店特别提示 [ 2008 / 02 / 29 - 2008 [MASK] 08 / 30 ] 酒店对面立交桥改造 [SEP]
#   真实标签: / 预测标签: /
#   [CLS] 不知大陆观众有多少看过台湾的 [MASK] 生活智慧王 [UNK] 节目里面介绍 [SEP]
#   真实标签: [ U N K ] 预测标签: [ U N K ]
#   [CLS] 性价比极高我在苏宁买 4699 [MASK] 东才 4399. 功能很全用起来很 [SEP]
#   真实标签: 东 预测标签: 东
#   [CLS] 服务态度极其差前台接待好象 [MASK] 有受过培训连基本的礼貌都 [SEP]
#   真实标签: 没 预测标签: 没
#   [CLS] 自己马上就有宝宝了期待着宝 [MASK] 降临人世所以提前看看家教 [SEP]
#   真实标签: 宝 预测标签: 宝
#   [CLS] 《阴阳师. 晴明取瘤》这本书买 [MASK] 来放在书架上好段日子我都 [SEP]
#   真实标签: 回 预测标签: 回
#   [CLS] 出差入住的酒店, 订了个三人间 [MASK] 房间没空调, 冷得要死, 而且 [SEP]
#   真实标签: . 预测标签: .
#   [CLS] 2007 年 9 月 11 日 256 元住普通标间 [MASK] 街其它房型已无。我是喜 [SEP]
#   真实标签: 临 预测标签: 临
#   [CLS] 1 、作为便携本重了一点厚 [MASK] 一些 2 、屏幕确实太小了上 [SEP]
#   真实标签: 了 预测标签: 了
#   [CLS] 官方给的 [UNK] 碟子和驱动真是让人 [MASK] 郁闷拿到还是自己重新装的 [SEP]
#   真实标签: 很 预测标签: 很
#   [CLS] 外观设计别出心裁配置均衡性 [MASK] 比高比 [UNK] 系列又有进步。散 [SEP]
#   真实标签: 价 预测标签: 价
#   [CLS] 酒店的位置很好, 距离火车站非 [MASK] 近. 总提感觉酒店的性价比不 [SEP]
#   真实标签: 常 预测标签: 常
#   [CLS] 虽然只是刚刚开始阅读但是已 [MASK] 给我带来很多思想冲击了。一 [SEP]
#   真实标签: 经 预测标签: 经
#   [CLS] 于丹的论语心得简直就 [MASK] 胡说八道。除了《论语》之外 [SEP]
#   真实标签: 是 预测标签: 是
#   Acc:  0.7229166666666667


# =============================================================================
# 3    Chinese sentence-relationship inference
# =============================================================================
import torch
import random
from datasets import load_dataset, load_from_disk
from transformers import BertTokenizer, BertModel

# Use the GPU when one is present, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Global tokenizer.
token = BertTokenizer.from_pretrained("bert-base-chinese")


class Dataset(torch.utils.data.Dataset):
    """Sentence pairs cut from ChnSentiCorp texts longer than 40 characters."""

    def __init__(self, split):
        # Online alternative used by the article:
        #   load_dataset(path="lansinuote/ChnSentiCorp", split=split)
        dataset = load_from_disk("./data/ChnSentiCorp")
        dataset = dataset[split]

        def f(data):
            return len(data["text"]) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(sentence1, sentence2, label)`` for sample ``i``.

        ``label`` is 0 when both halves come from the same text and 1 when
        the second half was swapped for a slice of a randomly drawn text.
        """
        text = self.dataset[i]["text"]

        # Split the text into a first half and a second half.
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0  # 0: both halves belong to the same text

        # With probability 1/2 replace the second half by an unrelated one.
        if random.randint(0, 1) == 0:
            j = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[j]["text"][20:40]
            label = 1

        return sentence1, sentence2, label


def collate_fn(data):
    """Encode a batch of (sentence1, sentence2, label) samples as text pairs.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``; the
        pair encodings are padded/truncated to 45 tokens and ``labels`` is a
        ``torch.LongTensor``.
    """
    sents = [item[:2] for item in data]
    labels = [item[2] for item in data]

    # Encode the sentence pairs.
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        truncation=True,
        padding="max_length",
        max_length=45,
        return_tensors="pt",
        return_length=True,
        add_special_tokens=True,
    )

    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    token_type_ids = encoded["token_type_ids"]
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


class Model(torch.nn.Module):
    """Frozen BERT encoder followed by a trainable binary classifier."""

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # binary classification
        self.pretrained = BertModel.from_pretrained("bert-base-chinese")

        # Freeze the pretrained encoder.
        for param in self.pretrained.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalized class scores (logits), one row per sample."""
        with torch.no_grad():
            output = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        # Raw logits on purpose: CrossEntropyLoss applies log-softmax itself,
        # so a softmax here would normalize twice. argmax is unaffected.
        return self.fc(output.last_hidden_state[:, 0])


def main():
    """Train the classifier head for 300 batches, then evaluate it."""
    model = Model().to(DEVICE)
    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 keeps the no-decay behavior the original relied on.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss().to(DEVICE)

    # Dataloader.
    loader = torch.utils.data.DataLoader(
        dataset=Dataset("train"),
        batch_size=8,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    # Training.
    model.train()
    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if idx % 5 == 0:  # report every 5 batches
            output = output.argmax(dim=1)
            accuracy = (output == labels).sum().item() / len(labels)
            print(idx, loss.item(), accuracy)

        if idx == 300:  # train on 300 batches
            break

    # Evaluation.
    test(model)


def test(model):
    """Print the accuracy of ``model`` over the first 5 test batches."""
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset("test"),
        batch_size=32,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if idx == 5:  # evaluate on 5 batches only
            break

        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        with torch.no_grad():
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        pred = output.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += len(labels)
    print("acc:", correct / total)


if __name__ == "__main__":
    main()

# Partial output (recorded with the article's original code, which applied a
# softmax before the loss; loss values from the version above will differ):
#   240 0.39283961057662964 0.875
#   245 0.7069525122642517 0.5
#   250 0.41953372955322266 0.875
#   255 0.5032698512077332 0.75
#   260 0.6422066688537598 0.75
#   265 0.5467717051506042 0.75
#   270 0.4452913701534271 0.875
#   275 0.5998544096946716 0.625
#   280 0.4301206171512604 0.875
#   285 0.5177156329154968 0.75
#   290 0.3987200856208801 0.875
#   295 0.33609679341316223 1.0
#   300 0.3723036050796509 0.875
#   acc: 0.925

查看全文

http://www.pierceye.com/news/65178/