Notes on the paper: Co-DETR: DETRs与协同混合分配训练论文学习笔记 - CSDN博客

The authors propose a new collaborative hybrid assignments training scheme, Co-DETR, to learn more efficient DETR-based detectors from multiple label-assignment manners. This training scheme can easily improve the encoder's learning ability in end-to-end detectors by training multiple parallel auxiliary heads supervised by one-to-many label assignments such as ATSS and Faster R-CNN. In addition, the authors extract positive coordinates from these auxiliary heads to build extra customized positive queries, which improves the training efficiency of positive samples in the decoder. At inference these auxiliary heads are discarded, so the method introduces no extra parameters or computational cost to the original detector, and it requires no hand-crafted non-maximum suppression (NMS).

The code is implemented on top of mmdetection. The main functions and their file paths are listed below for reference while debugging; the codebase is large, and it is easy to get lost in it.

Config file
The config file: \Co-DETR\projects\configs\co_deformable_detr\co_deformable_detr_r50_1x_coco.py
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py'
]
# model settings
num_dec_layer = 6
lambda_2 2.0model dict(typeCoDETR,backbonedict(typeResNet,depth50,num_stages4,out_indices(1, 2, 3),frozen_stages1,norm_cfgdict(typeBN, requires_gradFalse),norm_evalTrue,stylepytorch,init_cfgdict(typePretrained, checkpointtorchvision://resnet50)),neckdict(typeChannelMapper,in_channels[512, 1024, 2048],kernel_size1,out_channels256,act_cfgNone,norm_cfgdict(typeGN, num_groups32),num_outs4),# faster-rcnn的辅助rpn头rpn_headdict(typeRPNHead,in_channels256,feat_channels256,anchor_generatordict(typeAnchorGenerator,octave_base_scale4,scales_per_octave3,ratios[0.5, 1.0, 2.0],strides[8, 16, 32, 64, 128]),bbox_coderdict(typeDeltaXYWHBBoxCoder,target_means[.0, .0, .0, .0],target_stds[1.0, 1.0, 1.0, 1.0]),loss_clsdict(typeCrossEntropyLoss, use_sigmoidTrue, loss_weight1.0*num_dec_layer*lambda_2),loss_bboxdict(typeL1Loss, loss_weight1.0*num_dec_layer*lambda_2)),query_headdict(typeCoDeformDETRHead,num_query300,num_classes80,in_channels2048,sync_cls_avg_factorTrue,with_box_refineTrue,as_two_stageTrue,mixed_selectionTrue,transformerdict(typeCoDeformableDetrTransformer,num_co_heads2, # 辅助头的设置默认2个encoderdict(typeDetrTransformerEncoder,num_layers6,transformerlayersdict(typeBaseTransformerLayer,attn_cfgsdict(typeMultiScaleDeformableAttention, embed_dims256, dropout0.0),feedforward_channels2048,ffn_dropout0.0,operation_order(self_attn, norm, ffn, norm))),decoderdict(typeCoDeformableDetrTransformerDecoder,num_layersnum_dec_layer,return_intermediateTrue,look_forward_twiceTrue,transformerlayersdict(typeDetrTransformerDecoderLayer,attn_cfgs[dict(typeMultiheadAttention,embed_dims256,num_heads8,dropout0.0),dict(typeMultiScaleDeformableAttention,embed_dims256,dropout0.0)],feedforward_channels2048,ffn_dropout0.0,operation_order(self_attn, norm, cross_attn, norm,ffn, norm)))),positional_encodingdict(typeSinePositionalEncoding,num_feats128,normalizeTrue,offset-0.5),loss_clsdict(typeFocalLoss,use_sigmoidTrue,gamma2.0,alpha0.25,loss_weight2.0),loss_bboxdict(typeL1Loss, loss_weight5.0),loss_ioudict(typeGIoULoss, loss_weight2.0)),# faster-rcnn的辅助roi头roi_head[dict(typeCoStandardRoIHead,bbox_roi_extractordict(typeSingleRoIExtractor,roi_layerdict(typeRoIAlign, output_size7, sampling_ratio0),out_channels256,featmap_strides[8, 16, 32, 64],finest_scale112),bbox_headdict(typeShared2FCBBoxHead,in_channels256,fc_out_channels1024,roi_feat_size7,num_classes80,bbox_coderdict(typeDeltaXYWHBBoxCoder,target_means[0., 0., 0., 0.],target_stds[0.1, 0.1, 0.2, 0.2]),reg_class_agnosticFalse,reg_decoded_bboxTrue,loss_clsdict(typeCrossEntropyLoss, use_sigmoidFalse, loss_weight1.0*num_dec_layer*lambda_2),loss_bboxdict(typeGIoULoss, loss_weight10.0*num_dec_layer*lambda_2)))],# ATSS辅助头bbox_head[dict(typeCoATSSHead,num_classes80,in_channels256,stacked_convs1,feat_channels256,anchor_generatordict(typeAnchorGenerator,ratios[1.0],octave_base_scale8,scales_per_octave1,strides[8, 16, 32, 64, 128]),bbox_coderdict(typeDeltaXYWHBBoxCoder,target_means[.0, .0, .0, .0],target_stds[0.1, 0.1, 0.2, 0.2]),loss_clsdict(typeFocalLoss,use_sigmoidTrue,gamma2.0,alpha0.25,loss_weight1.0*num_dec_layer*lambda_2),loss_bboxdict(typeGIoULoss, loss_weight2.0*num_dec_layer*lambda_2),loss_centernessdict(typeCrossEntropyLoss, use_sigmoidTrue, loss_weight1.0*num_dec_layer*lambda_2)),],# model training and testing settingstrain_cfg[dict(assignerdict(typeHungarianAssigner,cls_costdict(typeFocalLossCost, weight2.0),reg_costdict(typeBBoxL1Cost, weight5.0, box_formatxywh),iou_costdict(typeIoUCost, iou_modegiou, 
weight2.0))),dict(rpndict(assignerdict(typeMaxIoUAssigner,pos_iou_thr0.7,neg_iou_thr0.3,min_pos_iou0.3,match_low_qualityTrue,ignore_iof_thr-1),samplerdict(typeRandomSampler,num256,pos_fraction0.5,neg_pos_ub-1,add_gt_as_proposalsFalse),allowed_border-1,pos_weight-1,debugFalse),rpn_proposaldict(nms_pre4000,max_per_img1000,nmsdict(typenms, iou_threshold0.7),min_bbox_size0),rcnndict(assignerdict(typeMaxIoUAssigner,pos_iou_thr0.5,neg_iou_thr0.5,min_pos_iou0.5,match_low_qualityFalse,ignore_iof_thr-1),samplerdict(typeRandomSampler,num512,pos_fraction0.25,neg_pos_ub-1,add_gt_as_proposalsTrue),pos_weight-1,debugFalse)),dict(assignerdict(typeATSSAssigner, topk9),allowed_border-1,pos_weight-1,debugFalse),],test_cfg[dict(max_per_img100),dict(rpndict(nms_pre1000,max_per_img1000,nmsdict(typenms, iou_threshold0.7),min_bbox_size0),rcnndict(score_thr0.0,nmsdict(typenms, iou_threshold0.5),max_per_img100)),dict(nms_pre1000,min_bbox_size0,score_thr0.0,nmsdict(typenms, iou_threshold0.6),max_per_img100),# soft-nms is also supported for rcnn testing# e.g., nmsdict(typesoft_nms, iou_threshold0.5, min_score0.05)])img_norm_cfg dict(mean[123.675, 116.28, 103.53], std[58.395, 57.12, 57.375], to_rgbTrue)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline [dict(typeLoadImageFromFile),dict(typeLoadAnnotations, with_bboxTrue),dict(typeRandomFlip, flip_ratio0.5),dict(typeAutoAugment,policies[[dict(typeResize,img_scale[(480, 1333), (512, 1333), (544, 1333),(576, 1333), (608, 1333), (640, 1333),(672, 1333), (704, 1333), (736, 1333),(768, 1333), (800, 1333)],multiscale_modevalue,keep_ratioTrue)],[dict(typeResize,# The radio of all image in train dataset 7# follow the original implimg_scale[(400, 4200), (500, 4200), (600, 4200)],multiscale_modevalue,keep_ratioTrue),dict(typeRandomCrop,crop_typeabsolute_range,crop_size(384, 600),allow_negative_cropTrue),dict(typeResize,img_scale[(480, 1333), (512, 1333), (544, 1333),(576, 1333), (608, 1333), (640, 1333),(672, 1333), (704, 1333), (736, 1333),(768, 1333), (800, 1333)],multiscale_modevalue,overrideTrue,keep_ratioTrue)]]),dict(typeNormalize, **img_norm_cfg),dict(typePad, size_divisor1),dict(typeDefaultFormatBundle),dict(typeCollect, keys[img, gt_bboxes, gt_labels])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(filter_empty_gt=False, pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    weight_decay=1e-4,
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.1),
            'sampling_offsets': dict(lr_mult=0.1),
            'reference_points': dict(lr_mult=0.1)
        }))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[11])
runner = dict(type='EpochBasedRunner', max_epochs=12)

Here the default config co_deformable_detr_r50_1x_coco.py is used, which enables 2 auxiliary heads by default.
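For reference, every auxiliary-head loss weight written in this config carries the factor num_dec_layer * lambda_2. A minimal sketch of the resulting effective weights, with the numbers read directly from the config above:

num_dec_layer = 6
lambda_2 = 2.0
scale = num_dec_layer * lambda_2            # 12.0

aux_loss_weights = {
    'rpn loss_cls (CrossEntropy)':  1.0 * scale,   # 12.0
    'rpn loss_bbox (L1)':           1.0 * scale,   # 12.0
    'roi loss_cls (CrossEntropy)':  1.0 * scale,   # 12.0
    'roi loss_bbox (GIoU)':        10.0 * scale,   # 120.0
    'atss loss_cls (Focal)':        1.0 * scale,   # 12.0
    'atss loss_bbox (GIoU)':        2.0 * scale,   # 24.0
    'atss loss_centerness (CE)':    1.0 * scale,   # 12.0
}
print(aux_loss_weights)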
Backbone

The input image first goes through ResNet to obtain three levels of feature maps; the last one is additionally downsampled to get four feature maps, which are used as the input to the deformable transformer.

def forward(self, x):
    """Forward function."""
    if self.deep_stem:
        x = self.stem(x)
    else:
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
    x = self.maxpool(x)
    outs = []
    for i, layer_name in enumerate(self.res_layers):
        res_layer = getattr(self, layer_name)
        x = res_layer(x)
        if i in self.out_indices:
            outs.append(x)
    return tuple(outs)

def train(self, mode=True):
    """Convert the model into training mode while keep normalization layer
    freezed."""
    super(ResNet, self).train(mode)
    self._freeze_stages()
    if mode and self.norm_eval:
        for m in self.modules():
            # trick: eval have effect on BatchNorm only
            if isinstance(m, _BatchNorm):
                m.eval()
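The fourth feature level actually comes from the neck rather than the backbone: in this config, ChannelMapper (in_channels [512, 1024, 2048], out_channels 256, num_outs 4) maps each backbone output to 256 channels with a 1x1 conv and produces one extra level with a strided conv on the last feature map. Below is a minimal standalone sketch of that idea, not the mmdet implementation (the real neck also applies GroupNorm):

import torch
import torch.nn as nn

class SimpleChannelMapper(nn.Module):
    """Sketch: 1x1 convs to unify channels, plus one extra stride-2 level."""

    def __init__(self, in_channels=(512, 1024, 2048), out_channels=256):
        super().__init__()
        self.lateral = nn.ModuleList(
            nn.Conv2d(c, out_channels, kernel_size=1) for c in in_channels)
        # extra level: 3x3 stride-2 conv on the last backbone feature map
        self.extra = nn.Conv2d(in_channels[-1], out_channels, 3, stride=2, padding=1)

    def forward(self, feats):
        outs = [conv(f) for conv, f in zip(self.lateral, feats)]
        outs.append(self.extra(feats[-1]))
        return outs  # four maps with strides 8, 16, 32, 64 w.r.t. the input

feats = [torch.randn(1, c, s, s) for c, s in zip((512, 1024, 2048), (100, 50, 25))]
print([o.shape for o in SimpleChannelMapper()(feats)])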
Transformer
codetr的主函数在projects\models\co_detr.py的forward_train中 def forward_train(self,img,img_metas,gt_bboxes,gt_labels,gt_bboxes_ignoreNone,gt_masksNone,proposalsNone,**kwargs):Args:img (Tensor): of shape (N, C, H, W) encoding input images.Typically these should be mean centered and std scaled.img_metas (list[dict]): list of image info dict where each dicthas: img_shape, scale_factor, flip, and may also containfilename, ori_shape, pad_shape, and img_norm_cfg.For details on the values of these keys seemmdet/datasets/pipelines/formatting.py:Collect.gt_bboxes (list[Tensor]): Ground truth bboxes for each image withshape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels (list[Tensor]): class indices corresponding to each boxgt_bboxes_ignore (None | list[Tensor]): specify which boundingboxes can be ignored when computing the loss.gt_masks (None | Tensor) : true segmentation masks for each boxused if the architecture supports a segmentation task.proposals : override rpn proposals with custom proposals. Use whenwith_rpn is False.Returns:dict[str, Tensor]: a dictionary of loss componentsbatch_input_shape tuple(img[0].size()[-2:])for img_meta in img_metas:img_meta[batch_input_shape] batch_input_shapeif not self.with_attn_mask: # remove attn mask for LSJfor i in range(len(img_metas)):input_img_h, input_img_w img_metas[i][batch_input_shape]img_metas[i][img_shape] [input_img_h, input_img_w, 3]# 取了四层feature map每一层的feature map最后会经过1*1的卷积进行降维[N,512/1024/2048/2048,H,W] - [N,256,H,W],此处的H和W为对应层的feature map的尺寸 stride分别为[8,16,32,64]x self.extract_feat(img, img_metas) #losses dict()def upd_loss(losses, idx, weight1):new_losses dict()for k,v in losses.items():new_k {}{}.format(k,idx)if isinstance(v,list) or isinstance(v,tuple):new_losses[new_k] [i*weight for i in v]else:new_losses[new_k] v*weightreturn new_losses# DETR encoder and decoder forwardif self.with_query_head:bbox_losses, x self.query_head.forward_train(x, img_metas, gt_bboxes,gt_labels, gt_bboxes_ignore)losses.update(bbox_losses)# x: 是encoder的输出(memory)根据特征图大小的展开并对最小的特征图进行下采样 [[N 256 H W]*5]# RPN forward and lossif self.with_rpn:proposal_cfg self.train_cfg[self.head_idx].get(rpn_proposal,self.test_cfg[self.head_idx].rpn)rpn_losses, proposal_list self.rpn_head.forward_train(x,img_metas,gt_bboxes,gt_labelsNone,gt_bboxes_ignoregt_bboxes_ignore,proposal_cfgproposal_cfg,**kwargs)losses.update(rpn_losses) # rpn_losses: cls loss和bbox loss proposal_list:rpn输出的经过nms后的proposal [xyxy,score] proposal shape-tuple([1000,5],[1000,5])else:proposal_list proposalspositive_coords []for i in range(len(self.roi_head)):roi_losses self.roi_head[i].forward_train(x, img_metas, proposal_list,gt_bboxes, gt_labels,gt_bboxes_ignore, gt_masks,**kwargs)if self.with_pos_coord: # positive_coords (coords, labels, targets)positive_coords.append(roi_losses.pop(pos_coords))else: if pos_coords in roi_losses.keys():tmp roi_losses.pop(pos_coords) roi_losses upd_loss(roi_losses, idxi)losses.update(roi_losses)# ATSS headfor i in range(len(self.bbox_head)):bbox_losses self.bbox_head[i].forward_train(x, img_metas, gt_bboxes,gt_labels, gt_bboxes_ignore)if self.with_pos_coord: # pos_coords (ori_anchors, ori_labels, ori_bbox_targets, atss)pos_coords bbox_losses.pop(pos_coords)positive_coords.append(pos_coords)else:if pos_coords in bbox_losses.keys():tmp bbox_losses.pop(pos_coords) bbox_losses upd_loss(bbox_losses, idxilen(self.roi_head))losses.update(bbox_losses)if self.with_pos_coord and len(positive_coords)0:for i in range(len(positive_coords)):bbox_losses self.query_head.forward_train_aux(x, 
img_metas, gt_bboxes,gt_labels, gt_bboxes_ignore, positive_coords[i], i)bbox_losses upd_loss(bbox_losses, idxi)losses.update(bbox_losses) return losses
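The upd_loss helper defined inside forward_train only renames (and optionally rescales) each loss key with the index of the auxiliary head, so that the losses of different auxiliary heads stay separate in the final loss dict. A small self-contained usage sketch with made-up values:

import torch

def upd_loss(losses, idx, weight=1):
    """Append the auxiliary-head index to every loss key and rescale the value."""
    new_losses = dict()
    for k, v in losses.items():
        new_k = '{}{}'.format(k, idx)
        if isinstance(v, (list, tuple)):
            new_losses[new_k] = [i * weight for i in v]
        else:
            new_losses[new_k] = v * weight
    return new_losses

roi_losses = {'loss_cls': torch.tensor(0.8), 'loss_bbox': torch.tensor(1.2)}
print(upd_loss(roi_losses, idx=0))   # {'loss_cls0': tensor(0.8000), 'loss_bbox0': tensor(1.2000)}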
The main body of the transformer code is Deformable DETR.
主要位于projects\models\co_deformable_detr_head.py中的forward def forward(self, mlvl_feats, img_metas):Forward function.Args:mlvl_feats (tuple[Tensor]): Features from the upstreamnetwork, each is a 4D-tensor with shape(N, C, H, W).img_metas (list[dict]): List of image information.Returns:all_cls_scores (Tensor): Outputs from the classification head, \shape [nb_dec, bs, num_query, cls_out_channels]. Note \cls_out_channels should includes background.all_bbox_preds (Tensor): Sigmoid outputs from the regression \head with normalized coordinate format (cx, cy, w, h). \Shape [nb_dec, bs, num_query, 4].enc_outputs_class (Tensor): The score of each point on encode \feature map, has shape (N, h*w, num_class). Only when \as_two_stage is True it would be returned, otherwise \None would be returned.enc_outputs_coord (Tensor): The proposal generate from the \encode feature map, has shape (N, h*w, 4). Only when \as_two_stage is True it would be returned, otherwise \None would be returned.batch_size mlvl_feats[0].size(0)input_img_h, input_img_w img_metas[0][batch_input_shape]img_masks mlvl_feats[0].new_ones((batch_size, input_img_h, input_img_w))for img_id in range(batch_size):img_h, img_w, _ img_metas[img_id][img_shape]img_masks[img_id, :img_h, :img_w] 0# Deformable-DETR对每一层的feature map生成对应mask用于记录原始图像在padding图中所占的位置原图位于padding图中的左上角mlvl_masks []mlvl_positional_encodings []for feat in mlvl_feats:mlvl_masks.append(F.interpolate(img_masks[None],sizefeat.shape[-2:]).to(torch.bool).squeeze(0)) # mlvl_masks-[[N H/8 W/8],[N H/16 W/16],[N H/32 W/32],[N H/64 W/64]]mlvl_positional_encodings.append(self.positional_encoding(mlvl_masks[-1])) # 对每个mask层进行位置编码 mlvl_positional_encodings-[[N 256 H/8 W/8],[N 256 H/16 W/16],[N 256 H/32 W/32],[N 256 H/64 W/64]]query_embeds Noneif not self.as_two_stage or self.mixed_selection:query_embeds self.query_embedding.weight # 维度为[300,256]hs, init_reference, inter_references, \enc_outputs_class, enc_outputs_coord, enc_outputs self.transformer(mlvl_feats,mlvl_masks,query_embeds,mlvl_positional_encodings,reg_branchesself.reg_branches if self.with_box_refine else None, # noqa:E501cls_branchesself.cls_branches if self.as_two_stage else None, # noqa:E501return_encoder_outputTrue) # hs [6 300 N 256], init_reference[N 300 4], inter_references [6 N 300 4], enc_outputs_class [N C 80], enc_outputs_coord [N C 4], enc_outputs [C N 256](memory)# hs - inter_states:decoder每一层的输出 [6 300 N 256]# init_reference - init_reference_out:最初的refpoint [N 300 4]# inter_references - inter_references_out:每一层的输出经过Linear后得到的refpoint [6 N 300 4]# enc_outputs_class - enc_outputs_class:gen_encoder_output_proposals得到的output_memory(经过有效性过滤)经过Linear后得到的关于类别的输出 [N C 80]# enc_outputs_coord - enc_outputs_coord_unact:gen_encoder_output_proposals得到的output_memory(经过有效性过滤)经过Linear后得到的关于坐标的输出加上output_proposals [N C 4]# enc_outputs - memory:encoder的输出 [C N 256]outs []num_level len(mlvl_feats)start 0for lvl in range(num_level):bs, c, h, w mlvl_feats[lvl].shapeend start h*wfeat enc_outputs[start:end].permute(1, 2, 0).contiguous() # [N 256 h*w]start endouts.append(feat.reshape(bs, c, h, w)) # encoder的输出根据每一层特征图长宽的展开并存放在outs中 [[N 256 H/8 W/8],[N 256 H/16 W/16],[N 256 H/32 W/32],[N 256 H/64 W/64]]outs.append(self.downsample(outs[-1])) # 加入对outs中最小的特征图进行3*3 stride为2的下采样 [[N 256 H/8 W/8],[N 256 H/16 W/16],[N 256 H/32 W/32],[N 256 H/64 W/64],[N 256 H/128 W/128]]hs hs.permute(0, 2, 1, 3)outputs_classes []outputs_coords []# 从decoder的每一个中间层输出得到类别和bbox并分别存放在outputs_classes和outputs_coords中for lvl in range(hs.shape[0]):if lvl 0:reference 
init_referenceelse:reference inter_references[lvl - 1]reference inverse_sigmoid(reference)outputs_class self.cls_branches[lvl](hs[lvl]) # cls_branches为Linear(256,80) [N 300 256]-[N 300 80]tmp self.reg_branches[lvl](hs[lvl]) #reg_branches为Linear(256,256) Linear(256,256) Linear(256,4) [N 300 256]-[N 300 4]if reference.shape[-1] 4:tmp referenceelse:assert reference.shape[-1] 2tmp[..., :2] referenceoutputs_coord tmp.sigmoid()outputs_classes.append(outputs_class) # 每一个decoder输出的classoutputs_coords.append(outputs_coord) # 每一个decoder输出的coordoutputs_classes torch.stack(outputs_classes) # 将中间层输出的类别合并 [6 N 300 80]outputs_coords torch.stack(outputs_coords) # 将中间层输出的bbox合并 [6 N 300 4]if self.as_two_stage:return outputs_classes, outputs_coords, \enc_outputs_class, \enc_outputs_coord.sigmoid(), outs # outputs_classes:每一个decoder输出的class,outputs_coords:每一个decoder输出的coord(bbox),enc_outputs_class:gen_encoder_output_proposals得到的output_memory经过Linear后得到的关于类别的输出 [N C 80],enc_outputs_coord:gen_encoder_output_proposals得到的output_memory经过Linear后得到的关于坐标的输出加上output_proposals [N C 4],outs:是encoder的输出(memory)根据特征图大小的展开并对最小的特征图进行下采样else:return outputs_classes, outputs_coords, \None, None, outs
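The per-layer refinement in the loop above adds the output of the regression branch to the inverse-sigmoid of the reference point and squashes the sum back with a sigmoid. A tiny numeric sketch of that step with made-up values:

import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))

reference = torch.tensor([[0.40, 0.60, 0.20, 0.30]])   # normalized cxcywh from the previous layer
delta = torch.tensor([[0.10, -0.20, 0.05, 0.00]])      # output of reg_branches[lvl]
refined = (delta + inverse_sigmoid(reference)).sigmoid()
print(refined)   # the refined box used as the next reference point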
其中transformer的具体实现在projects\models\transformer.py的CoDeformableDetrTransformer.forward中 def forward(self,mlvl_feats,mlvl_masks,query_embed,mlvl_pos_embeds,reg_branchesNone,cls_branchesNone,return_encoder_outputFalse,attn_masksNone,**kwargs):Forward function for Transformer.Args:mlvl_feats (list(Tensor)): Input queries fromdifferent level. Each element has shape[bs, embed_dims, h, w].mlvl_masks (list(Tensor)): The key_padding_mask fromdifferent level used for encoder and decoder,each element has shape [bs, h, w].query_embed (Tensor): The query embedding for decoder,with shape [num_query, c].mlvl_pos_embeds (list(Tensor)): The positional encodingof feats from different level, has the shape[bs, embed_dims, h, w].reg_branches (obj:nn.ModuleList): Regression heads forfeature maps from each decoder layer. Only wouldbe passed whenwith_box_refine is True. Default to None.cls_branches (obj:nn.ModuleList): Classification headsfor feature maps from each decoder layer. Only wouldbe passed when as_two_stageis True. Default to None.Returns:tuple[Tensor]: results of decoder containing the following tensor.- inter_states: Outputs from decoder. Ifreturn_intermediate_dec is True output has shape \(num_dec_layers, bs, num_query, embed_dims), else has \shape (1, bs, num_query, embed_dims).- init_reference_out: The initial value of reference \points, has shape (bs, num_queries, 4).- inter_references_out: The internal value of reference \points in decoder, has shape \(num_dec_layers, bs,num_query, embed_dims)- enc_outputs_class: The classification score of \proposals generated from \encoders feature maps, has shape \(batch, h*w, num_classes). \Only would be returned when as_two_stage is True, \otherwise None.- enc_outputs_coord_unact: The regression results \generated from encoders feature maps., has shape \(batch, h*w, 4). 
Only would \be returned when as_two_stage is True, \otherwise None.assert self.as_two_stage or query_embed is not Nonefeat_flatten []mask_flatten []lvl_pos_embed_flatten []spatial_shapes []for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):bs, c, h, w feat.shapespatial_shape (h, w)spatial_shapes.append(spatial_shape)feat feat.flatten(2).transpose(1, 2) # 将H和W打平 [N,256,H,W] - [N,H*W,256]mask mask.flatten(1) # [N,H,W] - [N,H*W]pos_embed pos_embed.flatten(2).transpose(1, 2) # 同样将H和W打平 [N,256,H,W] - [N,H*W,256]lvl_pos_embed pos_embed self.level_embeds[lvl].view(1, 1, -1) # self.level_embed是一个nn.Parameter生成的[4,256]的tensor每一层特征图mask生成的pos_embedlvl_pos_embed_flatten.append(lvl_pos_embed)feat_flatten.append(feat)mask_flatten.append(mask)feat_flatten torch.cat(feat_flatten, 1) # 将打平后的tensor cat在一起 feat_flatten-[N H/8*W/8H/16*W/16H/32*W/32H/64*W/64 256]mask_flatten torch.cat(mask_flatten, 1) # mask_flatten- [N H/8*W/8H/16*W/16H/32*W/32H/64*W/64]lvl_pos_embed_flatten torch.cat(lvl_pos_embed_flatten, 1) # lvl_pos_embed_flatten-[N H/8*W/8H/16*W/16H/32*W/32H/64*W/64 256]spatial_shapes torch.as_tensor(spatial_shapes, dtypetorch.long, devicefeat_flatten.device) # 存放着每一层feature map的[H,W],维度为[4,2] [[H/8 W/8],[H/16 W/16],[H/32 W/32],[H/64 W/64]]level_start_index torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) # cat在一起后feature map的起始索引,如:第一层是0,第二层是H/8*W/80,第三层是H/16*W/16H/8*W/80,最后一层H/32*W/32H/16*W/16H/8*W/80 共4维valid_ratios torch.stack([self.get_valid_ratio(m) for m in mlvl_masks], 1) # 输出一个[N,4,2]的tensor表示每一层的feature map中对应的非padding部分的有效长宽与该层feature map长宽的比值# C H/8*W/8H/16*W/16H/32*W/32H/64*W/64 reference_points-[N C 4 2]reference_points \self.get_reference_points(spatial_shapes,valid_ratios,devicefeat.device)feat_flatten feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims)lvl_pos_embed_flatten lvl_pos_embed_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims)memory self.encoder(queryfeat_flatten,keyNone,valueNone,query_poslvl_pos_embed_flatten,query_key_padding_maskmask_flatten,spatial_shapesspatial_shapes,reference_pointsreference_points,level_start_indexlevel_start_index,valid_ratiosvalid_ratios,**kwargs)memory memory.permute(1, 0, 2) # [C N 256]-[N C 256]bs, _, c memory.shapeif self.as_two_stage:output_memory, output_proposals \self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) # [N C 256] [N C 4]enc_outputs_class cls_branches[self.decoder.num_layers](output_memory) # cls_branches为Linear(256,80) [N C 256]-[N C 80]enc_outputs_coord_unact \reg_branches[self.decoder.num_layers](output_memory) output_proposals #reg_branches为Linear(256,256) Linear(256,256) Linear(256,4) [N C 256]-[N C 4]topk self.two_stage_num_proposalstopk query_embed.shape[0] # 300topk_proposals torch.topk(enc_outputs_class[..., 0], topk, dim1)[1] # 从enc_outputs_class中取前300的query索引 [N 300]topk_coords_unact torch.gather(enc_outputs_coord_unact, 1,topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # 根据topk_proposals对enc_outputs_coord_unact进行采样 [N 300 4]topk_coords_unact topk_coords_unact.detach()reference_points topk_coords_unact.sigmoid()init_reference_out reference_pointspos_trans_out self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) # 对topk_coords_unact进行位置编码之后Linear(512,512) 再LayerNorm 得到[N 300 512]if not self.mixed_selection:query_pos, query torch.split(pos_trans_out, c, dim2)else:# query_embed here is the content embed for deformable DETRquery query_embed.unsqueeze(0).expand(bs, -1, -1) # 由nn.Embedding生成 [300,256] 
- [N 300 256]query_pos, _ torch.split(pos_trans_out, c, dim2) # [N 300 256]else:query_pos, query torch.split(query_embed, c, dim1)query_pos query_pos.unsqueeze(0).expand(bs, -1, -1)query query.unsqueeze(0).expand(bs, -1, -1)reference_points self.reference_points(query_pos).sigmoid()init_reference_out reference_points# decoderquery query.permute(1, 0, 2) # [300 N 256]memory memory.permute(1, 0, 2) # [C N 256]query_pos query_pos.permute(1, 0, 2) # [300 N 256]inter_states, inter_references self.decoder(queryquery,keyNone,valuememory,query_posquery_pos,key_padding_maskmask_flatten,reference_pointsreference_points,spatial_shapesspatial_shapes,level_start_indexlevel_start_index,valid_ratiosvalid_ratios,reg_branchesreg_branches,attn_masksattn_masks,**kwargs) # inter_states [6 300 N 256], inter_references [6 N 300 4]inter_references_out inter_referencesif self.as_two_stage:if return_encoder_output:return inter_states, init_reference_out,\inter_references_out, enc_outputs_class,\enc_outputs_coord_unact, memory# inter_states:decoder每一层的输出 [6 300 N 256]# init_reference_out:最初的refpoint [N 300 4]# inter_references_out:每一层的输出经过Linear后得到的refpoint [6 N 300 4]# enc_outputs_class:gen_encoder_output_proposals得到的output_memory(经过有效性过滤)经过Linear后得到的关于类别的输出 [N C 80]# enc_outputs_coord_unact:gen_encoder_output_proposals得到的output_memory(经过有效性过滤)经过Linear后得到的关于坐标的输出加上output_proposals [N C 4]# memory:encoder的输出 [C N 256]return inter_states, init_reference_out,\inter_references_out, enc_outputs_class,\enc_outputs_coord_unactif return_encoder_output:return inter_states, init_reference_out, \inter_references_out, None, None, memoryreturn inter_states, init_reference_out, \inter_references_out, None, None 这里的encoder和decoder的deformable attention在\anaconda3\envs\codetr\Lib\site-packages\mmcv\ops\multi_scale_deform_attn.py中
class MultiScaleDeformableAttention(BaseModule):An attention module used in Deformable-Detr.Deformable DETR: Deformable Transformers for End-to-End Object Detection.https://arxiv.org/pdf/2010.04159.pdf_.Args:embed_dims (int): The embedding dimension of Attention.Default: 256.num_heads (int): Parallel attention heads. Default: 64.num_levels (int): The number of feature map used inAttention. Default: 4.num_points (int): The number of sampling points foreach query in each head. Default: 4.im2col_step (int): The step used in image_to_column.Default: 64.dropout (float): A Dropout layer on inp_identity.Default: 0.1.batch_first (bool): Key, Query and Value are shape of(batch, n, embed_dim)or (n, batch, embed_dim). Default to False.norm_cfg (dict): Config dict for normalization layer.Default: None.init_cfg (obj:mmcv.ConfigDict): The Config for initialization.Default: None.def __init__(self,embed_dims256,num_heads8,num_levels4,num_points4,im2col_step64,dropout0.1,batch_firstFalse,norm_cfgNone,init_cfgNone):super().__init__(init_cfg)if embed_dims % num_heads ! 0:raise ValueError(fembed_dims must be divisible by num_heads, fbut got {embed_dims} and {num_heads})dim_per_head embed_dims // num_headsself.norm_cfg norm_cfgself.dropout nn.Dropout(dropout)self.batch_first batch_first# youd better set dim_per_head to a power of 2# which is more efficient in the CUDA implementationdef _is_power_of_2(n):if (not isinstance(n, int)) or (n 0):raise ValueError(invalid input for _is_power_of_2: {} (type: {}).format(n, type(n)))return (n (n - 1) 0) and n ! 0if not _is_power_of_2(dim_per_head):warnings.warn(Youd better set embed_dims in MultiScaleDeformAttention to make the dimension of each attention head a power of 2 which is more efficient in our CUDA implementation.)self.im2col_step im2col_stepself.embed_dims embed_dimsself.num_levels num_levelsself.num_heads num_headsself.num_points num_pointsself.sampling_offsets nn.Linear(embed_dims, num_heads * num_levels * num_points * 2)self.attention_weights nn.Linear(embed_dims,num_heads * num_levels * num_points)self.value_proj nn.Linear(embed_dims, embed_dims)self.output_proj nn.Linear(embed_dims, embed_dims)self.init_weights()def init_weights(self):Default initialization for Parameters of Module.constant_init(self.sampling_offsets, 0.)thetas torch.arange(self.num_heads,dtypetorch.float32) * (2.0 * math.pi / self.num_heads)grid_init torch.stack([thetas.cos(), thetas.sin()], -1)grid_init (grid_init /grid_init.abs().max(-1, keepdimTrue)[0]).view(self.num_heads, 1, 1,2).repeat(1, self.num_levels, self.num_points, 1)for i in range(self.num_points):grid_init[:, :, i, :] * i 1self.sampling_offsets.bias.data grid_init.view(-1)constant_init(self.attention_weights, val0., bias0.)xavier_init(self.value_proj, distributionuniform, bias0.)xavier_init(self.output_proj, distributionuniform, bias0.)self._is_init Truedeprecated_api_warning({residual: identity},cls_nameMultiScaleDeformableAttention)def forward(self,query,keyNone,valueNone,identityNone,query_posNone,key_padding_maskNone,reference_pointsNone,spatial_shapesNone,level_start_indexNone,**kwargs):Forward Function of MultiScaleDeformAttention.Args:query (torch.Tensor): Query of Transformer with shape(num_query, bs, embed_dims).key (torch.Tensor): The key tensor with shape(num_key, bs, embed_dims).value (torch.Tensor): The value tensor with shape(num_key, bs, embed_dims).identity (torch.Tensor): The tensor used for addition, with thesame shape as query. Default None. 
If None,query will be used.query_pos (torch.Tensor): The positional encoding for query.Default: None.key_pos (torch.Tensor): The positional encoding for key. DefaultNone.reference_points (torch.Tensor): The normalized referencepoints with shape (bs, num_query, num_levels, 2),all elements is range in [0, 1], top-left (0,0),bottom-right (1, 1), including padding area.or (N, Length_{query}, num_levels, 4), addadditional two dimensions is (w, h) toform reference boxes.key_padding_mask (torch.Tensor): ByteTensor for query, withshape [bs, num_key].spatial_shapes (torch.Tensor): Spatial shape of features indifferent levels. With shape (num_levels, 2),last dimension represents (h, w).level_start_index (torch.Tensor): The start index of each level.A tensor has shape (num_levels, ) and can be representedas [0, h_0*w_0, h_0*w_0h_1*w_1, ...].Returns:torch.Tensor: forwarded results with shape[num_query, bs, embed_dims].# encoder:query shape [C N 256] 其中CH/8*W/8H/16*W/16H/32*W/32H/64*W/64 / decoder:query shape [300 N 256]if value is None:value queryif identity is None:identity queryif query_pos is not None:query query query_pos # encoder:query lvl_pos_embed(由每层的feature map的mask生成) / decoder:query pos_trans_out得到的query_posif not self.batch_first:# change to (bs, num_query ,embed_dims)query query.permute(1, 0, 2) # encoder:[C N 256]-[N C 256] /decoder:[300 N 256]-[N 300 256]value value.permute(1, 0, 2) # encoder:[C N 256]-[N C 256] /decoder:[C N 256]-[N C 256]bs, num_query, _ query.shapebs, num_value, _ value.shapeassert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() num_valuevalue self.value_proj(value) # Linear(256,256) [N C 256]-[N C 256]if key_padding_mask is not None:value value.masked_fill(key_padding_mask[..., None], 0.0) # 保留有效区域的值非有效区域用0代替value value.view(bs, num_value, self.num_heads, -1) # num_heads8 [N C 256]-[N C 8 32]sampling_offsets self.sampling_offsets(query).view(bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) # sampling_offsets为Linear(256,256) num_levelsnum_points4 # 每个query产生对应不同head不同level的偏置sampling_offsets的shape由 encoder:[N,C,256] - [N,C,8,4,4,2] / decoder:[N,300,256] - [N,300,8,4,4,2]attention_weights self.attention_weights(query).view(bs, num_query, self.num_heads, self.num_levels * self.num_points) # attention_weights为Linear(256,128) 每个偏置向量的权重经过Linear(256,128),attention_weights的shape由 encoder:[N,C,256] - [N,C,8,16] / decoder:[N,300,256] - [N,300,8,16]attention_weights attention_weights.softmax(-1)# 对属于同一个query的来自与不同level的向量权重在每个head分别归一化softmax后attention_weights的shape由 encoder:[N,C,8,16] - [N,C,8,4,4] / decoder:[N,300,8,16] - [N,300,8,4,4]attention_weights attention_weights.view(bs, num_query,self.num_heads,self.num_levels,self.num_points)if reference_points.shape[-1] 2:offset_normalizer torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) # offset_normalizer 将input_spatial_shapes中[H,W]的形式转化为[W,H]sampling_locations reference_points[:, :, None, :, None, :] \ sampling_offsets \/ offset_normalizer[None, None, None, :, None, :] # 采样点的坐标[N,C,8,4,4,2]elif reference_points.shape[-1] 4:sampling_locations reference_points[:, :, None, :, None, :2] \ sampling_offsets / self.num_points \* reference_points[:, :, None, :, None, 2:] \* 0.5 # 采样点的坐标[N,300,8,4,4,2]else:raise ValueError(fLast dim of reference_points must bef 2 or 4, but get {reference_points.shape[-1]} instead.)if torch.cuda.is_available() and value.is_cuda:output MultiScaleDeformableAttnFunction.apply(value, spatial_shapes, level_start_index, sampling_locations,attention_weights, 
self.im2col_step)else:output multi_scale_deformable_attn_pytorch(value, spatial_shapes, sampling_locations, attention_weights)output self.output_proj(output) # 输出经过一个Linear层维度由 encoder:[N,C,256] - [N,C,256] / decoder: [N,300,256] - [N,300,256]if not self.batch_first:# (num_query, bs ,embed_dims)output output.permute(1, 0, 2)return self.dropout(output) identity
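For the 2-dimensional reference-point branch above, a sampling location is simply the normalized reference point plus the predicted offset divided by (w, h) of the corresponding feature level. A tiny numeric sketch with made-up values:

import torch

spatial_shape = torch.tensor([100., 150.])       # (h, w) of one feature level
offset_normalizer = spatial_shape.flip(0)        # (w, h) = (150, 100)

reference_point = torch.tensor([0.40, 0.60])     # normalized (x, y) in [0, 1]
sampling_offset = torch.tensor([3.0, -2.0])      # predicted offset, roughly in pixels

sampling_location = reference_point + sampling_offset / offset_normalizer
print(sampling_location)                         # tensor([0.4200, 0.5800])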
The loss uses Hungarian matching for the one-to-one assignment, and then adds up the bbox loss, the cls loss and the IoU loss.
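A minimal sketch of what the HungarianAssigner does, using the cost weights from this config (cls 2.0, L1 5.0, GIoU 2.0); the classification cost here is simplified to the negative predicted probability instead of FocalLossCost, and the GIoU term is left out for brevity:

import torch
from scipy.optimize import linear_sum_assignment

def simple_hungarian_assign(cls_prob, bbox_pred, gt_bboxes, gt_labels):
    """cls_prob: [num_query, num_classes]; bbox_pred / gt_bboxes: normalized cxcywh."""
    cls_cost = -cls_prob[:, gt_labels]                  # [num_query, num_gt]
    reg_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)   # [num_query, num_gt]
    cost = 2.0 * cls_cost + 5.0 * reg_cost              # (+ 2.0 * GIoU cost in the real code)
    row_ind, col_ind = linear_sum_assignment(cost.detach().cpu().numpy())
    return row_ind, col_ind                             # matched query indices, matched gt indices

cls_prob = torch.rand(300, 80)
bbox_pred = torch.rand(300, 4)
gt_bboxes = torch.rand(3, 4)
gt_labels = torch.tensor([5, 17, 42])
print(simple_hungarian_assign(cls_prob, bbox_pred, gt_bboxes, gt_labels))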
loss的计算过程中会对每个特征图的batch中的每个元素单独进行的通过get_targets将batch元素拆分成一个列表 def get_targets(self,cls_scores_list,bbox_preds_list,gt_bboxes_list,gt_labels_list,img_metas,gt_bboxes_ignore_listNone):Compute regression and classification targets for a batch image.Outputs from a single decoder layer of a single feature level are used.Args:cls_scores_list (list[Tensor]): Box score logits from a singledecoder layer for each image with shape [num_query,cls_out_channels].bbox_preds_list (list[Tensor]): Sigmoid outputs from a singledecoder layer for each image, with normalized coordinate(cx, cy, w, h) and shape [num_query, 4].gt_bboxes_list (list[Tensor]): Ground truth bboxes for each imagewith shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels_list (list[Tensor]): Ground truth class indices for eachimage with shape (num_gts, ).img_metas (list[dict]): List of image meta information.gt_bboxes_ignore_list (list[Tensor], optional): Boundingboxes which can be ignored for each image. Default None.Returns:tuple: a tuple containing the following targets.- labels_list (list[Tensor]): Labels for all images.- label_weights_list (list[Tensor]): Label weights for all \images.- bbox_targets_list (list[Tensor]): BBox targets for all \images.- bbox_weights_list (list[Tensor]): BBox weights for all \images.- num_total_pos (int): Number of positive samples in all \images.- num_total_neg (int): Number of negative samples in all \images.# assert gt_bboxes_ignore_list is None, \# Only supports for gt_bboxes_ignore setting to None.num_imgs len(cls_scores_list) # list([300 80],[300 80])if gt_bboxes_ignore_list is None:gt_bboxes_ignore_list [gt_bboxes_ignore_list for _ in range(num_imgs)]# labels_list:当前batch中值全为80的tensor在pos_inds处插入了gt_labels其余索引处值为80 长度为bs的list(默认bs2)# label_weights_list:维度为[300] 值全为1 长度为bs的list# bbox_targets_list:当前batch中值全为[0 0 0 0]的tensor在pos_inds处插入了pos_gt_bboxes_targets其余索引处值为[0 0 0 0] 长度为bs的list# bbox_weights_list:bbox_weights根据pos_inds设置为1即存在gt bbox的索引处设置为1 长度为bs的list# pos_inds_list:正样本在匈牙利匹配中row上的索引 长度为bs的list# neg_inds_list:负样本在匈牙利匹配中row上的索引 长度为bs的list# 通过multi_apply对batch中每一张图的gt和pred(此处主要用在匈牙利算法中)单独处理之后再将每个batch上的值存放在一个list中(labels_list, label_weights_list, bbox_targets_list,bbox_weights_list, pos_inds_list, neg_inds_list) multi_apply(self._get_target_single, cls_scores_list, bbox_preds_list,gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)num_total_pos sum((inds.numel() for inds in pos_inds_list)) # 该batch中正样本的总数num_total_neg sum((inds.numel() for inds in neg_inds_list)) # 该batch中负样本的总数return (labels_list, label_weights_list, bbox_targets_list,bbox_weights_list, num_total_pos, num_total_neg)def _get_target_single(self,cls_score,bbox_pred,gt_bboxes,gt_labels,img_meta,gt_bboxes_ignoreNone):Compute regression and classification targets for one image.Outputs from a single decoder layer of a single feature level are used.Args:cls_score (Tensor): Box score logits from a single decoder layerfor one image. Shape [num_query, cls_out_channels].bbox_pred (Tensor): Sigmoid outputs from a single decoder layerfor one image, with normalized coordinate (cx, cy, w, h) andshape [num_query, 4].gt_bboxes (Tensor): Ground truth bboxes for one image withshape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels (Tensor): Ground truth class indices for one imagewith shape (num_gts, ).img_meta (dict): Meta information for one image.gt_bboxes_ignore (Tensor, optional): Bounding boxeswhich can be ignored. 
Default None.Returns:tuple[Tensor]: a tuple containing the following for one image.- labels (Tensor): Labels of each image.- label_weights (Tensor]): Label weights of each image.- bbox_targets (Tensor): BBox targets of each image.- bbox_weights (Tensor): BBox weights of each image.- pos_inds (Tensor): Sampled positive indices for each image.- neg_inds (Tensor): Sampled negative indices for each image.num_bboxes bbox_pred.size(0)ori_gt_bboxes_ignore gt_bboxes_ignoregt_bboxes_ignore None# assigner and sampler # 匈牙利匹配assign_result self.assigner.assign(bbox_pred, cls_score, gt_bboxes,gt_labels, img_meta,gt_bboxes_ignore)sampling_result self.sampler.sample(assign_result, bbox_pred,gt_bboxes)pos_inds sampling_result.pos_indsneg_inds sampling_result.neg_inds# label targetslabels gt_bboxes.new_full((num_bboxes, ),self.num_classes,dtypetorch.long) # [300] 值为80labels[pos_inds] gt_labels[sampling_result.pos_assigned_gt_inds] # 将gt_labels按正样本col上索引排序并按正样本row上索引插入labelslabel_weights gt_bboxes.new_ones(num_bboxes) # [300] 值为1# bbox targetsbbox_targets torch.zeros_like(bbox_pred) # [300 4]值为0bbox_weights torch.zeros_like(bbox_pred) # [300 4]值为0bbox_weights[pos_inds] 1.0 # bbox_weights根据正样本row上索引设置为1img_h, img_w, _ img_meta[img_shape]# DETR regress the relative position of boxes (cxcywh) in the image.# Thus the learning target should be normalized by the image size, also# the box format should be converted from defaultly x1y1x2y2 to cxcywh.factor bbox_pred.new_tensor([img_w, img_h, img_w,img_h]).unsqueeze(0)pos_gt_bboxes_normalized sampling_result.pos_gt_bboxes / factor # 归一化的gt bboxpos_gt_bboxes_targets bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) # xyxy-cxcywhbbox_targets[pos_inds] pos_gt_bboxes_targets # 将pos_gt_bboxes_targets根据pos_inds插入bbox_targetsreturn (labels, label_weights, bbox_targets, bbox_weights, pos_inds,neg_inds)# labels,在pos_inds处插入了gt_labels其余值为80# label_weights,维度为[300] 值全为1# bbox_targets,在pos_inds处插入了pos_gt_bboxes_targets其余值为0# bbox_weights,bbox_weights根据pos_inds设置为1# pos_inds,正样本在匈牙利匹配中row上的索引# neg_inds,负样本在匈牙利匹配中row上的索引
The loss is then computed:
def loss(self,all_cls_scores,all_bbox_preds,enc_cls_scores,enc_bbox_preds,enc_outputs,gt_bboxes_list,gt_labels_list,img_metas,gt_bboxes_ignoreNone):Loss function.Args:all_cls_scores (Tensor): Classification score of alldecoder layers, has shape[nb_dec, bs, num_query, cls_out_channels].all_bbox_preds (Tensor): Sigmoid regressionoutputs of all decode layers. Each is a 4D-tensor withnormalized coordinate format (cx, cy, w, h) and shape[nb_dec, bs, num_query, 4].enc_cls_scores (Tensor): Classification scores ofpoints on encode feature map , has shape(N, h*w, num_classes). Only be passed when as_two_stage isTrue, otherwise is None.enc_bbox_preds (Tensor): Regression results of each pointson the encode feature map, has shape (N, h*w, 4). Only bepassed when as_two_stage is True, otherwise is None.gt_bboxes_list (list[Tensor]): Ground truth bboxes for each imagewith shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels_list (list[Tensor]): Ground truth class indices for eachimage with shape (num_gts, ).img_metas (list[dict]): List of image meta information.gt_bboxes_ignore (list[Tensor], optional): Bounding boxeswhich can be ignored for each image. Default None.Returns:dict[str, Tensor]: A dictionary of loss components.# gt_bboxes_ignore None# assert gt_bboxes_ignore is None, \# f{self.__class__.__name__} only supports \# ffor gt_bboxes_ignore setting to None.# all_cls_scores:每一个decoder输出class(6层),# all_bbox_preds:每一个decoder输出coord(6层),# enc_cls_scores:gen_encoder_output_proposals得到的output_memory(经过筛选)经过Linear后得到的关于类别的输出 [N C 80],# enc_bbox_preds:gen_encoder_output_proposals得到的output_memory(经过筛选)经过Linear后得到的关于坐标的输出加上output_proposals [N C 4],# enc_outputs:是encoder的输出(memory)根据特征图大小的展开并对最小的特征图进行下采样num_dec_layers len(all_cls_scores)all_gt_bboxes_list [gt_bboxes_list for _ in range(num_dec_layers)] # 对gt_bboxes_list复制6次用于对应每一层decoder的输出all_gt_labels_list [gt_labels_list for _ in range(num_dec_layers)] # 对gt_labels_list复制6次用于对应每一层decoder的输出all_gt_bboxes_ignore_list [gt_bboxes_ignore for _ in range(num_dec_layers)] # [None, None, None, None, None, None]img_metas_list [img_metas for _ in range(num_dec_layers)] # 对img_metas复制6次# 对GT bbox和label以及图像信息复制成长度为6的列表losses_cls, losses_bbox, losses_iou multi_apply(self.loss_single, all_cls_scores, all_bbox_preds,all_gt_bboxes_list, all_gt_labels_list, img_metas_list,all_gt_bboxes_ignore_list)loss_dict dict()# loss of proposal generated from encode feature map.if enc_cls_scores is not None:binary_labels_list [torch.zeros_like(gt_labels_list[i])for i in range(len(img_metas))]enc_loss_cls, enc_losses_bbox, enc_losses_iou \self.loss_single(enc_cls_scores, enc_bbox_preds,gt_bboxes_list, binary_labels_list,img_metas, gt_bboxes_ignore)loss_dict[enc_loss_cls] enc_loss_clsloss_dict[enc_loss_bbox] enc_losses_bboxloss_dict[enc_loss_iou] enc_losses_iou# loss from the last decoder layerloss_dict[loss_cls] losses_cls[-1]loss_dict[loss_bbox] losses_bbox[-1]loss_dict[loss_iou] losses_iou[-1]# loss from other decoder layersnum_dec_layer 0for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],losses_bbox[:-1],losses_iou[:-1]):loss_dict[fd{num_dec_layer}.loss_cls] loss_cls_iloss_dict[fd{num_dec_layer}.loss_bbox] loss_bbox_iloss_dict[fd{num_dec_layer}.loss_iou] loss_iou_inum_dec_layer 1return loss_dictdef loss_single(self,cls_scores,bbox_preds,gt_bboxes_list,gt_labels_list,img_metas,gt_bboxes_ignore_listNone):Loss function for outputs from a single decoder layer of a singlefeature level.Args:cls_scores (Tensor): Box score logits from a single decoder layerfor all 
images. Shape [bs, num_query, cls_out_channels].bbox_preds (Tensor): Sigmoid outputs from a single decoder layerfor all images, with normalized coordinate (cx, cy, w, h) andshape [bs, num_query, 4].gt_bboxes_list (list[Tensor]): Ground truth bboxes for each imagewith shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels_list (list[Tensor]): Ground truth class indices for eachimage with shape (num_gts, ).img_metas (list[dict]): List of image meta information.gt_bboxes_ignore_list (list[Tensor], optional): Boundingboxes which can be ignored for each image. Default None.Returns:dict[str, Tensor]: A dictionary of loss components for outputs froma single decoder layer.num_imgs cls_scores.size(0)cls_scores_list [cls_scores[i] for i in range(num_imgs)] # cls_scores[N 300 80] 根据图像数量在batch维度拆分成每张图像对应的cls预测bbox_preds_list [bbox_preds[i] for i in range(num_imgs)] # bbox_preds[N 300 4] 根据图像数量在batch维度拆分成每张图像对应的bbox预测cls_reg_targets self.get_targets(cls_scores_list, bbox_preds_list,gt_bboxes_list, gt_labels_list,img_metas, gt_bboxes_ignore_list)(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,num_total_pos, num_total_neg) cls_reg_targetslabels torch.cat(labels_list, 0) # 对每张图上经过匈牙利匹配后得到的gt label合并label_weights torch.cat(label_weights_list, 0) # 值为1bbox_targets torch.cat(bbox_targets_list, 0) # 对每张图上经过匈牙利匹配后得到的gt bbox合并bbox_weights torch.cat(bbox_weights_list, 0) # 存在gt bbox的索引位置值为1# classification losscls_scores cls_scores.reshape(-1, self.cls_out_channels)# construct weighted avg_factor to match with the official DETR repocls_avg_factor num_total_pos * 1.0 \num_total_neg * self.bg_cls_weightif self.sync_cls_avg_factor:cls_avg_factor reduce_mean(cls_scores.new_tensor([cls_avg_factor]))cls_avg_factor max(cls_avg_factor, 1)loss_cls self.loss_cls(cls_scores, labels, label_weights, avg_factorcls_avg_factor)# Compute the average number of gt boxes across all gpus, for# normalization purposesnum_total_pos loss_cls.new_tensor([num_total_pos])num_total_pos torch.clamp(reduce_mean(num_total_pos), min1).item()# construct factors used for rescale bboxesfactors []for img_meta, bbox_pred in zip(img_metas, bbox_preds):img_h, img_w, _ img_meta[img_shape]factor bbox_pred.new_tensor([img_w, img_h, img_w,img_h]).unsqueeze(0).repeat(bbox_pred.size(0), 1)factors.append(factor)factors torch.cat(factors, 0) # bbox scale 用于将bbox从归一化坐标恢复到图像坐标# DETR regress the relative position of boxes (cxcywh) in the image,# thus the learning target is normalized by the image size. So here# we need to re-scale them for calculating IoU lossbbox_preds bbox_preds.reshape(-1, 4)bboxes bbox_cxcywh_to_xyxy(bbox_preds) * factorsbboxes_gt bbox_cxcywh_to_xyxy(bbox_targets) * factors# regression IoU loss, defaultly GIoU lossloss_iou self.loss_iou(bboxes, bboxes_gt, bbox_weights, avg_factornum_total_pos)# regression L1 lossloss_bbox self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factornum_total_pos)return loss_cls, loss_bbox, loss_iou RPN head
# RPN forward and lossif self.with_rpn:proposal_cfg self.train_cfg[self.head_idx].get(rpn_proposal,self.test_cfg[self.head_idx].rpn)rpn_losses, proposal_list self.rpn_head.forward_train(x,img_metas,gt_bboxes,gt_labelsNone,gt_bboxes_ignoregt_bboxes_ignore,proposal_cfgproposal_cfg,**kwargs)losses.update(rpn_losses) # rpn_losses: cls loss和bbox loss proposal_list:rpn输出的经过nms后的proposal [xyxy,score] proposal shape-tuple([1000,5],[1000,5])else:proposal_list proposalspositive_coords []for i in range(len(self.roi_head)):roi_losses self.roi_head[i].forward_train(x, img_metas, proposal_list,gt_bboxes, gt_labels,gt_bboxes_ignore, gt_masks,**kwargs)if self.with_pos_coord: # positive_coords (coords, labels, targets)positive_coords.append(roi_losses.pop(pos_coords))else: if pos_coords in roi_losses.keys():tmp roi_losses.pop(pos_coords) roi_losses upd_loss(roi_losses, idxi)losses.update(roi_losses) RPNHead( (loss_cls): CrossEntropyLoss(avg_non_ignoreFalse) (loss_bbox): L1Loss() (rpn_conv): Conv2d(256, 256, kernel_size(3, 3), stride(1, 1), padding(1, 1)) (rpn_cls): Conv2d(256, 9, kernel_size(1, 1), stride(1, 1)) (rpn_reg): Conv2d(256, 36, kernel_size(1, 1), stride(1, 1)) ) init_cfg{type: Normal, layer: Conv2d, std: 0.01} 这里就是faster rcnn的主体了其中输入的x是之前encoder输出的query根据每一层特征图的展开并对最后一层的特征图进行下采样得到五层特征图作为faster rcnn中rpn head的输入 def forward_train(self,x,img_metas,gt_bboxes,gt_labelsNone,gt_bboxes_ignoreNone,proposal_cfgNone,**kwargs):Args:x (list[Tensor]): Features from FPN.img_metas (list[dict]): Meta information of each image, e.g.,image size, scaling factor, etc.gt_bboxes (Tensor): Ground truth bboxes of the image,shape (num_gts, 4).gt_labels (Tensor): Ground truth labels of each box,shape (num_gts,).gt_bboxes_ignore (Tensor): Ground truth bboxes to beignored, shape (num_ignored_gts, 4).proposal_cfg (mmcv.Config): Test / postprocessing configuration,if None, test_cfg would be usedReturns:tuple:losses: (dict[str, Tensor]): A dictionary of loss components.proposal_list (list[Tensor]): Proposals of each image.outs self(x) # outs是一个tuple类型存放两个list元素第一个list存放rpn_cls_score, 第二个list存放rpn_bbox_predif gt_labels is None:loss_inputs outs (gt_bboxes, img_metas) # outs与gt bbox和图像信息合并else:loss_inputs outs (gt_bboxes, gt_labels, img_metas)losses self.loss(*loss_inputs, gt_bboxes_ignoregt_bboxes_ignore) # 计算bbox lossl1 loss和cls lossCE lossif proposal_cfg is None:return losseselse:proposal_list self.get_bboxes(*outs, img_metasimg_metas, cfgproposal_cfg)return losses, proposal_list
对rpn的输出求cls loss和bbox lossL1 loss def loss_single(self, cls_score, bbox_pred, anchors, labels, label_weights,bbox_targets, bbox_weights, num_total_samples):Compute loss of a single scale level.Args:cls_score (Tensor): Box scores for each scale levelHas shape (N, num_anchors * num_classes, H, W).bbox_pred (Tensor): Box energies / deltas for each scalelevel with shape (N, num_anchors * 4, H, W).anchors (Tensor): Box reference for each scale level with shape(N, num_total_anchors, 4).labels (Tensor): Labels of each anchors with shape(N, num_total_anchors).label_weights (Tensor): Label weights of each anchor with shape(N, num_total_anchors)bbox_targets (Tensor): BBox regression targets of each anchorweight shape (N, num_total_anchors, 4).bbox_weights (Tensor): BBox regression loss weights of each anchorwith shape (N, num_total_anchors, 4).num_total_samples (int): If sampling, num total samples equal tothe number of total anchors; Otherwise, it is the number ofpositive anchors.Returns:dict[str, Tensor]: A dictionary of loss components.# classification losslabels labels.reshape(-1)label_weights label_weights.reshape(-1)cls_score cls_score.permute(0, 2, 3,1).reshape(-1, self.cls_out_channels)loss_cls self.loss_cls(cls_score, labels, label_weights, avg_factornum_total_samples)# regression lossbbox_targets bbox_targets.reshape(-1, 4)bbox_weights bbox_weights.reshape(-1, 4)bbox_pred bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)if self.reg_decoded_bbox:# When the regression loss (e.g. IouLoss, GIouLoss)# is applied directly on the decoded bounding boxes, it# decodes the already encoded coordinates to absolute format.anchors anchors.reshape(-1, 4)bbox_pred self.bbox_coder.decode(anchors, bbox_pred)loss_bbox self.loss_bbox(bbox_pred,bbox_targets,bbox_weights,avg_factornum_total_samples)return loss_cls, loss_bboxforce_fp32(apply_to(cls_scores, bbox_preds))def loss(self,cls_scores,bbox_preds,gt_bboxes,gt_labels,img_metas,gt_bboxes_ignoreNone):Compute losses of the head.Args:cls_scores (list[Tensor]): Box scores for each scale levelHas shape (N, num_anchors * num_classes, H, W)bbox_preds (list[Tensor]): Box energies / deltas for each scalelevel with shape (N, num_anchors * 4, H, W)gt_bboxes (list[Tensor]): Ground truth bboxes for each image withshape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels (list[Tensor]): class indices corresponding to each boximg_metas (list[dict]): Meta information of each image, e.g.,image size, scaling factor, etc.gt_bboxes_ignore (None | list[Tensor]): specify which boundingboxes can be ignored when computing the loss. Default: NoneReturns:dict[str, Tensor]: A dictionary of loss components. 
# RPN 的cls 和bbox lossfeatmap_sizes [featmap.size()[-2:] for featmap in cls_scores] # 每一层feature map的尺寸assert len(featmap_sizes) self.prior_generator.num_levelsdevice cls_scores[0].device# anchor_list: 每个特征图上生成的anchor valid_flag_list: 有效区域的标识符用于判断anchor是否在图像的有效区域内anchor_list, valid_flag_list self.get_anchors(featmap_sizes, img_metas, devicedevice)label_channels self.cls_out_channels if self.use_sigmoid_cls else 1cls_reg_targets self.get_targets(anchor_list,valid_flag_list,gt_bboxes,img_metas,gt_bboxes_ignore_listgt_bboxes_ignore,gt_labels_listgt_labels,label_channelslabel_channels)if cls_reg_targets is None:return None(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,num_total_pos, num_total_neg) cls_reg_targetsnum_total_samples (num_total_pos num_total_neg if self.sampling else num_total_pos)# anchor number of multi levelsnum_level_anchors [anchors.size(0) for anchors in anchor_list[0]]# concat all level anchors and flags to a single tensorconcat_anchor_list []for i in range(len(anchor_list)):concat_anchor_list.append(torch.cat(anchor_list[i]))all_anchor_list images_to_levels(concat_anchor_list,num_level_anchors) # 将已经合并的anchor按特征图层级展开losses_cls, losses_bbox multi_apply(self.loss_single,cls_scores,bbox_preds,all_anchor_list,labels_list,label_weights_list,bbox_targets_list,bbox_weights_list,num_total_samplesnum_total_samples)return dict(loss_clslosses_cls, loss_bboxlosses_bbox) rpn head会得到对应的proposal def get_bboxes(self,cls_scores, # rpn预测的类别前背景bbox_preds, # rpn预测的bboxscore_factorsNone,img_metasNone,cfgNone, # {nms_pre: 4000, max_per_img: 1000, nms: {type: nms, iou_threshold: 0.7}, min_bbox_size: 0}rescaleFalse,with_nmsTrue,**kwargs):Transform network outputs of a batch into bbox results.Note: When score_factors is not None, the cls_scores areusually multiplied by it then obtain the real score used in NMS,such as CenterNess in FCOS, IoU branch in ATSS.Args:cls_scores (list[Tensor]): Classification scores for allscale levels, each is a 4D-tensor, has shape(batch_size, num_priors * num_classes, H, W).bbox_preds (list[Tensor]): Box energies / deltas for allscale levels, each is a 4D-tensor, has shape(batch_size, num_priors * 4, H, W).score_factors (list[Tensor], Optional): Score factor forall scale level, each is a 4D-tensor, has shape(batch_size, num_priors * 1, H, W). Default None.img_metas (list[dict], Optional): Image meta info. Default None.cfg (mmcv.Config, Optional): Test / postprocessing configuration,if None, test_cfg would be used. Default None.rescale (bool): If True, return boxes in original image space.Default False.with_nms (bool): If True, do nms before return boxes.Default True.Returns:list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.The first item is an (n, 5) tensor, where the first 4 columnsare bounding box positions (tl_x, tl_y, br_x, br_y) and the5-th column is a score between 0 and 1. The second item is a(n,) tensor where each item is the predicted class label ofthe corresponding box.assert len(cls_scores) len(bbox_preds)if score_factors is None:# e.g. Retina, FreeAnchor, Foveabox, etc.with_score_factors Falseelse:# e.g. 
FCOS, PAA, ATSS, AutoAssign, etc.with_score_factors Trueassert len(cls_scores) len(score_factors)num_levels len(cls_scores)featmap_sizes [cls_scores[i].shape[-2:] for i in range(num_levels)]mlvl_priors self.prior_generator.grid_priors(featmap_sizes,dtypecls_scores[0].dtype,devicecls_scores[0].device) # 每一层的anchorresult_list []for img_id in range(len(img_metas)):img_meta img_metas[img_id]cls_score_list select_single_mlvl(cls_scores, img_id)bbox_pred_list select_single_mlvl(bbox_preds, img_id)if with_score_factors:score_factor_list select_single_mlvl(score_factors, img_id)else:score_factor_list [None for _ in range(num_levels)] # [None, None, None, None, None]results self._get_bboxes_single(cls_score_list, bbox_pred_list,score_factor_list, mlvl_priors,img_meta, cfg, rescale, with_nms,**kwargs)result_list.append(results)return result_list # tuple([1000,5],[1000,5])
ROI head
roi head对rpn中的proposal提取roi此处roi通过iou进行匹配 ModuleList( (0): CoStandardRoIHead( (bbox_roi_extractor): SingleRoIExtractor( (roi_layers): ModuleList( (0): RoIAlign(output_size(7, 7), spatial_scale0.125, sampling_ratio0, pool_modeavg, alignedTrue, use_torchvisionFalse) (1): RoIAlign(output_size(7, 7), spatial_scale0.0625, sampling_ratio0, pool_modeavg, alignedTrue, use_torchvisionFalse) (2): RoIAlign(output_size(7, 7), spatial_scale0.03125, sampling_ratio0, pool_modeavg, alignedTrue, use_torchvisionFalse) (3): RoIAlign(output_size(7, 7), spatial_scale0.015625, sampling_ratio0, pool_modeavg, alignedTrue, use_torchvisionFalse) ) ) (bbox_head): Shared2FCBBoxHead( (loss_cls): CrossEntropyLoss(avg_non_ignoreFalse) (loss_bbox): GIoULoss() (fc_cls): Linear(in_features1024, out_features81, biasTrue) (fc_reg): Linear(in_features1024, out_features320, biasTrue) (shared_convs): ModuleList() (shared_fcs): ModuleList( (0): Linear(in_features12544, out_features1024, biasTrue) (1): Linear(in_features1024, out_features1024, biasTrue) ) (cls_convs): ModuleList() (cls_fcs): ModuleList() (reg_convs): ModuleList() (reg_fcs): ModuleList() (relu): ReLU(inplaceTrue) ) init_cfg[{type: Normal, std: 0.01, override: {name: fc_cls}}, {type: Normal, std: 0.001, override: {name: fc_reg}}, {type: Xavier, distribution: uniform, override: [{name: shared_fcs}, {name: cls_fcs}, {name: reg_fcs}]}] ) ) class CoStandardRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin):Simplest base roi head including one bbox head and one mask head.def forward_train(self,x,img_metas,proposal_list,gt_bboxes,gt_labels,gt_bboxes_ignoreNone,gt_masksNone,**kwargs):Args:x (list[Tensor]): list of multi-level img features.img_metas (list[dict]): list of image info dict where each dicthas: img_shape, scale_factor, flip, and may also containfilename, ori_shape, pad_shape, and img_norm_cfg.For details on the values of these keys seemmdet/datasets/pipelines/formatting.py:Collect.proposals (list[Tensors]): list of region proposals.gt_bboxes (list[Tensor]): Ground truth bboxes for each image withshape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels (list[Tensor]): class indices corresponding to each boxgt_bboxes_ignore (None | list[Tensor]): specify which boundingboxes can be ignored when computing the loss.gt_masks (None | Tensor) : true segmentation masks for each boxused if the architecture supports a segmentation task.Returns:dict[str, Tensor]: a dictionary of loss components# assign gts and sample proposalsif self.with_bbox or self.with_mask:num_imgs len(img_metas)if gt_bboxes_ignore is None:gt_bboxes_ignore [None for _ in range(num_imgs)] # [None, None]sampling_results []for i in range(num_imgs):assign_result self.bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],gt_labels[i])sampling_result self.bbox_sampler.sample(assign_result,proposal_list[i],gt_bboxes[i],gt_labels[i],feats[lvl_feat[i][None] for lvl_feat in x])sampling_results.append(sampling_result)losses dict()# bbox head forward and lossif self.with_bbox:bbox_results self._bbox_forward_train(x, sampling_results,gt_bboxes, gt_labels,img_metas)losses.update(bbox_results[loss_bbox])bbox_targets bbox_results[bbox_targets]num_imgs len(img_metas)max_proposal 2000for res in sampling_results:max_proposal min(max_proposal, res.bboxes.shape[0])ori_coords bbox2roi([res.bboxes for res in sampling_results])ori_proposals, ori_labels, ori_bbox_targets, ori_bbox_feats [], [], [], []for i in range(num_imgs):idx (ori_coords[:,0]i).nonzero().squeeze(1)idx 
idx[:max_proposal]ori_proposal ori_coords[idx][:, 1:].unsqueeze(0)ori_label bbox_targets[0][idx].unsqueeze(0)ori_bbox_target bbox_targets[2][idx].unsqueeze(0)ori_bbox_feat bbox_results[bbox_feats].mean(-1).mean(-1)ori_bbox_feat ori_bbox_feat[idx].unsqueeze(0)ori_proposals.append(ori_proposal) ori_labels.append(ori_label)ori_bbox_targets.append(ori_bbox_target)ori_bbox_feats.append(ori_bbox_feat)ori_coords torch.cat(ori_proposals, dim0)ori_labels torch.cat(ori_labels, dim0)ori_bbox_targets torch.cat(ori_bbox_targets, dim0)ori_bbox_feats torch.cat(ori_bbox_feats, dim0)pos_coords (ori_coords, ori_labels, ori_bbox_targets, ori_bbox_feats, rcnn)losses.update(pos_coordspos_coords)# mask head forward and lossif self.with_mask:mask_results self._mask_forward_train(x, sampling_results,bbox_results[bbox_feats],gt_masks, img_metas)losses.update(mask_results[loss_mask])return lossesdef _bbox_forward(self, x, rois):Box head forward function used in both training and testing.# TODO: a more flexible way to decide which feature maps to usebbox_feats self.bbox_roi_extractor(x[:self.bbox_roi_extractor.num_inputs], rois) # x[:4] bbox_feats [1024 256 7 7]if self.with_shared_head:bbox_feats self.shared_head(bbox_feats)cls_score, bbox_pred self.bbox_head(bbox_feats) # cls_score[1024 81] bbox_pred[1024 320]bbox_results dict(cls_scorecls_score, bbox_predbbox_pred, bbox_featsbbox_feats)return bbox_resultsdef _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,img_metas):Run forward function and calculate loss for box head in training.rois bbox2roi([res.bboxes for res in sampling_results])bbox_results self._bbox_forward(x, rois)bbox_targets self.bbox_head.get_targets(sampling_results, gt_bboxes,gt_labels, self.train_cfg)loss_bbox self.bbox_head.loss(bbox_results[cls_score],bbox_results[bbox_pred], rois,*bbox_targets)bbox_results.update(loss_bboxloss_bbox)bbox_results.update(bbox_targetsbbox_targets)return bbox_results
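Unlike the query head, the proposals here are assigned to ground-truth boxes by IoU (MaxIoUAssigner with pos_iou_thr 0.5 in the rcnn train_cfg). A toy version of that assignment, ignoring min_pos_iou, low-quality matching and ignore regions:

import torch

def simple_max_iou_assign(proposals, gt_bboxes, pos_iou_thr=0.5):
    """proposals: [N, 4], gt_bboxes: [M, 4], both xyxy.
    Returns, for each proposal, the index of the matched gt (-1 means background)."""
    lt = torch.max(proposals[:, None, :2], gt_bboxes[None, :, :2])
    rb = torch.min(proposals[:, None, 2:], gt_bboxes[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    area_p = (proposals[:, 2] - proposals[:, 0]) * (proposals[:, 3] - proposals[:, 1])
    area_g = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
    ious = inter / (area_p[:, None] + area_g[None, :] - inter)   # [N, M]

    max_iou, argmax = ious.max(dim=1)
    assigned = torch.full((proposals.size(0),), -1, dtype=torch.long)
    pos = max_iou >= pos_iou_thr
    assigned[pos] = argmax[pos]
    return assigned

proposals = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.], [50., 50., 60., 60.]])
gt_bboxes = torch.tensor([[0., 0., 9., 9.], [48., 48., 62., 62.]])
print(simple_max_iou_assign(proposals, gt_bboxes))   # tensor([ 0, -1,  1])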
RoIAlign is then applied to the extracted RoI regions:
class SingleRoIExtractor(BaseRoIExtractor):Extract RoI features from a single level feature map.If there are multiple input feature levels, each RoI is mapped to a levelaccording to its scale. The mapping rule is proposed inFPN https://arxiv.org/abs/1612.03144_.Args:roi_layer (dict): Specify RoI layer type and arguments.out_channels (int): Output channels of RoI layers.featmap_strides (List[int]): Strides of input feature maps.finest_scale (int): Scale threshold of mapping to level 0. Default: 56.init_cfg (dict or list[dict], optional): Initialization config dict.Default: Nonedef __init__(self,roi_layer,out_channels,featmap_strides,finest_scale56,init_cfgNone):super(SingleRoIExtractor, self).__init__(roi_layer, out_channels,featmap_strides, init_cfg)self.finest_scale finest_scaledef map_roi_levels(self, rois, num_levels):Map rois to corresponding feature levels by scales.- scale finest_scale * 2: level 0- finest_scale * 2 scale finest_scale * 4: level 1- finest_scale * 4 scale finest_scale * 8: level 2- scale finest_scale * 8: level 3Args:rois (Tensor): Input RoIs, shape (k, 5).num_levels (int): Total level number.Returns:Tensor: Level index (0-based) of each RoI, shape (k, )scale torch.sqrt((rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])) # roi区域面积的0.5次方target_lvls torch.floor(torch.log2(scale / self.finest_scale 1e-6))target_lvls target_lvls.clamp(min0, maxnum_levels - 1).long() # 限制在0-3的范围内return target_lvlsforce_fp32(apply_to(feats, ), out_fp16True)def forward(self, feats, rois, roi_scale_factorNone):Forward function.out_size self.roi_layers[0].output_sizenum_levels len(feats) # feats为memory reshape后前四层的特征图expand_dims (-1, self.out_channels * out_size[0] * out_size[1])if torch.onnx.is_in_onnx_export():# Work around to export mask-rcnn to onnxroi_feats rois[:, :1].clone().detach()roi_feats roi_feats.expand(*expand_dims)roi_feats roi_feats.reshape(-1, self.out_channels, *out_size)roi_feats roi_feats * 0else:roi_feats feats[0].new_zeros(rois.size(0), self.out_channels, *out_size) # 创建roi feat [1024 256 7 7]# TODO: remove this when parrots supportsif torch.__version__ parrots:roi_feats.requires_grad Trueif num_levels 1:if len(rois) 0:return roi_featsreturn self.roi_layers[0](feats[0], rois)target_lvls self.map_roi_levels(rois, num_levels)if roi_scale_factor is not None:rois self.roi_rescale(rois, roi_scale_factor)for i in range(num_levels):mask target_lvls iif torch.onnx.is_in_onnx_export():# To keep all roi_align nodes exported to onnx# and skip nonzero opmask mask.float().unsqueeze(-1)# select target level rois and reset the rest rois to zero.rois_i rois.clone().detach()rois_i rois_i * maskmask_exp mask.expand(*expand_dims).reshape(roi_feats.shape)roi_feats_t self.roi_layers[i](feats[i], rois_i)roi_feats_t roi_feats_t * mask_exproi_feats roi_feats roi_feats_tcontinueinds mask.nonzero(as_tupleFalse).squeeze(1)if inds.numel() 0:rois_ rois[inds]roi_feats_t self.roi_layers[i](feats[i], rois_) # roialignroi_feats[inds] roi_feats_telse:# Sometimes some pyramid levels will not be used for RoI# feature extraction and this will cause an incomplete# computation graph in one GPU, which is different from those# in other GPUs and will cause a hanging error.# Therefore, we add it to ensure each feature pyramid is# included in the computation graph to avoid runtime bugs.roi_feats roi_feats sum(x.view(-1)[0]for x in self.parameters()) * 0. feats[i].sum() * 0.return roi_feats
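The level-mapping rule in map_roi_levels above is easy to check in isolation. A minimal standalone sketch; note it uses the class default finest_scale=56, whereas the config overrides it to 112:

import torch

def map_roi_levels(rois, num_levels, finest_scale=56):
    # rois: (k, 5) as [batch_idx, x1, y1, x2, y2]
    # scale = sqrt(w * h); level = floor(log2(scale / finest_scale + 1e-6)), clamped to [0, num_levels - 1]
    scale = torch.sqrt(
        (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]))
    target_lvls = torch.floor(torch.log2(scale / finest_scale + 1e-6))
    return target_lvls.clamp(min=0, max=num_levels - 1).long()

rois = torch.tensor([[0., 0., 0., 56., 56.],     # scale 56  -> level 0
                     [0., 0., 0., 224., 224.],   # scale 224 -> level 2
                     [0., 0., 0., 900., 900.]])  # scale 900 -> clamped to level 3
print(map_roi_levels(rois, num_levels=4))        # tensor([0, 2, 3])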
The RoI head thus outputs pos_coords = (ori_coords, ori_labels, ori_bbox_targets, ori_bbox_feats, 'rcnn').
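Reading the forward_train code above, the tuple can be pictured with dummy tensors as below. The shapes are my reading of the code (each image truncated to max_proposal proposals, 256-channel RoI features), not something the code prints:

import torch

bs, max_proposal, c = 2, 512, 256  # example sizes; max_proposal is the per-image proposal cap
pos_coords = (
    torch.zeros(bs, max_proposal, 4),       # ori_coords: sampled proposal boxes (batch-index column stripped)
    torch.zeros(bs, max_proposal).long(),   # ori_labels: class targets from bbox_targets[0]
    torch.zeros(bs, max_proposal, 4),       # ori_bbox_targets: regression targets from bbox_targets[2]
    torch.zeros(bs, max_proposal, c),       # ori_bbox_feats: mean-pooled RoI features
    'rcnn',                                 # tag telling the query head which branch produced them
)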
ATSS head
# ATSS head
for i in range(len(self.bbox_head)):
    bbox_losses = self.bbox_head[i].forward_train(x, img_metas, gt_bboxes,
                                                  gt_labels, gt_bboxes_ignore)
    if self.with_pos_coord:
        # pos_coords = (ori_anchors, ori_labels, ori_bbox_targets, 'atss')
        pos_coords = bbox_losses.pop('pos_coords')
        positive_coords.append(pos_coords)
    else:
        if 'pos_coords' in bbox_losses.keys():
            tmp = bbox_losses.pop('pos_coords')
    bbox_losses = upd_loss(bbox_losses, idx=i + len(self.roi_head))
    losses.update(bbox_losses)

CoATSSHead(
  (loss_cls): FocalLoss()
  (loss_bbox): GIoULoss()
  (relu): ReLU(inplace=True)
  (cls_convs): ModuleList(
    (0): ConvModule(
      (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (gn): GroupNorm(32, 256, eps=1e-05, affine=True)
      (activate): ReLU(inplace=True)
    )
  )
  (reg_convs): ModuleList(
    (0): ConvModule(
      (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (gn): GroupNorm(32, 256, eps=1e-05, affine=True)
      (activate): ReLU(inplace=True)
    )
  )
  (atss_cls): Conv2d(256, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (atss_reg): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (atss_centerness): Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (scales): ModuleList(
    (0): Scale()
    (1): Scale()
    (2): Scale()
    (3): Scale()
    (4): Scale()
  )
  (loss_centerness): CrossEntropyLoss(avg_non_ignore=False)
)
init_cfg={'type': 'Normal', 'layer': 'Conv2d', 'std': 0.01, 'override': {'type': 'Normal', 'name': 'atss_cls', 'std': 0.01, 'bias_prob': 0.01}}

Conceptually, ATSS works as follows: on every feature level it computes the L2 distance between each anchor center and the GT center, takes the top-k anchors with the smallest center distance on each level as candidates, and then sets the IoU threshold dynamically (mean plus standard deviation of the candidates' IoUs with the GT) to decide which candidates become positive samples. A minimal sketch follows.
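A stripped-down sketch of that selection rule (single GT, one anchor tensor per level, topk=9; the real ATSSAssigner additionally requires candidate centers to lie inside the GT box, which is omitted here):

import torch
from torchvision.ops import box_iou  # IoU helper from torchvision

def atss_assign_single_gt(anchors_per_level, gt_box, topk=9):
    # anchors_per_level: list of (N_l, 4) anchor boxes, one tensor per FPN level
    # gt_box: (4,) ground-truth box in [x1, y1, x2, y2]
    gt_cxcy = torch.stack([(gt_box[0] + gt_box[2]) / 2, (gt_box[1] + gt_box[3]) / 2])
    candidates = []
    for anchors in anchors_per_level:
        cxcy = torch.stack([(anchors[:, 0] + anchors[:, 2]) / 2,
                            (anchors[:, 1] + anchors[:, 3]) / 2], dim=1)
        dist = (cxcy - gt_cxcy).pow(2).sum(dim=1).sqrt()  # L2 center distance
        k = min(topk, anchors.size(0))
        candidates.append(anchors[dist.topk(k, largest=False).indices])  # k closest anchors per level
    candidates = torch.cat(candidates, dim=0)
    ious = box_iou(candidates, gt_box[None]).squeeze(1)
    iou_thr = ious.mean() + ious.std()                    # dynamic IoU threshold
    return candidates[ious >= iou_thr]                    # positives for this GT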
class CoATSSHead(AnchorHead):Bridging the Gap Between Anchor-based and Anchor-free Detection viaAdaptive Training Sample Selection.ATSS head structure is similar with FCOS, however ATSS use anchor boxesand assign label by Adaptive Training Sample Selection instead max-iou.https://arxiv.org/abs/1912.02424def __init__(self,num_classes,in_channels,stacked_convs4,conv_cfgNone,norm_cfgdict(typeGN, num_groups32, requires_gradTrue),reg_decoded_bboxTrue,loss_centernessdict(typeCrossEntropyLoss,use_sigmoidTrue,loss_weight1.0),init_cfgdict(typeNormal,layerConv2d,std0.01,overridedict(typeNormal,nameatss_cls,std0.01,bias_prob0.01)),**kwargs):self.stacked_convs stacked_convsself.conv_cfg conv_cfgself.norm_cfg norm_cfgsuper(CoATSSHead, self).__init__(num_classes,in_channels,reg_decoded_bboxreg_decoded_bbox,init_cfginit_cfg,**kwargs)self.sampling Falseif self.train_cfg:self.assigner build_assigner(self.train_cfg.assigner)# SSD samplingFalse so use PseudoSamplersampler_cfg dict(typePseudoSampler)self.sampler build_sampler(sampler_cfg, contextself)self.loss_centerness build_loss(loss_centerness)def _init_layers(self):Initialize layers of the head.self.relu nn.ReLU(inplaceTrue)self.cls_convs nn.ModuleList()self.reg_convs nn.ModuleList()for i in range(self.stacked_convs):chn self.in_channels if i 0 else self.feat_channelsself.cls_convs.append(ConvModule(chn,self.feat_channels,3,stride1,padding1,conv_cfgself.conv_cfg,norm_cfgself.norm_cfg))self.reg_convs.append(ConvModule(chn,self.feat_channels,3,stride1,padding1,conv_cfgself.conv_cfg,norm_cfgself.norm_cfg))self.atss_cls nn.Conv2d(self.feat_channels,self.num_anchors * self.cls_out_channels,3,padding1)self.atss_reg nn.Conv2d(self.feat_channels, self.num_base_priors * 4, 3, padding1)self.atss_centerness nn.Conv2d(self.feat_channels, self.num_base_priors * 1, 3, padding1)self.scales nn.ModuleList([Scale(1.0) for _ in self.prior_generator.strides])def forward(self, feats):Forward features from the upstream network.Args:feats (tuple[Tensor]): Features from the upstream network, each isa 4D-tensor.Returns:tuple: Usually a tuple of classification scores and bbox predictioncls_scores (list[Tensor]): Classification scores for all scalelevels, each is a 4D-tensor, the channels number isnum_anchors * num_classes.bbox_preds (list[Tensor]): Box energies / deltas for all scalelevels, each is a 4D-tensor, the channels number isnum_anchors * 4.return multi_apply(self.forward_single, feats, self.scales)def forward_single(self, x, scale):Forward feature of a single scale level.Args:x (Tensor): Features of a single scale level.scale (:obj: mmcv.cnn.Scale): Learnable scale module to resizethe bbox prediction.Returns:tuple:cls_score (Tensor): Cls scores for a single scale levelthe channels number is num_anchors * num_classes.bbox_pred (Tensor): Box energies / deltas for a single scalelevel, the channels number is num_anchors * 4.centerness (Tensor): Centerness for a single scale level, thechannel number is (N, num_anchors * 1, H, W).cls_feat xreg_feat xfor cls_conv in self.cls_convs:cls_feat cls_conv(cls_feat)for reg_conv in self.reg_convs:reg_feat reg_conv(reg_feat)cls_score self.atss_cls(cls_feat) # [N 80 H W]# we just follow atss, not apply exp in bbox_predbbox_pred scale(self.atss_reg(reg_feat)).float() # [N 4 H W]centerness self.atss_centerness(reg_feat) # [N 1 H W]return cls_score, bbox_pred, centernessdef loss_single(self, anchors, cls_score, bbox_pred, centerness, labels,label_weights, bbox_targets, img_metas, num_total_samples):Compute loss of a single scale 
level.Args:cls_score (Tensor): Box scores for each scale levelHas shape (N, num_anchors * num_classes, H, W).bbox_pred (Tensor): Box energies / deltas for each scalelevel with shape (N, num_anchors * 4, H, W).anchors (Tensor): Box reference for each scale level with shape(N, num_total_anchors, 4).labels (Tensor): Labels of each anchors with shape(N, num_total_anchors).label_weights (Tensor): Label weights of each anchor with shape(N, num_total_anchors)bbox_targets (Tensor): BBox regression targets of each anchorweight shape (N, num_total_anchors, 4).num_total_samples (int): Number os positive samples that isreduced over all GPUs.Returns:dict[str, Tensor]: A dictionary of loss components.anchors anchors.reshape(-1, 4)cls_score cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels).contiguous()bbox_pred bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)centerness centerness.permute(0, 2, 3, 1).reshape(-1)bbox_targets bbox_targets.reshape(-1, 4)labels labels.reshape(-1)label_weights label_weights.reshape(-1)# classification lossloss_cls self.loss_cls(cls_score, labels, label_weights, avg_factornum_total_samples)# FG cat_id: [0, num_classes -1], BG cat_id: num_classesbg_class_ind self.num_classespos_inds ((labels 0) (labels bg_class_ind)).nonzero().squeeze(1)if len(pos_inds) 0:pos_bbox_targets bbox_targets[pos_inds]pos_bbox_pred bbox_pred[pos_inds]pos_anchors anchors[pos_inds]pos_centerness centerness[pos_inds]centerness_targets self.centerness_target(pos_anchors, pos_bbox_targets)pos_decode_bbox_pred self.bbox_coder.decode(pos_anchors, pos_bbox_pred)# regression lossloss_bbox self.loss_bbox(pos_decode_bbox_pred,pos_bbox_targets,weightcenterness_targets,avg_factor1.0)# centerness lossloss_centerness self.loss_centerness(pos_centerness,centerness_targets,avg_factornum_total_samples)else:loss_bbox bbox_pred.sum() * 0loss_centerness centerness.sum() * 0centerness_targets bbox_targets.new_tensor(0.)return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum()force_fp32(apply_to(cls_scores, bbox_preds, centernesses))def loss(self,cls_scores,bbox_preds,centernesses,gt_bboxes,gt_labels,img_metas,gt_bboxes_ignoreNone):Compute losses of the head.Args:cls_scores (list[Tensor]): Box scores for each scale levelHas shape (N, num_anchors * num_classes, H, W)bbox_preds (list[Tensor]): Box energies / deltas for each scalelevel with shape (N, num_anchors * 4, H, W)centernesses (list[Tensor]): Centerness for each scalelevel with shape (N, num_anchors * 1, H, W)gt_bboxes (list[Tensor]): Ground truth bboxes for each image withshape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.gt_labels (list[Tensor]): class indices corresponding to each boximg_metas (list[dict]): Meta information of each image, e.g.,image size, scaling factor, etc.gt_bboxes_ignore (list[Tensor] | None): specify which boundingboxes can be ignored when computing the loss.Returns:dict[str, Tensor]: A dictionary of loss components.featmap_sizes [featmap.size()[-2:] for featmap in cls_scores] # 得到每层feature map的尺寸assert len(featmap_sizes) self.prior_generator.num_levelsdevice cls_scores[0].deviceanchor_list, valid_flag_list self.get_anchors(featmap_sizes, img_metas, devicedevice)label_channels self.cls_out_channels if self.use_sigmoid_cls else 1cls_reg_targets self.get_targets(anchor_list,valid_flag_list,gt_bboxes,img_metas,gt_bboxes_ignore_listgt_bboxes_ignore,gt_labels_listgt_labels,label_channelslabel_channels)if cls_reg_targets is None:return None(anchor_list, labels_list, label_weights_list, bbox_targets_list,bbox_weights_list, 
num_total_pos, num_total_neg,ori_anchors, ori_labels, ori_bbox_targets) cls_reg_targetsnum_total_samples reduce_mean(torch.tensor(num_total_pos, dtypetorch.float,devicedevice)).item()num_total_samples max(num_total_samples, 1.0)new_img_metas [img_metas for _ in range(len(anchor_list))]losses_cls, losses_bbox, loss_centerness,\bbox_avg_factor multi_apply(self.loss_single,anchor_list,cls_scores,bbox_preds,centernesses,labels_list,label_weights_list,bbox_targets_list,new_img_metas,num_total_samplesnum_total_samples)bbox_avg_factor sum(bbox_avg_factor)bbox_avg_factor reduce_mean(bbox_avg_factor).clamp_(min1).item()losses_bbox list(map(lambda x: x / bbox_avg_factor, losses_bbox))pos_coords (ori_anchors, ori_labels, ori_bbox_targets, atss)return dict(loss_clslosses_cls,loss_bboxlosses_bbox,loss_centernessloss_centerness,pos_coordspos_coords)def centerness_target(self, anchors, gts):# only calculate pos centerness targets, otherwise there may be nananchors_cx (anchors[:, 2] anchors[:, 0]) / 2anchors_cy (anchors[:, 3] anchors[:, 1]) / 2l_ anchors_cx - gts[:, 0]t_ anchors_cy - gts[:, 1]r_ gts[:, 2] - anchors_cxb_ gts[:, 3] - anchors_cyleft_right torch.stack([l_, r_], dim1)top_bottom torch.stack([t_, b_], dim1)centerness torch.sqrt((left_right.min(dim-1)[0] / left_right.max(dim-1)[0]) *(top_bottom.min(dim-1)[0] / top_bottom.max(dim-1)[0]))assert not torch.isnan(centerness).any()return centernessdef get_targets(self,anchor_list,valid_flag_list,gt_bboxes_list,img_metas,gt_bboxes_ignore_listNone,gt_labels_listNone,label_channels1,unmap_outputsTrue):Get targets for ATSS head.This method is almost the same as AnchorHead.get_targets(). Besidesreturning the targets as the parent method does, it also returns theanchors as the first element of the returned tuple.num_imgs len(img_metas)assert len(anchor_list) len(valid_flag_list) num_imgs# anchor number of multi levelsnum_level_anchors [anchors.size(0) for anchors in anchor_list[0]]num_level_anchors_list [num_level_anchors] * num_imgs# concat all level anchors and flags to a single tensorfor i in range(num_imgs):assert len(anchor_list[i]) len(valid_flag_list[i])anchor_list[i] torch.cat(anchor_list[i])valid_flag_list[i] torch.cat(valid_flag_list[i])# compute targets for each imageif gt_bboxes_ignore_list is None:gt_bboxes_ignore_list [None for _ in range(num_imgs)]if gt_labels_list is None:gt_labels_list [None for _ in range(num_imgs)](all_anchors, all_labels, all_label_weights, all_bbox_targets,all_bbox_weights, pos_inds_list, neg_inds_list) multi_apply(self._get_target_single,anchor_list,valid_flag_list,num_level_anchors_list,gt_bboxes_list,gt_bboxes_ignore_list,gt_labels_list,img_metas,label_channelslabel_channels,unmap_outputsunmap_outputs)# no valid anchorsif any([labels is None for labels in all_labels]):return None# sampled anchors of all imagesnum_total_pos sum([max(inds.numel(), 1) for inds in pos_inds_list])num_total_neg sum([max(inds.numel(), 1) for inds in neg_inds_list])# split targets to a list w.r.t. 
multiple levelsori_anchors all_anchorsori_labels all_labelsori_bbox_targets all_bbox_targetsanchors_list images_to_levels(all_anchors, num_level_anchors)labels_list images_to_levels(all_labels, num_level_anchors)label_weights_list images_to_levels(all_label_weights,num_level_anchors)bbox_targets_list images_to_levels(all_bbox_targets,num_level_anchors)bbox_weights_list images_to_levels(all_bbox_weights,num_level_anchors)return (anchors_list, labels_list, label_weights_list,bbox_targets_list, bbox_weights_list, num_total_pos,num_total_neg, ori_anchors, ori_labels, ori_bbox_targets)def _get_target_single(self,flat_anchors,valid_flags,num_level_anchors,gt_bboxes,gt_bboxes_ignore,gt_labels,img_meta,label_channels1,unmap_outputsTrue):Compute regression, classification targets for anchors in a singleimage.Args:flat_anchors (Tensor): Multi-level anchors of the image, which areconcatenated into a single tensor of shape (num_anchors ,4)valid_flags (Tensor): Multi level valid flags of the image,which are concatenated into a single tensor ofshape (num_anchors,).num_level_anchors Tensor): Number of anchors of each scale level.gt_bboxes (Tensor): Ground truth bboxes of the image,shape (num_gts, 4).gt_bboxes_ignore (Tensor): Ground truth bboxes to beignored, shape (num_ignored_gts, 4).gt_labels (Tensor): Ground truth labels of each box,shape (num_gts,).img_meta (dict): Meta info of the image.label_channels (int): Channel of label.unmap_outputs (bool): Whether to map outputs back to the originalset of anchors.Returns:tuple: N is the number of total anchors in the image.labels (Tensor): Labels of all anchors in the image with shape(N,).label_weights (Tensor): Label weights of all anchor in theimage with shape (N,).bbox_targets (Tensor): BBox targets of all anchors in theimage with shape (N, 4).bbox_weights (Tensor): BBox weights of all anchors in theimage with shape (N, 4)pos_inds (Tensor): Indices of positive anchor with shape(num_pos,).neg_inds (Tensor): Indices of negative anchor with shape(num_neg,).inside_flags anchor_inside_flags(flat_anchors, valid_flags,img_meta[img_shape][:2],self.train_cfg.allowed_border)if not inside_flags.any():return (None, ) * 7# assign gt and sample anchorsanchors flat_anchors[inside_flags, :]num_level_anchors_inside self.get_num_level_anchors_inside(num_level_anchors, inside_flags) # 每层特征图上对应的有效anchor的数量assign_result self.assigner.assign(anchors, num_level_anchors_inside,gt_bboxes, gt_bboxes_ignore,gt_labels)sampling_result self.sampler.sample(assign_result, anchors,gt_bboxes)num_valid_anchors anchors.shape[0]bbox_targets torch.zeros_like(anchors)bbox_weights torch.zeros_like(anchors)labels anchors.new_full((num_valid_anchors, ),self.num_classes,dtypetorch.long)label_weights anchors.new_zeros(num_valid_anchors, dtypetorch.float)pos_inds sampling_result.pos_indsneg_inds sampling_result.neg_indsif len(pos_inds) 0:if self.reg_decoded_bbox:pos_bbox_targets sampling_result.pos_gt_bboxeselse:pos_bbox_targets self.bbox_coder.encode(sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)bbox_targets[pos_inds, :] pos_bbox_targetsbbox_weights[pos_inds, :] 1.0if gt_labels is None:# Only rpn gives gt_labels as None# Foreground is the first class since v2.5.0labels[pos_inds] 0else:labels[pos_inds] gt_labels[sampling_result.pos_assigned_gt_inds]if self.train_cfg.pos_weight 0:label_weights[pos_inds] 1.0else:label_weights[pos_inds] self.train_cfg.pos_weightif len(neg_inds) 0:label_weights[neg_inds] 1.0# map up to original set of anchorsif unmap_outputs:num_total_anchors 
flat_anchors.size(0)anchors unmap(anchors, num_total_anchors, inside_flags)labels unmap(labels, num_total_anchors, inside_flags, fillself.num_classes)label_weights unmap(label_weights, num_total_anchors,inside_flags)bbox_targets unmap(bbox_targets, num_total_anchors, inside_flags)bbox_weights unmap(bbox_weights, num_total_anchors, inside_flags)return (anchors, labels, label_weights, bbox_targets, bbox_weights,pos_inds, neg_inds)def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):split_inside_flags torch.split(inside_flags, num_level_anchors)num_level_anchors_inside [int(flags.sum()) for flags in split_inside_flags]return num_level_anchors_inside # 确定每层特征图上对应的有效anchor的数量def forward_train(self,x,img_metas,gt_bboxes,gt_labelsNone,gt_bboxes_ignoreNone,proposal_cfgNone,**kwargs):Args:x (list[Tensor]): Features from FPN.img_metas (list[dict]): Meta information of each image, e.g.,image size, scaling factor, etc.gt_bboxes (Tensor): Ground truth bboxes of the image,shape (num_gts, 4).gt_labels (Tensor): Ground truth labels of each box,shape (num_gts,).gt_bboxes_ignore (Tensor): Ground truth bboxes to beignored, shape (num_ignored_gts, 4).proposal_cfg (mmcv.Config): Test / postprocessing configuration,if None, test_cfg would be usedReturns:tuple:losses: (dict[str, Tensor]): A dictionary of loss components.proposal_list (list[Tensor]): Proposals of each image.outs self(x) # tuple [0]存放五层feature map得到的CLS [1]存放五层feature map得到的BBOX [2]存放五层feature map得到的CENTERif gt_labels is None:loss_inputs outs (gt_bboxes, img_metas)else:loss_inputs outs (gt_bboxes, gt_labels, img_metas)losses self.loss(*loss_inputs, gt_bboxes_ignoregt_bboxes_ignore)if proposal_cfg is None:return losseselse:proposal_list self.get_bboxes(*outs, img_metasimg_metas, cfgproposal_cfg)return losses, proposal_list
Auxiliary head
The auxiliary head encodes the positive predictions produced by the Faster R-CNN and ATSS heads into extra queries, feeds them into the decoder, and computes a loss on them.
# aux headif self.with_pos_coord and len(positive_coords)0:for i in range(len(positive_coords)):bbox_losses self.query_head.forward_train_aux(x, img_metas, gt_bboxes,gt_labels, gt_bboxes_ignore, positive_coords[i], i)bbox_losses upd_loss(bbox_losses, idxi)losses.update(bbox_losses) def forward_aux(self, mlvl_feats, img_metas, aux_targets, head_idx):Forward function.Args:mlvl_feats (tuple[Tensor]): Features from the upstreamnetwork, each is a 4D-tensor with shape(N, C, H, W).img_metas (list[dict]): List of image information.Returns:all_cls_scores (Tensor): Outputs from the classification head, \shape [nb_dec, bs, num_query, cls_out_channels]. Note \cls_out_channels should includes background.all_bbox_preds (Tensor): Sigmoid outputs from the regression \head with normalized coordinate format (cx, cy, w, h). \Shape [nb_dec, bs, num_query, 4].enc_outputs_class (Tensor): The score of each point on encode \feature map, has shape (N, h*w, num_class). Only when \as_two_stage is True it would be returned, otherwise \None would be returned.enc_outputs_coord (Tensor): The proposal generate from the \encode feature map, has shape (N, h*w, 4). Only when \as_two_stage is True it would be returned, otherwise \None would be returned.aux_coords, aux_labels, aux_targets, aux_label_weights, aux_bbox_weights, aux_feats, attn_masks aux_targetsbatch_size mlvl_feats[0].size(0)input_img_h, input_img_w img_metas[0][batch_input_shape]img_masks mlvl_feats[0].new_ones((batch_size, input_img_h, input_img_w))for img_id in range(batch_size):img_h, img_w, _ img_metas[img_id][img_shape]img_masks[img_id, :img_h, :img_w] 0mlvl_masks []mlvl_positional_encodings []for feat in mlvl_feats:mlvl_masks.append(F.interpolate(img_masks[None],sizefeat.shape[-2:]).to(torch.bool).squeeze(0))mlvl_positional_encodings.append(self.positional_encoding(mlvl_masks[-1]))query_embeds Nonehs, init_reference, inter_references self.transformer.forward_aux(mlvl_feats,mlvl_masks,query_embeds,mlvl_positional_encodings,aux_coords,pos_featsaux_feats,reg_branchesself.reg_branches if self.with_box_refine else None, # noqa:E501cls_branchesself.cls_branches if self.as_two_stage else None, # noqa:E501return_encoder_outputTrue,attn_masksattn_masks,head_idxhead_idx)hs hs.permute(0, 2, 1, 3)outputs_classes []outputs_coords []for lvl in range(hs.shape[0]):if lvl 0:reference init_referenceelse:reference inter_references[lvl - 1]reference inverse_sigmoid(reference)outputs_class self.cls_branches[lvl](hs[lvl])tmp self.reg_branches[lvl](hs[lvl])if reference.shape[-1] 4:tmp referenceelse:assert reference.shape[-1] 2tmp[..., :2] referenceoutputs_coord tmp.sigmoid()outputs_classes.append(outputs_class)outputs_coords.append(outputs_coord)outputs_classes torch.stack(outputs_classes)outputs_coords torch.stack(outputs_coords)return outputs_classes, outputs_coords, \None, None 其中transformer的部分 def forward_aux(self,mlvl_feats,mlvl_masks,query_embed,mlvl_pos_embeds,pos_anchors,pos_featsNone,reg_branchesNone,cls_branchesNone,return_encoder_outputFalse,attn_masksNone,head_idx0,**kwargs):feat_flatten []mask_flatten []spatial_shapes []for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):bs, c, h, w feat.shapespatial_shape (h, w)spatial_shapes.append(spatial_shape)feat feat.flatten(2).transpose(1, 2)mask mask.flatten(1)feat_flatten.append(feat)mask_flatten.append(mask)feat_flatten torch.cat(feat_flatten, 1)mask_flatten torch.cat(mask_flatten, 1)spatial_shapes torch.as_tensor(spatial_shapes, dtypetorch.long, 
devicefeat_flatten.device)level_start_index torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))valid_ratios torch.stack([self.get_valid_ratio(m) for m in mlvl_masks], 1)feat_flatten feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims)memory feat_flattenmemory memory.permute(1, 0, 2)bs, _, c memory.shapetopk pos_anchors.shape[1]topk_coords_unact inverse_sigmoid((pos_anchors))reference_points pos_anchorsinit_reference_out reference_pointsif self.num_co_heads 0:pos_trans_out self.aux_pos_trans_norm[head_idx](self.aux_pos_trans[head_idx](self.get_proposal_pos_embed(topk_coords_unact)))query_pos, query torch.split(pos_trans_out, c, dim2)if self.with_coord_feat:query query self.pos_feats_norm[head_idx](self.pos_feats_trans[head_idx](pos_feats))query_pos query_pos self.head_pos_embed.weight[head_idx]# decoderquery query.permute(1, 0, 2)memory memory.permute(1, 0, 2)query_pos query_pos.permute(1, 0, 2)inter_states, inter_references self.decoder(queryquery,keyNone,valuememory,query_posquery_pos,key_padding_maskmask_flatten,reference_pointsreference_points,spatial_shapesspatial_shapes,level_start_indexlevel_start_index,valid_ratiosvalid_ratios,reg_branchesreg_branches,attn_masksattn_masks,**kwargs)inter_references_out inter_referencesreturn inter_states, init_reference_out, \inter_references_out
The features used here are the encoder outputs from the main forward pass (the reshaped memory).
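To make the query construction concrete, here is a stripped-down sketch of how the positive coordinates become decoder queries, assuming pos_anchors are boxes in normalized cx/cy/w/h; the pos_trans / pos_trans_norm layers only mimic the real aux_pos_trans / aux_pos_trans_norm modules:

import math
import torch
import torch.nn as nn

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))

def proposal_pos_embed(coords_unact, num_feats=128, temperature=10000):
    # sine/cosine embedding of the 4 box coordinates -> (bs, n, 4 * num_feats)
    scale = 2 * math.pi
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * (dim_t // 2) / num_feats)
    pos = coords_unact.sigmoid() * scale
    pos = pos[..., None] / dim_t
    pos = torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1)
    return pos.flatten(2)

embed_dims = 256
pos_trans = nn.Linear(embed_dims * 2, embed_dims * 2)  # stands in for aux_pos_trans[head_idx]
pos_trans_norm = nn.LayerNorm(embed_dims * 2)          # stands in for aux_pos_trans_norm[head_idx]

pos_anchors = torch.rand(2, 300, 4)                    # positive boxes, normalized cxcywh
pos_trans_out = pos_trans_norm(pos_trans(proposal_pos_embed(inverse_sigmoid(pos_anchors))))
query_pos, query = torch.split(pos_trans_out, embed_dims, dim=2)  # each (2, 300, 256), fed to the decoder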
When computing the loss, Hungarian matching is no longer needed, because these queries were built from already-assigned predictions, so each query comes with its own target.

    def loss_single_aux(self,
                        cls_scores,
                        bbox_preds,
                        labels,
                        label_weights,
                        bbox_targets,
                        bbox_weights,
                        img_metas,
                        gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape [bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        num_imgs = cls_scores.size(0)
        num_q = cls_scores.size(1)
        try:
            labels = labels.reshape(num_imgs * num_q)
            label_weights = label_weights.reshape(num_imgs * num_q)
            bbox_targets = bbox_targets.reshape(num_imgs * num_q, 4)
            bbox_weights = bbox_weights.reshape(num_imgs * num_q, 4)
        except:
            return cls_scores.mean() * 0, cls_scores.mean() * 0, cls_scores.mean() * 0

        bg_class_ind = self.num_classes
        num_total_pos = len(
            ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1))
        num_total_neg = num_imgs * num_q - num_total_pos

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regresses the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating the IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, by default GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls * self.lambda_1, loss_bbox * self.lambda_1, loss_iou * self.lambda_1
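To make the no-matching point concrete: the aux targets were generated together with the aux queries, so prediction i is scored directly against target i. A toy sketch with plain PyTorch losses (F.cross_entropy / F.l1_loss stand in for the configured FocalLoss / L1Loss; the extra background logit exists only for this toy cross-entropy, the real head uses a sigmoid focal loss with num_classes logits):

import torch
import torch.nn.functional as F

bs, num_q, num_classes = 2, 300, 80
cls_scores = torch.randn(bs, num_q, num_classes + 1)        # decoder classification logits (+1 bg for the toy)
bbox_preds = torch.rand(bs, num_q, 4)                       # normalized cxcywh predictions
labels = torch.randint(0, num_classes + 1, (bs, num_q))     # per-query targets; num_classes means background
bbox_targets = torch.rand(bs, num_q, 4)

# one-to-one: query i is compared with target i, no bipartite matching step
loss_cls = F.cross_entropy(cls_scores.flatten(0, 1), labels.flatten())
pos = labels.flatten() < num_classes                        # only positives contribute to the box loss
loss_bbox = F.l1_loss(bbox_preds.flatten(0, 1)[pos], bbox_targets.flatten(0, 1)[pos])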