在nuScenes数据集上,
Results and Models
Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
---|---|---|---|---|---|---|---|
R-50 | DETR | 150e | 7.9 | - | 40.1 | config | model \| log |
我们先看检测器
/mmdetection-2.28.2/mmdet/models/detectors/detr.py
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None):
    """Run the detector forward and compute training losses.

    Args:
        img (Tensor): Input images of shape (N, C, H, W).
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): A list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
            For details on the values of these keys see
            :class:`mmdet.datasets.pipelines.Collect`.
        gt_bboxes (list[Tensor]): Each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
            boxes can be ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    # Call the *grandparent* forward_train (skipping SingleStageDetector's
    # own implementation) by passing SingleStageDetector to super().
    super(SingleStageDetector, self).forward_train(img, img_metas)
    # Extract backbone (+ neck) features.
    x = self.extract_feat(img)
    # Delegate loss computation to the DETR head.
    losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                          gt_labels, gt_bboxes_ignore)
    return losses
具体的代码
def forward_single(self, x, img_metas):
    """Forward a single feature level through the DETR head.

    Args:
        x (Tensor): Backbone feature map of shape [bs, c, h, w].
        img_metas (list[dict]): Per-image meta info; this method reads
            'batch_input_shape' and 'img_shape'.

    Returns:
        tuple[Tensor, Tensor]:
            - all_cls_scores: [num_layer, bs, num_query, num_cls + 1]
            - all_bbox_preds: [num_layer, bs, num_query, 4], sigmoid-normalized.
    """
    # Construct binary masks used by the transformer.
    # NOTE: following the official DETR repo, non-zero values represent
    # ignored (padded) positions, while zero values mean valid positions.
    batch_size = x.size(0)
    input_img_h, input_img_w = img_metas[0]['batch_input_shape']
    masks = x.new_ones((batch_size, input_img_h, input_img_w))  # all 1 initially
    for img_id in range(batch_size):
        # 'img_shape' is the per-image size after Resize (before batch padding).
        img_h, img_w, _ = img_metas[img_id]['img_shape']
        masks[img_id, :img_h, :img_w] = 0  # valid pixels -> 0, padding stays 1
    # 1x1 conv projecting backbone channels down to the embedding dim.
    x = self.input_proj(x)  # [bs, embed_dim, h, w]
    # Interpolate masks to the same spatial shape as x.
    masks = F.interpolate(
        masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
    # Position encoding; per the config this is SinePositionalEncoding.
    pos_embed = self.positional_encoding(masks)  # [bs, embed_dim, h, w]
    # Per the config, self.transformer is class Transformer.
    outs_dec, _ = self.transformer(
        x,                            # [bs, embed_dim, h, w]
        masks,                        # [bs, h, w]
        self.query_embedding.weight,  # [num_query, embed_dim] learnable queries
        pos_embed)                    # [bs, embed_dim, h, w]
    # outs_dec: [num_layer, bs, num_query, embed_dim] — one slice per
    # decoder layer (encoder + decoder already applied).
    all_cls_scores = self.fc_cls(outs_dec)  # fully-connected classifier head
    # Sigmoid(Linear(ReLU(FFN(outs_dec)))) — normalized box regression.
    all_bbox_preds = self.fc_reg(self.activate(
        self.reg_ffn(outs_dec))).sigmoid()
    return all_cls_scores, all_bbox_preds
看 backbone
stem部分 = 7×7 conv + BN + ReLU + maxpool。这部分通常只提取图像的低级特征,故一般需要固定(freeze)这部分权重。
# Backbone config fragment: ResNet-50, only the last stage's feature map is used.
backbone=dict(
    type='ResNet',
    depth=50,
    # Number of stages to build; must satisfy 1 <= num_stages <= 4.
    num_stages=4,
    # Indices of the stage outputs to return: (0, 1, 2, 3) would return all
    # four stages (strides 4/8/16/32, channels 256/512/1024/2048);
    # (3, ) returns only the stage-4 output.
    out_indices=(3, ),
    frozen_stages=1,  # freeze the stem and stage 1 (low-level features)
    norm_cfg=dict(type='BN', requires_grad=False),  # BN affine params frozen
    norm_eval=True,   # keep BN in eval mode (running stats not updated)
    style='pytorch',
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
# Mapping from ResNet depth to (residual block class, blocks per stage).
# BasicBlock and Bottleneck are standalone module classes defined elsewhere
# in mmdet's ResNet implementation.
arch_settings = {
    18: (BasicBlock, (2, 2, 2, 2)),
    34: (BasicBlock, (3, 4, 6, 3)),
    50: (Bottleneck, (3, 4, 6, 3)),   # ResNet-50
    101: (Bottleneck, (3, 4, 23, 3)),
    152: (Bottleneck, (3, 8, 36, 3))
}
# Reference: https://blog.csdn.net/weixin_47691066/article/details/126032709
看 bbox_head
1、Transformer encoder部分首先将输入的特征图降维并flatten,然后送入下图左半部分所示的结构中,和空间位置编码一起并行经过多个自注意力分支、归一化(norm)和FFN,得到编码后的特征序列(memory),供decoder做交叉注意力使用。
2、接着,将Transformer encoder输出的特征序列送入上图右半部分所示的Transformer decoder,并行地解码得到输出序列(而不是像机器翻译那样逐个元素输出)。和传统的autoregressive机制不同,每层可以一次解码N个目标。由于解码器具有置换不变性(调换输入顺序结果不变),这N个输入嵌入必须互不相同才能产生不同的结果;除了每个像素本身的信息,位置信息也很重要,所以借鉴NLP里面的方法加入positional encoding,并且每层都加。作者非常用力地处理position的问题——在使用transformer处理图片类输入的时候,一定要注意position的问题。
bbox_head=dict(type='DETRHead',num_classes=80,in_channels=2048,# 这个是 transformer 模块transformer=dict(type='Transformer',encoder=dict(type='DetrTransformerEncoder', # 编码器num_layers=6,# 一共六层transformerlayers=dict(type='BaseTransformerLayer', attn_cfgs=[dict(type='MultiheadAttention',# 多头注意力embed_dims=256,# 嵌入量维度256num_heads=8,# 多头数量8dropout=0.1)# 随机丢弃],feedforward_channels=2048,# 返回通道数2048ffn_dropout=0.1,operation_order=('self_attn', 'norm', 'ffn', 'norm'))),# 操作顺序,自注意力、norm、FFNdecoder=dict(type='DetrTransformerDecoder',# 解码器return_intermediate=True,num_layers=6,transformerlayers=dict(type='DetrTransformerDecoderLayer',attn_cfgs=dict(type='MultiheadAttention',# 多头embed_dims=256,num_heads=8,dropout=0.1),feedforward_channels=2048,ffn_dropout=0.1,operation_order=('self_attn', 'norm', 'cross_attn', 'norm','ffn', 'norm')),# 自注意力、交叉注意力、FFN)),
使用共享参数的FFNs(由一个具有ReLU激活函数和d维隐藏层的3层感知器和一个线性投影层构成)独立解码为包含类别得分和预测框坐标的最终检测结果(N个),FFN预测框的标准化中心坐标,高度和宽度w.r.t. 输入图像,然后线性层使用softmax函数预测类标签。
# The Transformer class lives in mmdet/models/utils/transformer.py:
class Transformer(BaseModule):
    """Following the official DETR implementation, this module copy-paste
    from torch.nn.Transformer with modifications:

        * positional encodings are passed in MultiheadAttention
        * extra LN at the end of encoder is removed
        * decoder returns a stack of activations from all decoding layers
    """

    def __init__(self, encoder=None, decoder=None, init_cfg=None):
        super(Transformer, self).__init__(init_cfg=init_cfg)
        self.encoder = build_transformer_layer_sequence(encoder)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = self.encoder.embed_dims

    def forward(self, x, mask, query_embed, pos_embed):
        """Run encoder + decoder.

        Args:
            x (Tensor): Feature map, [bs, c, h, w].
            mask (Tensor): Padding mask, [bs, h, w].
            query_embed (Tensor): Learnable queries, [num_query, c].
            pos_embed (Tensor): Positional encoding, [bs, c, h, w].

        Returns:
            tuple: (out_dec [num_layer, bs, num_query, c],
                    memory  [bs, c, h, w]).
        """
        bs, c, h, w = x.shape
        # use `view` instead of `flatten` for dynamically exporting to ONNX
        x = x.view(bs, c, -1).permute(2, 0, 1)  # [bs, c, h, w] -> [h*w, bs, c]
        pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1)
        # Replicate the learnable queries across the batch:
        # [num_query, c] -> [num_query, bs, c]
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.view(bs, -1)  # [bs, h, w] -> [bs, h*w]
        # Per config the encoder is class DetrTransformerEncoder.
        memory = self.encoder(
            query=x,                      # [h*w, bs, c]
            key=None,
            value=None,
            query_pos=pos_embed,          # [h*w, bs, c]
            query_key_padding_mask=mask)  # [bs, h*w]
        # memory: [h*w, bs, c]
        target = torch.zeros_like(query_embed)  # decoder input starts at zero
        # Per config the decoder is class DetrTransformerDecoder.
        out_dec = self.decoder(
            query=target,
            key=memory,
            value=memory,
            key_pos=pos_embed,
            query_pos=query_embed,
            key_padding_mask=mask)  # [num_layer, num_query, bs, c]
        out_dec = out_dec.transpose(1, 2)  # -> [num_layer, bs, num_query, c]
        memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)
        return out_dec, memory


class DetrTransformerEncoder(TransformerLayerSequence):
    """DETR transformer encoder: a TransformerLayerSequence with an
    optional post-norm applied after the last layer."""

    def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
        super(DetrTransformerEncoder, self).__init__(*args, **kwargs)
        if post_norm_cfg is not None:
            # NOTE: the post-norm is only built when pre_norm is used;
            # otherwise it stays None (matches the upstream implementation).
            self.post_norm = build_norm_layer(
                post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
        else:
            assert not self.pre_norm, f'Use prenorm in ' \
                f'{self.__class__.__name__},' \
                f'Please specify post_norm_cfg'
            self.post_norm = None

    def forward(self, *args, **kwargs):
        # Delegates to TransformerLayerSequence.forward (defined in
        # mmcv.cnn.bricks.transformer, not shown here).
        x = super(DetrTransformerEncoder, self).forward(*args, **kwargs)
        if self.post_norm is not None:  # not taken with the DETR config
            x = self.post_norm(x)
        return x  # [h*w, bs, c]


class DetrTransformerDecoder(TransformerLayerSequence):
    """DETR transformer decoder; can return intermediate layer outputs."""

    def __init__(self,
                 *args,
                 post_norm_cfg=dict(type='LN'),
                 return_intermediate=False,
                 **kwargs):
        super(DetrTransformerDecoder, self).__init__(*args, **kwargs)
        self.return_intermediate = return_intermediate
        if post_norm_cfg is not None:
            self.post_norm = build_norm_layer(post_norm_cfg,
                                              self.embed_dims)[1]
        else:
            self.post_norm = None

    def forward(self, query, *args, **kwargs):
        if not self.return_intermediate:  # not taken with the DETR config
            x = super().forward(query, *args, **kwargs)
            if self.post_norm:
                x = self.post_norm(x)[None]  # add a leading layer dim
            return x
        # Collect each layer's output ([num_query, bs, c]) for aux heads.
        intermediate = []
        for layer in self.layers:
            # Per config each layer is DetrTransformerDecoderLayer; the call
            # resolves to BaseTransformerLayer.forward (in mmcv, not shown).
            query = layer(query, *args, **kwargs)
            if self.return_intermediate:  # True on this path
                if self.post_norm is not None:
                    intermediate.append(self.post_norm(query))
                else:
                    intermediate.append(query)
        # Stack per-layer results: [num_layer, num_query, bs, c]
        return torch.stack(intermediate)


class DetrTransformerDecoderLayer(BaseTransformerLayer):
    """One DETR decoder layer: self-attn, cross-attn and FFN with norms."""

    def __init__(self,
                 attn_cfgs,
                 feedforward_channels,
                 ffn_dropout=0.0,
                 operation_order=None,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 ffn_num_fcs=2,
                 **kwargs):
        super(DetrTransformerDecoderLayer, self).__init__(
            attn_cfgs=attn_cfgs,
            feedforward_channels=feedforward_channels,
            ffn_dropout=ffn_dropout,
            operation_order=operation_order,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            ffn_num_fcs=ffn_num_fcs,
            **kwargs)
        # A decoder layer must have exactly these six ops (order checked
        # as a set, so any ordering of the six entries is accepted here).
        assert len(operation_order) == 6
        assert set(operation_order) == set(
            ['self_attn', 'norm', 'cross_attn', 'ffn'])
看 positional_encoding
SinePositionalEncoding类位于mmdet/models/utils/positional_encoding.py,如下
# num_feats=128 is half of the 256-d embedding: the sine encoding produces
# 128 y-features and 128 x-features per position and concatenates them.
positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True),
class SinePositionalEncoding(BaseModule):
    """Sinusoidal positional encoding computed from the padding mask.

    Produces a [bs, 2 * num_feats, h, w] embedding: 128 y-features
    concatenated with 128 x-features per position (with num_feats=128).
    """

    def __init__(self,
                 num_feats,
                 temperature=10000,
                 normalize=False,
                 scale=2 * math.pi,
                 eps=1e-6,
                 offset=0.,
                 init_cfg=None):
        super(SinePositionalEncoding, self).__init__(init_cfg)
        if normalize:
            assert isinstance(scale, (float, int)), 'when normalize is set,' \
                'scale should be provided and in float or int type, ' \
                f'found {type(scale)}'
        self.num_feats = num_feats  # half of the feature channel count
        self.temperature = temperature
        self.normalize = normalize  # True per the DETR config
        self.scale = scale          # defaults to 2*pi
        self.eps = eps
        self.offset = offset        # defaults to 0

    def forward(self, mask):
        """Encode positions of valid pixels.

        Args:
            mask (Tensor): [bs, h, w]; non-zero = padding, zero = valid.

        Returns:
            Tensor: positional encoding, [bs, 2 * num_feats, h, w].
        """
        # For convenience of exporting to ONNX, it's required to convert
        # `masks` from bool to int.
        mask = mask.to(torch.int)  # bool -> int, [bs, h, w]
        not_mask = 1 - mask  # logical_not: valid -> 1, padding -> 0
        # Cumulative sums give each valid pixel its 1-based coordinate.
        # See https://pytorch.org/docs/stable/generated/torch.cumsum.html
        x_embed = not_mask.cumsum(2, dtype=torch.float32)  # along rows
        y_embed = not_mask.cumsum(1, dtype=torch.float32)  # along columns
        if self.normalize:  # True here: normalize then scale by 2*pi
            # y_embed[:, -1:, :] holds the maximum along the h direction.
            y_embed = (y_embed + self.offset) / \
                (y_embed[:, -1:, :] + self.eps) * self.scale
            x_embed = (x_embed + self.offset) / \
                (x_embed[:, :, -1:] + self.eps) * self.scale
        # Frequency terms, cf. the formula in the Transformer paper.
        dim_t = torch.arange(
            self.num_feats, dtype=torch.float32, device=mask.device)
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
        pos_x = x_embed[:, :, :, None] / dim_t  # [bs, h, w, num_feats]
        pos_y = y_embed[:, :, :, None] / dim_t  # [bs, h, w, num_feats]
        # use `view` instead of `flatten` for dynamically exporting to ONNX
        B, H, W = mask.size()
        # sin on even indices, cos on odd indices, then interleave:
        # [bs, h, w, num_feats/2, 2] -> [bs, h, w, num_feats]
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
            dim=4).view(B, H, W, -1)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
            dim=4).view(B, H, W, -1)
        # Concatenate y then x features and move channels first:
        # [bs, h, w, 2*num_feats] -> [bs, 2*num_feats, h, w]
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
损失函数
https://zhuanlan.zhihu.com/p/572772363?utm_id=0
分配器
https://zhuanlan.zhihu.com/p/572772363?utm_id=0