
Interpretation:
The difference between ROI Align and ROI Pooling:http://blog.leanote.com/post/[email protected]/b5f4f526490b
ROI Align code:https://github.com/katotetsuro/roi_align/blob/master/roi_align_2d.py
Bilinear interpolation:
Mask-RCNN code:https://github.com/facebookresearch/maskrcnn-benchmark
This constitutes the framework of the entire network, divided into three parts backbone, rpn and roi heads
The backbone extracts the feature, rpn generates the candidate box, and the roi heads generates the final result.
class GeneralizedRCNN(nn.Module):
"""
Main class for Generalized R-CNN. Currently supports boxes and masks.
It consists of three main parts:
- backbone
- rpn
- heads: takes the features + the proposals from the RPN and computes
detections / masks from it.
"""
def __init__(self, cfg):
super(GeneralizedRCNN, self).__init__()
self.backbone = build_backbone(cfg)
self.rpn = build_rpn(cfg, self.backbone.out_channels)
self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)
def forward(self, images, targets=None):
"""
Arguments:
images (list[Tensor] or ImageList): images to be processed
targets (list[BoxList]): ground-truth boxes present in the image (optional)
Returns:
result (list[BoxList] or dict[Tensor]): the output from the model.
During training, it returns a dict[Tensor] which contains the losses.
During testing, it returns list[BoxList] contains additional fields
like `scores`, `labels` and `mask` (for Mask R-CNN models).
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
images = to_image_list(images)
features = self.backbone(images.tensors)
proposals, proposal_losses = self.rpn(images, features, targets)
if self.roi_heads:
x, result, detector_losses = self.roi_heads(features, proposals, targets)
else:
# RPN-only models don't have roi_heads
x = features
result = proposals
detector_losses = {}
if self.training:
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
return result
The network structure of the entire extracted feature, FPN represents feature pyramid networks

Here, a region proposal network is constructed to calculate the objectness object confidence of the rpn output and the coordinates of the anchor box regression candidate box.

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
import torch.nn.functional as F
from torch import nn
from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.box_coder import BoxCoder
from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet
from .loss import make_rpn_loss_evaluator
from .anchor_generator import make_anchor_generator
from .inference import make_rpn_postprocessor
class RPNHeadConvRegressor(nn.Module):
"""
A simple RPN Head for classification and bbox regression
"""
def __init__(self, cfg, in_channels, num_anchors):
"""
Arguments:
cfg : config
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
"""
super(RPNHeadConvRegressor, self).__init__()
self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
self.bbox_pred = nn.Conv2d(
in_channels, num_anchors * 4, kernel_size=1, stride=1
)
for l in [self.cls_logits, self.bbox_pred]:
torch.nn.init.normal_(l.weight, std=0.01)
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
assert isinstance(x, (list, tuple))
logits = [self.cls_logits(y) for y in x]
bbox_reg = [self.bbox_pred(y) for y in x]
return logits, bbox_reg
class RPNHeadFeatureSingleConv(nn.Module):
"""
Adds a simple RPN Head with one conv to extract the feature
"""
def __init__(self, cfg, in_channels):
"""
Arguments:
cfg : config
in_channels (int): number of channels of the input feature
"""
super(RPNHeadFeatureSingleConv, self).__init__()
self.conv = nn.Conv2d(
in_channels, in_channels, kernel_size=3, stride=1, padding=1
)
for l in [self.conv]:
torch.nn.init.normal_(l.weight, std=0.01)
torch.nn.init.constant_(l.bias, 0)
self.out_channels = in_channels
def forward(self, x):
assert isinstance(x, (list, tuple))
x = [F.relu(self.conv(z)) for z in x]
return x
@registry.RPN_HEADS.register("SingleConvRPNHead")
class RPNHead(nn.Module):
"""
Adds a simple RPN Head with classification and regression heads
"""
def __init__(self, cfg, in_channels, num_anchors):
"""
Arguments:
cfg : config
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
"""
super(RPNHead, self).__init__()
self.conv = nn.Conv2d(
in_channels, in_channels, kernel_size=3, stride=1, padding=1
)
self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
self.bbox_pred = nn.Conv2d(
in_channels, num_anchors * 4, kernel_size=1, stride=1
)
for l in [self.conv, self.cls_logits, self.bbox_pred]:
torch.nn.init.normal_(l.weight, std=0.01)
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
logits = []
bbox_reg = []
for feature in x:
t = F.relu(self.conv(feature))
logits.append(self.cls_logits(t))
bbox_reg.append(self.bbox_pred(t))
return logits, bbox_reg
class RPNModule(torch.nn.Module):
"""
Module for RPN computation. Takes feature maps from the backbone and RPN
proposals and losses. Works for both FPN and non-FPN.
"""
def __init__(self, cfg, in_channels):
super(RPNModule, self).__init__()
self.cfg = cfg.clone()
anchor_generator = make_anchor_generator(cfg)
rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD]
head = rpn_head(
cfg, in_channels, anchor_generator.num_anchors_per_location()[0]
)
rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True)
box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)
loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder)
self.anchor_generator = anchor_generator
self.head = head
self.box_selector_train = box_selector_train
self.box_selector_test = box_selector_test
self.loss_evaluator = loss_evaluator
def forward(self, images, features, targets=None):
"""
Arguments:
images (ImageList): images for which we want to compute the predictions
features (list[Tensor]): features computed from the images that are
used for computing the predictions. Each tensor in the list
correspond to different feature levels
targets (list[BoxList): ground-truth boxes present in the image (optional)
Returns:
boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per
image.
losses (dict[Tensor]): the losses for the model during training. During
testing, it is an empty dict.
"""
objectness, rpn_box_regression = self.head(features)
anchors = self.anchor_generator(images, features)
if self.training:
return self._forward_train(anchors, objectness, rpn_box_regression, targets)
else:
return self._forward_test(anchors, objectness, rpn_box_regression)
def _forward_train(self, anchors, objectness, rpn_box_regression, targets):
if self.cfg.MODEL.RPN_ONLY:
# When training an RPN-only model, the loss is determined by the
# predicted objectness and rpn_box_regression values and there is
# no need to transform the anchors into predicted boxes; this is an
# optimization that avoids the unnecessary transformation.
boxes = anchors
else:
# For end-to-end models, anchors must be transformed into boxes and
# sampled into a training batch.
with torch.no_grad():
boxes = self.box_selector_train(
anchors, objectness, rpn_box_regression, targets
)
loss_objectness, loss_rpn_box_reg = self.loss_evaluator(
anchors, objectness, rpn_box_regression, targets
)
losses = {
"loss_objectness": loss_objectness,
"loss_rpn_box_reg": loss_rpn_box_reg,
}
return boxes, losses
def _forward_test(self, anchors, objectness, rpn_box_regression):
boxes = self.box_selector_test(anchors, objectness, rpn_box_regression)
if self.cfg.MODEL.RPN_ONLY:
# For end-to-end models, the RPN proposals are an intermediate state
# and don't bother to sort them in decreasing score order. For RPN-only
# models, the proposals are the final output and we return them in
# high-to-low confidence order.
inds = [
box.get_field("objectness").sort(descending=True)[1] for box in boxes
]
boxes = [box[ind] for box, ind in zip(boxes, inds)]
return boxes, {}
def build_rpn(cfg, in_channels):
"""
This gives the gist of it. Not super important because it doesn't change as much
"""
if cfg.MODEL.RETINANET_ON:
return build_retinanet(cfg, in_channels)
return RPNModule(cfg, in_channels)
Here is the final output layer, output mask or box and objectness
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
from .box_head.box_head import build_roi_box_head
from .mask_head.mask_head import build_roi_mask_head
from .keypoint_head.keypoint_head import build_roi_keypoint_head
class CombinedROIHeads(torch.nn.ModuleDict):
"""
Combines a set of individual heads (for box prediction or masks) into a single
head.
"""
def __init__(self, cfg, heads):
super(CombinedROIHeads, self).__init__(heads)
self.cfg = cfg.clone()
if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
self.mask.feature_extractor = self.box.feature_extractor
if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
self.keypoint.feature_extractor = self.box.feature_extractor
def forward(self, features, proposals, targets=None):
losses = {}
# TODO rename x to roi_box_features, if it doesn't increase memory consumption
x, detections, loss_box = self.box(features, proposals, targets)
losses.update(loss_box)
if self.cfg.MODEL.MASK_ON:
mask_features = features
# optimization: during training, if we share the feature extractor between
# the box and the mask heads, then we can reuse the features already computed
if (
self.training
and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
):
mask_features = x
# During training, self.box() will return the unaltered proposals as "detections"
# this makes the API consistent during training and testing
x, detections, loss_mask = self.mask(mask_features, detections, targets)
losses.update(loss_mask)
if self.cfg.MODEL.KEYPOINT_ON:
keypoint_features = features
# optimization: during training, if we share the feature extractor between
# the box and the mask heads, then we can reuse the features already computed
if (
self.training
and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
):
keypoint_features = x
# During training, self.box() will return the unaltered proposals as "detections"
# this makes the API consistent during training and testing
x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets)
losses.update(loss_keypoint)
return x, detections, losses
def build_roi_heads(cfg, in_channels):
# individually create the heads, that will be combined together
# afterwards
roi_heads = []
if cfg.MODEL.RETINANET_ON:
return []
if not cfg.MODEL.RPN_ONLY:
roi_heads.append(("box", build_roi_box_head(cfg, in_channels)))
if cfg.MODEL.MASK_ON:
roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels)))
if cfg.MODEL.KEYPOINT_ON:
roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels)))
# combine individual heads in a single module
if roi_heads:
roi_heads = CombinedROIHeads(cfg, roi_heads)
return roi_heads
Calculating the loss of rpn here will first find the corresponding targets for each anchor, then set the anchor label of the corresponding background to 0, the object to 1, and the discarded -1 and then calculate the loss with smooth l1 and binary cross entropy.



# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
This file contains specific functions for computing losses on the RPN
file
"""
import torch
from torch.nn import functional as F
from .utils import concat_box_prediction_layers
from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler
from ..utils import cat
from maskrcnn_benchmark.layers import smooth_l1_loss
from maskrcnn_benchmark.modeling.matcher import Matcher
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
class RPNLossComputation(object):
"""
This class computes the RPN loss.
"""
def __init__(self, proposal_matcher, fg_bg_sampler, box_coder,
generate_labels_func):
"""
Arguments:
proposal_matcher (Matcher)
fg_bg_sampler (BalancedPositiveNegativeSampler)
box_coder (BoxCoder)
"""
# self.target_preparator = target_preparator
self.proposal_matcher = proposal_matcher
self.fg_bg_sampler = fg_bg_sampler
self.box_coder = box_coder
self.copied_fields = []
self.generate_labels_func = generate_labels_func
self.discard_cases = ['not_visibility', 'between_thresholds']
def match_targets_to_anchors(self, anchor, target, copied_fields=[]):
match_quality_matrix = boxlist_iou(target, anchor)
matched_idxs = self.proposal_matcher(match_quality_matrix)
# RPN doesn't need any fields from target
# for creating the labels, so clear them all
target = target.copy_with_fields(copied_fields)
# get the targets corresponding GT for each anchor
# NB: need to clamp the indices because we can have a single
# GT in the image, and matched_idxs can be -2, which goes
# out of bounds
matched_targets = target[matched_idxs.clamp(min=0)]
matched_targets.add_field("matched_idxs", matched_idxs)
return matched_targets
def prepare_targets(self, anchors, targets):
labels = []
regression_targets = []
for anchors_per_image, targets_per_image in zip(anchors, targets):
matched_targets = self.match_targets_to_anchors(
anchors_per_image, targets_per_image, self.copied_fields
)
matched_idxs = matched_targets.get_field("matched_idxs")
labels_per_image = self.generate_labels_func(matched_targets)
labels_per_image = labels_per_image.to(dtype=torch.float32)
# Background (negative examples)
bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
labels_per_image[bg_indices] = 0
# discard anchors that go out of the boundaries of the image
if "not_visibility" in self.discard_cases:
labels_per_image[~anchors_per_image.get_field("visibility")] = -1
# discard indices that are between thresholds
if "between_thresholds" in self.discard_cases:
inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
labels_per_image[inds_to_discard] = -1
# compute regression targets
regression_targets_per_image = self.box_coder.encode(
matched_targets.bbox, anchors_per_image.bbox
)
labels.append(labels_per_image)
regression_targets.append(regression_targets_per_image)
return labels, regression_targets
def __call__(self, anchors, objectness, box_regression, targets):
"""
Arguments:
anchors (list[BoxList])
objectness (list[Tensor])
box_regression (list[Tensor])
targets (list[BoxList])
Returns:
objectness_loss (Tensor)
box_loss (Tensor
"""
anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
labels, regression_targets = self.prepare_targets(anchors, targets)
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
objectness, box_regression = \
concat_box_prediction_layers(objectness, box_regression)
objectness = objectness.squeeze()
labels = torch.cat(labels, dim=0)
regression_targets = torch.cat(regression_targets, dim=0)
box_loss = smooth_l1_loss(
box_regression[sampled_pos_inds],
regression_targets[sampled_pos_inds],
beta=1.0 / 9,
size_average=False,
) / (sampled_inds.numel())
objectness_loss = F.binary_cross_entropy_with_logits(
objectness[sampled_inds], labels[sampled_inds]
)
return objectness_loss, box_loss
# This function should be overwritten in RetinaNet
def generate_rpn_labels(matched_targets):
matched_idxs = matched_targets.get_field("matched_idxs")
labels_per_image = matched_idxs >= 0
return labels_per_image
def make_rpn_loss_evaluator(cfg, box_coder):
matcher = Matcher(
cfg.MODEL.RPN.FG_IOU_THRESHOLD,
cfg.MODEL.RPN.BG_IOU_THRESHOLD,
allow_low_quality_matches=True,
)
fg_bg_sampler = BalancedPositiveNegativeSampler(
cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION
)
loss_evaluator = RPNLossComputation(
matcher,
fg_bg_sampler,
box_coder,
generate_rpn_labels
)
return loss_evaluator
First, import the package. Here we will use a model trained on the MS_COCO dataset. The configuration information of this model is located in the CocoConfig class of the coco.py file. When making pred...
1. Guide package Second, the configuration 3. Notebook Preferences Fourth, load the verification data set Five, load the model Sixth, running detection 6.1 Precision-Recall 6.2 Calculate mAP @ IoU = 5...
1. Guide package Second, the configuration 3. Notebook Preferences Three, load the Model Below are some weights. 4. Histogram of Weights...
Second, Mask R-CNN code interpretation FAIR released a series of tutorial files at the same time it released detectron.Detectron/GETTING_STARTED.mdFile to interpret the code. First look at the file st...
The source code of Mask R-CNN is released with FAIR's object detection platform detectron. Detectron includes all important object detection algorithms from Fast R-CNN to Mask R-CNN. The entire platfo...
Code analysis-data preprocessing Project source codematterport/Mask_RCNN inspect_data.ipynbShowsPrepare preprocessing steps for training data. Guide package The imported coco package needs tococo/Pyth...
Operating environment: TensorFlow2.0+Keras Question 1: Errors caused by the version upgrade (mainly my own cabbage...) The API names and parameters of TensorFlow 2.0 and 1.X have changed. E.g: #######...
The main code file path: General architecture file:detectron2/detectron2/modeling/meta_arch/rcnn.py default allocation:detectron2/detectron2/config/defaults.py RPN_head:detectron2/detectron2/modeling/...
Foreword We have talked more about the target detection algorithm of Anchor-Based. In addition, we have briefly interpreted the target detection of Anchor-Free. DenseBox has started, and today we will...
According to a number of articles, only for reference to learn from each other. Mask R-CNN Effectively detects targets in the image while also generating a high-quality segmentation mask for eac...