|
@@ -1,4 +1,3 @@
|
|
-
|
|
|
|
from typing import Any, Callable, List, Optional, Tuple, Union
|
|
from typing import Any, Callable, List, Optional, Tuple, Union
|
|
import torch
|
|
import torch
|
|
from torch import nn
|
|
from torch import nn
|
|
@@ -13,7 +12,6 @@ from libs.vision_libs.ops import misc as misc_nn_ops
|
|
from libs.vision_libs.transforms._presets import ObjectDetection
|
|
from libs.vision_libs.transforms._presets import ObjectDetection
|
|
from .line_head import LineRCNNHeads
|
|
from .line_head import LineRCNNHeads
|
|
from .line_predictor import LineRCNNPredictor
|
|
from .line_predictor import LineRCNNPredictor
|
|
-from .roi_heads import RoIHeads
|
|
|
|
from libs.vision_libs.models._api import register_model, Weights, WeightsEnum
|
|
from libs.vision_libs.models._api import register_model, Weights, WeightsEnum
|
|
from libs.vision_libs.models._meta import _COCO_PERSON_CATEGORIES, _COCO_PERSON_KEYPOINT_NAMES, _COCO_CATEGORIES
|
|
from libs.vision_libs.models._meta import _COCO_PERSON_CATEGORIES, _COCO_PERSON_KEYPOINT_NAMES, _COCO_CATEGORIES
|
|
from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
|
|
from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
|
|
@@ -22,10 +20,13 @@ from libs.vision_libs.models.detection._utils import overwrite_eps
|
|
from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
|
|
from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
|
|
from libs.vision_libs.models.detection.faster_rcnn import FasterRCNN, TwoMLPHead, FastRCNNPredictor
|
|
from libs.vision_libs.models.detection.faster_rcnn import FasterRCNN, TwoMLPHead, FastRCNNPredictor
|
|
|
|
|
|
-from models.config.config_tool import read_yaml
|
|
|
|
-import numpy as np
|
|
|
|
|
|
+from .roi_heads import RoIHeads
|
|
|
|
+from .trainer import Trainer
|
|
|
|
+from ..base.base_detection_net import BaseDetectionNet
|
|
import torch.nn.functional as F
|
|
import torch.nn.functional as F
|
|
|
|
|
|
|
|
+from ..config.config_tool import read_yaml
|
|
|
|
+
|
|
FEATURE_DIM = 8
|
|
FEATURE_DIM = 8
|
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
|
|
|
|
@@ -40,50 +41,6 @@ __all__ = [
|
|
"linenet_mobilenet_v3_large_fpn",
|
|
"linenet_mobilenet_v3_large_fpn",
|
|
"linenet_mobilenet_v3_large_320_fpn",
|
|
"linenet_mobilenet_v3_large_320_fpn",
|
|
]
|
|
]
|
|
-# __all__ = [
|
|
|
|
-# "LineNet",
|
|
|
|
-# "LineRCNN_ResNet50_FPN_Weights",
|
|
|
|
-# "linercnn_resnet50_fpn",
|
|
|
|
-# ]
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def non_maximum_suppression(a):
|
|
|
|
- ap = F.max_pool2d(a, 3, stride=1, padding=1)
|
|
|
|
- mask = (a == ap).float().clamp(min=0.0)
|
|
|
|
- return a * mask
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-# class Bottleneck1D(nn.Module):
|
|
|
|
-# def __init__(self, inplanes, outplanes):
|
|
|
|
-# super(Bottleneck1D, self).__init__()
|
|
|
|
-#
|
|
|
|
-# planes = outplanes // 2
|
|
|
|
-# self.op = nn.Sequential(
|
|
|
|
-# nn.BatchNorm1d(inplanes),
|
|
|
|
-# nn.ReLU(inplace=True),
|
|
|
|
-# nn.Conv1d(inplanes, planes, kernel_size=1),
|
|
|
|
-# nn.BatchNorm1d(planes),
|
|
|
|
-# nn.ReLU(inplace=True),
|
|
|
|
-# nn.Conv1d(planes, planes, kernel_size=3, padding=1),
|
|
|
|
-# nn.BatchNorm1d(planes),
|
|
|
|
-# nn.ReLU(inplace=True),
|
|
|
|
-# nn.Conv1d(planes, outplanes, kernel_size=1),
|
|
|
|
-# )
|
|
|
|
-#
|
|
|
|
-# def forward(self, x):
|
|
|
|
-# return x + self.op(x)
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-from .roi_heads import RoIHeads
|
|
|
|
-
|
|
|
|
-from ..base.base_detection_net import BaseDetectionNet
|
|
|
|
|
|
|
|
|
|
|
|
def _default_anchorgen():
|
|
def _default_anchorgen():
|
|
@@ -93,259 +50,229 @@ def _default_anchorgen():
|
|
|
|
|
|
|
|
|
|
class LineNet(BaseDetectionNet):
    """Line-detection network assembled from a YAML configuration.

    The constructor parses the configuration, builds a ResNet-50 FPN
    backbone, a region proposal network and RoI heads (box branch plus line
    branch), and passes them to ``BaseDetectionNet``.  All RPN / box
    hyper-parameters are fixed inside the constructor; only the backbone name
    and the number of box classes are read from the configuration.

    Args:
        cfg: value forwarded to ``read_yaml``; the parsed result must provide
            ``cfg['model']['backbone']`` and ``cfg['model']['num_classes']``.
            Only the backbone name ``'resnet50_fpn'`` is supported.
        **kwargs: forwarded unchanged to ``GeneralizedRCNNTransform``.

    Raises:
        ValueError: if the configured backbone is not ``'resnet50_fpn'``.
    """

    def __init__(self, cfg, **kwargs):
        cfg = read_yaml(cfg)
        backbone_name = cfg['model']['backbone']
        num_classes = cfg['model']['num_classes']

        # Fail early with a clear message: without this guard an unknown
        # backbone name left ``out_channels`` (and the model) unbuilt.
        if backbone_name != 'resnet50_fpn':
            raise ValueError(
                f"unsupported backbone {backbone_name!r}; only 'resnet50_fpn' is available"
            )

        # The backbone is created without pretrained weights, so regular
        # (trainable) batch norm is selected instead of FrozenBatchNorm2d.
        is_trained = False
        trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
        norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
        backbone = resnet50(weights=None, progress=True, norm_layer=norm_layer)
        backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
        out_channels = backbone.out_channels

        # Fixed hyper-parameters.  These must be plain scalars: writing them
        # as ``name = value,`` (trailing comma) would bind 1-tuples instead.
        # Transform parameters
        min_size = 512
        max_size = 1333
        # RPN parameters
        rpn_pre_nms_top_n_train = 2000
        rpn_pre_nms_top_n_test = 1000
        rpn_post_nms_top_n_train = 2000
        rpn_post_nms_top_n_test = 1000
        rpn_nms_thresh = 0.7
        rpn_fg_iou_thresh = 0.7
        rpn_bg_iou_thresh = 0.3
        rpn_batch_size_per_image = 256
        rpn_positive_fraction = 0.5
        rpn_score_thresh = 0.0
        # Box parameters
        box_score_thresh = 0.05
        box_nms_thresh = 0.5
        box_detections_per_img = 100
        box_fg_iou_thresh = 0.5
        box_bg_iou_thresh = 0.5
        box_batch_size_per_image = 512
        box_positive_fraction = 0.25
        bbox_reg_weights = None

        # Line branch.  The second argument (5) is hard-coded and independent
        # of ``num_classes`` from the configuration.
        line_head = LineRCNNHeads(out_channels, 5)
        line_predictor = LineRCNNPredictor(cfg)

        # Region proposal network.
        rpn_anchor_generator = _default_anchorgen()
        rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(
            rpn_anchor_generator,
            rpn_head,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            rpn_pre_nms_top_n,
            rpn_post_nms_top_n,
            rpn_nms_thresh,
            score_thresh=rpn_score_thresh,
        )

        # Box branch.
        box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)

        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
        box_predictor = BoxPredictor(representation_size, num_classes)

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            line_head,
            line_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
        )

        # Per-channel normalization statistics handed to the input transform.
        image_mean = [0.485, 0.456, 0.406]
        image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)

        super().__init__(backbone, rpn, roi_heads, transform)
        self.roi_heads = roi_heads

    def train(self, cfg=True):
        """Run a configured training session, or toggle training mode.

        Args:
            cfg: training configuration handed to ``Trainer.train_cfg``.  When
                a ``bool`` is given (or the argument is omitted) the call is
                treated as a plain mode switch and delegated to the parent
                class instead of starting a training run.

        Returns:
            Whatever the parent ``train`` returns for a ``bool`` argument;
            ``None`` after a configured training run.
        """
        # NOTE(review): assumes BaseDetectionNet ultimately derives from
        # torch.nn.Module, whose ``eval()`` calls ``self.train(False)`` and
        # whose ``train()`` is called with no argument by training loops --
        # confirm against base_detection_net.
        if isinstance(cfg, bool):
            return super().train(cfg)

        self.trainer = Trainer()
        self.trainer.train_cfg(model=self, cfg=cfg)
|
|
|
|
|
|
|
|
|
|
class TwoMLPHead(nn.Module):
|
|
class TwoMLPHead(nn.Module):
|
|
@@ -374,11 +301,11 @@ class TwoMLPHead(nn.Module):
|
|
|
|
|
|
class LineNetConvFCHead(nn.Sequential):
|
|
class LineNetConvFCHead(nn.Sequential):
|
|
def __init__(
|
|
def __init__(
|
|
- self,
|
|
|
|
- input_size: Tuple[int, int, int],
|
|
|
|
- conv_layers: List[int],
|
|
|
|
- fc_layers: List[int],
|
|
|
|
- norm_layer: Optional[Callable[..., nn.Module]] = None,
|
|
|
|
|
|
+ self,
|
|
|
|
+ input_size: Tuple[int, int, int],
|
|
|
|
+ conv_layers: List[int],
|
|
|
|
+ fc_layers: List[int],
|
|
|
|
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
|
|
):
|
|
):
|
|
"""
|
|
"""
|
|
Args:
|
|
Args:
|
|
@@ -533,13 +460,13 @@ class LineNet_MobileNet_V3_Large_320_FPN_Weights(WeightsEnum):
|
|
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
|
|
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
|
|
)
|
|
)
|
|
def linenet_resnet50_fpn(
|
|
def linenet_resnet50_fpn(
|
|
- *,
|
|
|
|
- weights: Optional[LineNet_ResNet50_FPN_Weights] = None,
|
|
|
|
- progress: bool = True,
|
|
|
|
- num_classes: Optional[int] = None,
|
|
|
|
- weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
|
|
|
|
- trainable_backbone_layers: Optional[int] = None,
|
|
|
|
- **kwargs: Any,
|
|
|
|
|
|
+ *,
|
|
|
|
+ weights: Optional[LineNet_ResNet50_FPN_Weights] = None,
|
|
|
|
+ progress: bool = True,
|
|
|
|
+ num_classes: Optional[int] = None,
|
|
|
|
+ weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
|
|
|
|
+ trainable_backbone_layers: Optional[int] = None,
|
|
|
|
+ **kwargs: Any,
|
|
) -> LineNet:
|
|
) -> LineNet:
|
|
"""
|
|
"""
|
|
Faster R-CNN model with a ResNet-50-FPN backbone from the `Faster R-CNN: Towards Real-Time Object
|
|
Faster R-CNN model with a ResNet-50-FPN backbone from the `Faster R-CNN: Towards Real-Time Object
|
|
@@ -652,13 +579,13 @@ def linenet_resnet50_fpn(
|
|
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
|
|
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
|
|
)
|
|
)
|
|
def linenet_resnet50_fpn_v2(
|
|
def linenet_resnet50_fpn_v2(
|
|
- *,
|
|
|
|
- weights: Optional[LineNet_ResNet50_FPN_V2_Weights] = None,
|
|
|
|
- progress: bool = True,
|
|
|
|
- num_classes: Optional[int] = None,
|
|
|
|
- weights_backbone: Optional[ResNet50_Weights] = None,
|
|
|
|
- trainable_backbone_layers: Optional[int] = None,
|
|
|
|
- **kwargs: Any,
|
|
|
|
|
|
+ *,
|
|
|
|
+ weights: Optional[LineNet_ResNet50_FPN_V2_Weights] = None,
|
|
|
|
+ progress: bool = True,
|
|
|
|
+ num_classes: Optional[int] = None,
|
|
|
|
+ weights_backbone: Optional[ResNet50_Weights] = None,
|
|
|
|
+ trainable_backbone_layers: Optional[int] = None,
|
|
|
|
+ **kwargs: Any,
|
|
) -> LineNet:
|
|
) -> LineNet:
|
|
"""
|
|
"""
|
|
Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone from `Benchmarking Detection
|
|
Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone from `Benchmarking Detection
|
|
@@ -727,13 +654,13 @@ def linenet_resnet50_fpn_v2(
|
|
|
|
|
|
|
|
|
|
def _linenet_mobilenet_v3_large_fpn(
|
|
def _linenet_mobilenet_v3_large_fpn(
|
|
- *,
|
|
|
|
- weights: Optional[Union[LineNet_MobileNet_V3_Large_FPN_Weights, LineNet_MobileNet_V3_Large_320_FPN_Weights]],
|
|
|
|
- progress: bool,
|
|
|
|
- num_classes: Optional[int],
|
|
|
|
- weights_backbone: Optional[MobileNet_V3_Large_Weights],
|
|
|
|
- trainable_backbone_layers: Optional[int],
|
|
|
|
- **kwargs: Any,
|
|
|
|
|
|
+ *,
|
|
|
|
+ weights: Optional[Union[LineNet_MobileNet_V3_Large_FPN_Weights, LineNet_MobileNet_V3_Large_320_FPN_Weights]],
|
|
|
|
+ progress: bool,
|
|
|
|
+ num_classes: Optional[int],
|
|
|
|
+ weights_backbone: Optional[MobileNet_V3_Large_Weights],
|
|
|
|
+ trainable_backbone_layers: Optional[int],
|
|
|
|
+ **kwargs: Any,
|
|
) -> LineNet:
|
|
) -> LineNet:
|
|
if weights is not None:
|
|
if weights is not None:
|
|
weights_backbone = None
|
|
weights_backbone = None
|
|
@@ -748,14 +675,14 @@ def _linenet_mobilenet_v3_large_fpn(
|
|
backbone = mobilenet_v3_large(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
|
|
backbone = mobilenet_v3_large(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
|
|
backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
|
|
backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
|
|
anchor_sizes = (
|
|
anchor_sizes = (
|
|
- (
|
|
|
|
- 32,
|
|
|
|
- 64,
|
|
|
|
- 128,
|
|
|
|
- 256,
|
|
|
|
- 512,
|
|
|
|
- ),
|
|
|
|
- ) * 3
|
|
|
|
|
|
+ (
|
|
|
|
+ 32,
|
|
|
|
+ 64,
|
|
|
|
+ 128,
|
|
|
|
+ 256,
|
|
|
|
+ 512,
|
|
|
|
+ ),
|
|
|
|
+ ) * 3
|
|
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
|
|
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
|
|
model = LineNet(
|
|
model = LineNet(
|
|
backbone, num_classes, rpn_anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs
|
|
backbone, num_classes, rpn_anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs
|
|
@@ -773,13 +700,13 @@ def _linenet_mobilenet_v3_large_fpn(
|
|
weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
|
|
weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
|
|
)
|
|
)
|
|
def linenet_mobilenet_v3_large_320_fpn(
|
|
def linenet_mobilenet_v3_large_320_fpn(
|
|
- *,
|
|
|
|
- weights: Optional[LineNet_MobileNet_V3_Large_320_FPN_Weights] = None,
|
|
|
|
- progress: bool = True,
|
|
|
|
- num_classes: Optional[int] = None,
|
|
|
|
- weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
|
|
|
|
- trainable_backbone_layers: Optional[int] = None,
|
|
|
|
- **kwargs: Any,
|
|
|
|
|
|
+ *,
|
|
|
|
+ weights: Optional[LineNet_MobileNet_V3_Large_320_FPN_Weights] = None,
|
|
|
|
+ progress: bool = True,
|
|
|
|
+ num_classes: Optional[int] = None,
|
|
|
|
+ weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
|
|
|
|
+ trainable_backbone_layers: Optional[int] = None,
|
|
|
|
+ **kwargs: Any,
|
|
) -> LineNet:
|
|
) -> LineNet:
|
|
"""
|
|
"""
|
|
Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
|
|
Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
|
|
@@ -847,13 +774,13 @@ def linenet_mobilenet_v3_large_320_fpn(
|
|
weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
|
|
weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
|
|
)
|
|
)
|
|
def linenet_mobilenet_v3_large_fpn(
|
|
def linenet_mobilenet_v3_large_fpn(
|
|
- *,
|
|
|
|
- weights: Optional[LineNet_MobileNet_V3_Large_FPN_Weights] = None,
|
|
|
|
- progress: bool = True,
|
|
|
|
- num_classes: Optional[int] = None,
|
|
|
|
- weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
|
|
|
|
- trainable_backbone_layers: Optional[int] = None,
|
|
|
|
- **kwargs: Any,
|
|
|
|
|
|
+ *,
|
|
|
|
+ weights: Optional[LineNet_MobileNet_V3_Large_FPN_Weights] = None,
|
|
|
|
+ progress: bool = True,
|
|
|
|
+ num_classes: Optional[int] = None,
|
|
|
|
+ weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
|
|
|
|
+ trainable_backbone_layers: Optional[int] = None,
|
|
|
|
+ **kwargs: Any,
|
|
) -> LineNet:
|
|
) -> LineNet:
|
|
"""
|
|
"""
|
|
Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
|
|
Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
|
|
@@ -909,4 +836,3 @@ def linenet_mobilenet_v3_large_fpn(
|
|
trainable_backbone_layers=trainable_backbone_layers,
|
|
trainable_backbone_layers=trainable_backbone_layers,
|
|
**kwargs,
|
|
**kwargs,
|
|
)
|
|
)
|
|
-
|
|
|