line_net.py 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842
  1. from typing import Any, Callable, List, Optional, Tuple, Union
  2. import torch
  3. from torch import nn
  4. from torchvision.ops import MultiScaleRoIAlign
  5. from libs.vision_libs.models import MobileNet_V3_Large_Weights, mobilenet_v3_large
  6. from libs.vision_libs.models.detection.anchor_utils import AnchorGenerator
  7. from libs.vision_libs.models.detection.rpn import RPNHead, RegionProposalNetwork
  8. from libs.vision_libs.models.detection.ssdlite import _mobilenet_extractor
  9. from libs.vision_libs.models.detection.transform import GeneralizedRCNNTransform
  10. from libs.vision_libs.ops import misc as misc_nn_ops
  11. from libs.vision_libs.transforms._presets import ObjectDetection
  12. from .line_head import LineRCNNHeads
  13. from .line_predictor import LineRCNNPredictor
  14. from libs.vision_libs.models._api import register_model, Weights, WeightsEnum
  15. from libs.vision_libs.models._meta import _COCO_PERSON_CATEGORIES, _COCO_PERSON_KEYPOINT_NAMES, _COCO_CATEGORIES
  16. from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
  17. from libs.vision_libs.models.resnet import resnet50, ResNet50_Weights
  18. from libs.vision_libs.models.detection._utils import overwrite_eps
  19. from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
  20. from libs.vision_libs.models.detection.faster_rcnn import FasterRCNN, TwoMLPHead, FastRCNNPredictor
  21. from .roi_heads import RoIHeads
  22. from .trainer import Trainer
  23. from ..base.base_detection_net import BaseDetectionNet
  24. import torch.nn.functional as F
  25. from ..config.config_tool import read_yaml
  26. FEATURE_DIM = 8
  27. device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  28. __all__ = [
  29. "LineNet",
  30. "LineNet_ResNet50_FPN_Weights",
  31. "LineNet_ResNet50_FPN_V2_Weights",
  32. "LineNet_MobileNet_V3_Large_FPN_Weights",
  33. "LineNet_MobileNet_V3_Large_320_FPN_Weights",
  34. "linenet_resnet50_fpn",
  35. "linenet_resnet50_fpn_v2",
  36. "linenet_mobilenet_v3_large_fpn",
  37. "linenet_mobilenet_v3_large_320_fpn",
  38. ]
  39. def _default_anchorgen():
  40. anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
  41. aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
  42. return AnchorGenerator(anchor_sizes, aspect_ratios)
  43. class LineNet(BaseDetectionNet):
  44. def __init__(self, cfg, **kwargs):
  45. cfg = read_yaml(cfg)
  46. self.cfg=cfg
  47. backbone = cfg['backbone']
  48. num_classes = cfg['num_classes']
  49. if backbone == 'resnet50_fpn':
  50. is_trained = False
  51. trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
  52. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  53. backbone = resnet50(weights=None, progress=True, norm_layer=norm_layer)
  54. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
  55. print(f'out_chanenels:{backbone.out_channels}')
  56. self.construct(backbone=backbone,num_classes=num_classes,**kwargs)
  57. # out_channels = backbone.out_channels
  58. #
  59. # min_size = 512,
  60. # max_size = 1333,
  61. # rpn_pre_nms_top_n_train = 2000,
  62. # rpn_pre_nms_top_n_test = 1000,
  63. # rpn_post_nms_top_n_train = 2000,
  64. # rpn_post_nms_top_n_test = 1000,
  65. # rpn_nms_thresh = 0.7,
  66. # rpn_fg_iou_thresh = 0.7,
  67. # rpn_bg_iou_thresh = 0.3,
  68. # rpn_batch_size_per_image = 256,
  69. # rpn_positive_fraction = 0.5,
  70. # rpn_score_thresh = 0.0,
  71. # box_score_thresh = 0.05,
  72. # box_nms_thresh = 0.5,
  73. # box_detections_per_img = 100,
  74. # box_fg_iou_thresh = 0.5,
  75. # box_bg_iou_thresh = 0.5,
  76. # box_batch_size_per_image = 512,
  77. # box_positive_fraction = 0.25,
  78. # bbox_reg_weights = None,
  79. #
  80. # line_head = LineRCNNHeads(out_channels, 5)
  81. # line_predictor = LineRCNNPredictor(cfg)
  82. # rpn_anchor_generator = _default_anchorgen()
  83. # rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])
  84. # rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
  85. # rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
  86. #
  87. # rpn = RegionProposalNetwork(
  88. # rpn_anchor_generator,
  89. # rpn_head,
  90. # rpn_fg_iou_thresh,
  91. # rpn_bg_iou_thresh,
  92. # rpn_batch_size_per_image,
  93. # rpn_positive_fraction,
  94. # rpn_pre_nms_top_n,
  95. # rpn_post_nms_top_n,
  96. # rpn_nms_thresh,
  97. # score_thresh=rpn_score_thresh,
  98. # )
  99. #
  100. # box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)
  101. #
  102. # resolution = box_roi_pool.output_size[0]
  103. # representation_size = 1024
  104. # box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
  105. # representation_size = 1024
  106. # box_predictor = BoxPredictor(representation_size, num_classes)
  107. #
  108. # roi_heads = RoIHeads(
  109. # # Box
  110. # box_roi_pool,
  111. # box_head,
  112. # box_predictor,
  113. # line_head,
  114. # line_predictor,
  115. # box_fg_iou_thresh,
  116. # box_bg_iou_thresh,
  117. # box_batch_size_per_image,
  118. # box_positive_fraction,
  119. # bbox_reg_weights,
  120. # box_score_thresh,
  121. # box_nms_thresh,
  122. # box_detections_per_img,
  123. # )
  124. # image_mean = [0.485, 0.456, 0.406]
  125. # image_std = [0.229, 0.224, 0.225]
  126. # transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
  127. # super().__init__(backbone, rpn, roi_heads, transform)
  128. # self.roi_heads = roi_heads
  129. def construct(
  130. self,
  131. backbone,
  132. num_classes=None,
  133. # transform parameters
  134. min_size=512,
  135. max_size=1333,
  136. image_mean=None,
  137. image_std=None,
  138. # RPN parameters
  139. rpn_anchor_generator=None,
  140. rpn_head=None,
  141. rpn_pre_nms_top_n_train=2000,
  142. rpn_pre_nms_top_n_test=1000,
  143. rpn_post_nms_top_n_train=2000,
  144. rpn_post_nms_top_n_test=1000,
  145. rpn_nms_thresh=0.7,
  146. rpn_fg_iou_thresh=0.7,
  147. rpn_bg_iou_thresh=0.3,
  148. rpn_batch_size_per_image=256,
  149. rpn_positive_fraction=0.5,
  150. rpn_score_thresh=0.0,
  151. # Box parameters
  152. box_roi_pool=None,
  153. box_head=None,
  154. box_predictor=None,
  155. box_score_thresh=0.05,
  156. box_nms_thresh=0.5,
  157. box_detections_per_img=100,
  158. box_fg_iou_thresh=0.5,
  159. box_bg_iou_thresh=0.5,
  160. box_batch_size_per_image=512,
  161. box_positive_fraction=0.25,
  162. bbox_reg_weights=None,
  163. # line parameters
  164. line_head=None,
  165. line_predictor=None,
  166. **kwargs,
  167. ):
  168. if not hasattr(backbone, "out_channels"):
  169. raise ValueError(
  170. "backbone should contain an attribute out_channels "
  171. "specifying the number of output channels (assumed to be the "
  172. "same for all the levels)"
  173. )
  174. if not isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))):
  175. raise TypeError(
  176. f"rpn_anchor_generator should be of type AnchorGenerator or None instead of {type(rpn_anchor_generator)}"
  177. )
  178. if not isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))):
  179. raise TypeError(
  180. f"box_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(box_roi_pool)}"
  181. )
  182. if num_classes is not None:
  183. if box_predictor is not None:
  184. raise ValueError("num_classes should be None when box_predictor is specified")
  185. else:
  186. if box_predictor is None:
  187. raise ValueError("num_classes should not be None when box_predictor is not specified")
  188. out_channels = backbone.out_channels
  189. if line_head is None:
  190. num_class = 5
  191. line_head = LineRCNNHeads(out_channels, num_class)
  192. if line_predictor is None:
  193. line_predictor = LineRCNNPredictor(self.cfg)
  194. if rpn_anchor_generator is None:
  195. rpn_anchor_generator = _default_anchorgen()
  196. if rpn_head is None:
  197. rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])
  198. rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
  199. rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
  200. rpn = RegionProposalNetwork(
  201. rpn_anchor_generator,
  202. rpn_head,
  203. rpn_fg_iou_thresh,
  204. rpn_bg_iou_thresh,
  205. rpn_batch_size_per_image,
  206. rpn_positive_fraction,
  207. rpn_pre_nms_top_n,
  208. rpn_post_nms_top_n,
  209. rpn_nms_thresh,
  210. score_thresh=rpn_score_thresh,
  211. )
  212. if box_roi_pool is None:
  213. box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)
  214. if box_head is None:
  215. resolution = box_roi_pool.output_size[0]
  216. representation_size = 1024
  217. box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
  218. if box_predictor is None:
  219. representation_size = 1024
  220. box_predictor = BoxPredictor(representation_size, num_classes)
  221. roi_heads = RoIHeads(
  222. # Box
  223. box_roi_pool,
  224. box_head,
  225. box_predictor,
  226. line_head,
  227. line_predictor,
  228. box_fg_iou_thresh,
  229. box_bg_iou_thresh,
  230. box_batch_size_per_image,
  231. box_positive_fraction,
  232. bbox_reg_weights,
  233. box_score_thresh,
  234. box_nms_thresh,
  235. box_detections_per_img,
  236. )
  237. if image_mean is None:
  238. image_mean = [0.485, 0.456, 0.406]
  239. if image_std is None:
  240. image_std = [0.229, 0.224, 0.225]
  241. transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
  242. super().__init__(backbone, rpn, roi_heads, transform)
  243. self.roi_heads = roi_heads
  244. self.roi_heads.line_head = line_head
  245. self.roi_heads.line_predictor = line_predictor
  246. def train_by_cfg(self, cfg):
  247. # cfg = read_yaml(cfg)
  248. self.trainer = Trainer()
  249. self.trainer.train_cfg(model=self,cfg=cfg)
  250. class TwoMLPHead(nn.Module):
  251. """
  252. Standard heads for FPN-based models
  253. Args:
  254. in_channels (int): number of input channels
  255. representation_size (int): size of the intermediate representation
  256. """
  257. def __init__(self, in_channels, representation_size):
  258. super().__init__()
  259. self.fc6 = nn.Linear(in_channels, representation_size)
  260. self.fc7 = nn.Linear(representation_size, representation_size)
  261. def forward(self, x):
  262. x = x.flatten(start_dim=1)
  263. x = F.relu(self.fc6(x))
  264. x = F.relu(self.fc7(x))
  265. return x
  266. class LineNetConvFCHead(nn.Sequential):
  267. def __init__(
  268. self,
  269. input_size: Tuple[int, int, int],
  270. conv_layers: List[int],
  271. fc_layers: List[int],
  272. norm_layer: Optional[Callable[..., nn.Module]] = None,
  273. ):
  274. """
  275. Args:
  276. input_size (Tuple[int, int, int]): the input size in CHW format.
  277. conv_layers (list): feature dimensions of each Convolution layer
  278. fc_layers (list): feature dimensions of each FCN layer
  279. norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
  280. """
  281. in_channels, in_height, in_width = input_size
  282. blocks = []
  283. previous_channels = in_channels
  284. for current_channels in conv_layers:
  285. blocks.append(misc_nn_ops.Conv2dNormActivation(previous_channels, current_channels, norm_layer=norm_layer))
  286. previous_channels = current_channels
  287. blocks.append(nn.Flatten())
  288. previous_channels = previous_channels * in_height * in_width
  289. for current_channels in fc_layers:
  290. blocks.append(nn.Linear(previous_channels, current_channels))
  291. blocks.append(nn.ReLU(inplace=True))
  292. previous_channels = current_channels
  293. super().__init__(*blocks)
  294. for layer in self.modules():
  295. if isinstance(layer, nn.Conv2d):
  296. nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
  297. if layer.bias is not None:
  298. nn.init.zeros_(layer.bias)
  299. class BoxPredictor(nn.Module):
  300. """
  301. Standard classification + bounding box regression layers
  302. for Fast R-CNN.
  303. Args:
  304. in_channels (int): number of input channels
  305. num_classes (int): number of output classes (including background)
  306. """
  307. def __init__(self, in_channels, num_classes):
  308. super().__init__()
  309. self.cls_score = nn.Linear(in_channels, num_classes)
  310. self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
  311. def forward(self, x):
  312. if x.dim() == 4:
  313. torch._assert(
  314. list(x.shape[2:]) == [1, 1],
  315. f"x has the wrong shape, expecting the last two dimensions to be [1,1] instead of {list(x.shape[2:])}",
  316. )
  317. x = x.flatten(start_dim=1)
  318. scores = self.cls_score(x)
  319. bbox_deltas = self.bbox_pred(x)
  320. return scores, bbox_deltas
  321. _COMMON_META = {
  322. "categories": _COCO_CATEGORIES,
  323. "min_size": (1, 1),
  324. }
  325. class LineNet_ResNet50_FPN_Weights(WeightsEnum):
  326. COCO_V1 = Weights(
  327. url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth",
  328. transforms=ObjectDetection,
  329. meta={
  330. **_COMMON_META,
  331. "num_params": 41755286,
  332. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-resnet-50-fpn",
  333. "_metrics": {
  334. "COCO-val2017": {
  335. "box_map": 37.0,
  336. }
  337. },
  338. "_ops": 134.38,
  339. "_file_size": 159.743,
  340. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  341. },
  342. )
  343. DEFAULT = COCO_V1
  344. class LineNet_ResNet50_FPN_V2_Weights(WeightsEnum):
  345. COCO_V1 = Weights(
  346. url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth",
  347. transforms=ObjectDetection,
  348. meta={
  349. **_COMMON_META,
  350. "num_params": 43712278,
  351. "recipe": "https://github.com/pytorch/vision/pull/5763",
  352. "_metrics": {
  353. "COCO-val2017": {
  354. "box_map": 46.7,
  355. }
  356. },
  357. "_ops": 280.371,
  358. "_file_size": 167.104,
  359. "_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
  360. },
  361. )
  362. DEFAULT = COCO_V1
  363. class LineNet_MobileNet_V3_Large_FPN_Weights(WeightsEnum):
  364. COCO_V1 = Weights(
  365. url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth",
  366. transforms=ObjectDetection,
  367. meta={
  368. **_COMMON_META,
  369. "num_params": 19386354,
  370. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-fpn",
  371. "_metrics": {
  372. "COCO-val2017": {
  373. "box_map": 32.8,
  374. }
  375. },
  376. "_ops": 4.494,
  377. "_file_size": 74.239,
  378. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  379. },
  380. )
  381. DEFAULT = COCO_V1
  382. class LineNet_MobileNet_V3_Large_320_FPN_Weights(WeightsEnum):
  383. COCO_V1 = Weights(
  384. url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_320_fpn-907ea3f9.pth",
  385. transforms=ObjectDetection,
  386. meta={
  387. **_COMMON_META,
  388. "num_params": 19386354,
  389. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-320-fpn",
  390. "_metrics": {
  391. "COCO-val2017": {
  392. "box_map": 22.8,
  393. }
  394. },
  395. "_ops": 0.719,
  396. "_file_size": 74.239,
  397. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  398. },
  399. )
  400. DEFAULT = COCO_V1
  401. @register_model()
  402. @handle_legacy_interface(
  403. weights=("pretrained", LineNet_ResNet50_FPN_Weights.COCO_V1),
  404. weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
  405. )
  406. def linenet_resnet50_fpn(
  407. *,
  408. weights: Optional[LineNet_ResNet50_FPN_Weights] = None,
  409. progress: bool = True,
  410. num_classes: Optional[int] = None,
  411. weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
  412. trainable_backbone_layers: Optional[int] = None,
  413. **kwargs: Any,
  414. ) -> LineNet:
  415. """
  416. Faster R-CNN model with a ResNet-50-FPN backbone from the `Faster R-CNN: Towards Real-Time Object
  417. Detection with Region Proposal Networks <https://arxiv.org/abs/1506.01497>`__
  418. paper.
  419. .. betastatus:: detection module
  420. The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  421. image, and should be in ``0-1`` range. Different images can have different sizes.
  422. The behavior of the model changes depending on if it is in training or evaluation mode.
  423. During training, the model expects both the input tensors and a targets (list of dictionary),
  424. containing:
  425. - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  426. ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
  427. - labels (``Int64Tensor[N]``): the class label for each ground-truth box
  428. The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
  429. losses for both the RPN and the R-CNN.
  430. During inference, the model requires only the input tensors, and returns the post-processed
  431. predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
  432. follows, where ``N`` is the number of detections:
  433. - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
  434. ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
  435. - labels (``Int64Tensor[N]``): the predicted labels for each detection
  436. - scores (``Tensor[N]``): the scores of each detection
  437. For more details on the output, you may refer to :ref:`instance_seg_output`.
  438. Faster R-CNN is exportable to ONNX for a fixed batch size with inputs images of fixed size.
  439. Example::
  440. >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
  441. >>> # For training
  442. >>> images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
  443. >>> boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]
  444. >>> labels = torch.randint(1, 91, (4, 11))
  445. >>> images = list(image for image in images)
  446. >>> targets = []
  447. >>> for i in range(len(images)):
  448. >>> d = {}
  449. >>> d['boxes'] = boxes[i]
  450. >>> d['labels'] = labels[i]
  451. >>> targets.append(d)
  452. >>> output = model(images, targets)
  453. >>> # For inference
  454. >>> model.eval()
  455. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  456. >>> predictions = model(x)
  457. >>>
  458. >>> # optionally, if you want to export the model to ONNX:
  459. >>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)
  460. Args:
  461. weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights`, optional): The
  462. pretrained weights to use. See
  463. :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights` below for
  464. more details, and possible values. By default, no pre-trained
  465. weights are used.
  466. progress (bool, optional): If True, displays a progress bar of the
  467. download to stderr. Default is True.
  468. num_classes (int, optional): number of output classes of the model (including the background)
  469. weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
  470. pretrained weights for the backbone.
  471. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  472. final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
  473. trainable. If ``None`` is passed (the default) this value is set to 3.
  474. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  475. base class. Please refer to the `source code
  476. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  477. for more details about this class.
  478. .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights
  479. :members:
  480. """
  481. weights = LineNet_ResNet50_FPN_Weights.verify(weights)
  482. weights_backbone = ResNet50_Weights.verify(weights_backbone)
  483. if weights is not None:
  484. weights_backbone = None
  485. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  486. elif num_classes is None:
  487. num_classes = 91
  488. is_trained = weights is not None or weights_backbone is not None
  489. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
  490. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  491. backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
  492. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
  493. model = LineNet(backbone, num_classes=num_classes, **kwargs)
  494. if weights is not None:
  495. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  496. if weights == LineNet_ResNet50_FPN_Weights.COCO_V1:
  497. overwrite_eps(model, 0.0)
  498. return model
  499. @register_model()
  500. @handle_legacy_interface(
  501. weights=("pretrained", LineNet_ResNet50_FPN_V2_Weights.COCO_V1),
  502. weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
  503. )
  504. def linenet_resnet50_fpn_v2(
  505. *,
  506. weights: Optional[LineNet_ResNet50_FPN_V2_Weights] = None,
  507. progress: bool = True,
  508. num_classes: Optional[int] = None,
  509. weights_backbone: Optional[ResNet50_Weights] = None,
  510. trainable_backbone_layers: Optional[int] = None,
  511. **kwargs: Any,
  512. ) -> LineNet:
  513. """
  514. Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone from `Benchmarking Detection
  515. Transfer Learning with Vision Transformers <https://arxiv.org/abs/2111.11429>`__ paper.
  516. .. betastatus:: detection module
  517. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  518. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  519. details.
  520. Args:
  521. weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights`, optional): The
  522. pretrained weights to use. See
  523. :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights` below for
  524. more details, and possible values. By default, no pre-trained
  525. weights are used.
  526. progress (bool, optional): If True, displays a progress bar of the
  527. download to stderr. Default is True.
  528. num_classes (int, optional): number of output classes of the model (including the background)
  529. weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
  530. pretrained weights for the backbone.
  531. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  532. final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
  533. trainable. If ``None`` is passed (the default) this value is set to 3.
  534. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  535. base class. Please refer to the `source code
  536. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  537. for more details about this class.
  538. .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights
  539. :members:
  540. """
  541. weights = LineNet_ResNet50_FPN_V2_Weights.verify(weights)
  542. weights_backbone = ResNet50_Weights.verify(weights_backbone)
  543. if weights is not None:
  544. weights_backbone = None
  545. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  546. elif num_classes is None:
  547. num_classes = 91
  548. is_trained = weights is not None or weights_backbone is not None
  549. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
  550. backbone = resnet50(weights=weights_backbone, progress=progress)
  551. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
  552. rpn_anchor_generator = _default_anchorgen()
  553. rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
  554. box_head = LineNetConvFCHead(
  555. (backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
  556. )
  557. model = LineNet(
  558. backbone,
  559. num_classes=num_classes,
  560. rpn_anchor_generator=rpn_anchor_generator,
  561. rpn_head=rpn_head,
  562. box_head=box_head,
  563. **kwargs,
  564. )
  565. if weights is not None:
  566. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  567. return model
  568. def _linenet_mobilenet_v3_large_fpn(
  569. *,
  570. weights: Optional[Union[LineNet_MobileNet_V3_Large_FPN_Weights, LineNet_MobileNet_V3_Large_320_FPN_Weights]],
  571. progress: bool,
  572. num_classes: Optional[int],
  573. weights_backbone: Optional[MobileNet_V3_Large_Weights],
  574. trainable_backbone_layers: Optional[int],
  575. **kwargs: Any,
  576. ) -> LineNet:
  577. if weights is not None:
  578. weights_backbone = None
  579. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  580. elif num_classes is None:
  581. num_classes = 91
  582. is_trained = weights is not None or weights_backbone is not None
  583. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 6, 3)
  584. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  585. backbone = mobilenet_v3_large(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
  586. backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
  587. anchor_sizes = (
  588. (
  589. 32,
  590. 64,
  591. 128,
  592. 256,
  593. 512,
  594. ),
  595. ) * 3
  596. aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
  597. model = LineNet(
  598. backbone, num_classes, rpn_anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs
  599. )
  600. if weights is not None:
  601. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  602. return model
  603. @register_model()
  604. @handle_legacy_interface(
  605. weights=("pretrained", LineNet_MobileNet_V3_Large_320_FPN_Weights.COCO_V1),
  606. weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
  607. )
  608. def linenet_mobilenet_v3_large_320_fpn(
  609. *,
  610. weights: Optional[LineNet_MobileNet_V3_Large_320_FPN_Weights] = None,
  611. progress: bool = True,
  612. num_classes: Optional[int] = None,
  613. weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
  614. trainable_backbone_layers: Optional[int] = None,
  615. **kwargs: Any,
  616. ) -> LineNet:
  617. """
  618. Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
  619. .. betastatus:: detection module
  620. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  621. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  622. details.
  623. Example::
  624. >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights=FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT)
  625. >>> model.eval()
  626. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  627. >>> predictions = model(x)
  628. Args:
  629. weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights`, optional): The
  630. pretrained weights to use. See
  631. :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights` below for
  632. more details, and possible values. By default, no pre-trained
  633. weights are used.
  634. progress (bool, optional): If True, displays a progress bar of the
  635. download to stderr. Default is True.
  636. num_classes (int, optional): number of output classes of the model (including the background)
  637. weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
  638. pretrained weights for the backbone.
  639. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  640. final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
  641. trainable. If ``None`` is passed (the default) this value is set to 3.
  642. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  643. base class. Please refer to the `source code
  644. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  645. for more details about this class.
  646. .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
  647. :members:
  648. """
  649. weights = LineNet_MobileNet_V3_Large_320_FPN_Weights.verify(weights)
  650. weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
  651. defaults = {
  652. "min_size": 320,
  653. "max_size": 640,
  654. "rpn_pre_nms_top_n_test": 150,
  655. "rpn_post_nms_top_n_test": 150,
  656. "rpn_score_thresh": 0.05,
  657. }
  658. kwargs = {**defaults, **kwargs}
  659. return _linenet_mobilenet_v3_large_fpn(
  660. weights=weights,
  661. progress=progress,
  662. num_classes=num_classes,
  663. weights_backbone=weights_backbone,
  664. trainable_backbone_layers=trainable_backbone_layers,
  665. **kwargs,
  666. )
  667. @register_model()
  668. @handle_legacy_interface(
  669. weights=("pretrained", LineNet_MobileNet_V3_Large_FPN_Weights.COCO_V1),
  670. weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
  671. )
  672. def linenet_mobilenet_v3_large_fpn(
  673. *,
  674. weights: Optional[LineNet_MobileNet_V3_Large_FPN_Weights] = None,
  675. progress: bool = True,
  676. num_classes: Optional[int] = None,
  677. weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
  678. trainable_backbone_layers: Optional[int] = None,
  679. **kwargs: Any,
  680. ) -> LineNet:
  681. """
  682. Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
  683. .. betastatus:: detection module
  684. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  685. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  686. details.
  687. Example::
  688. >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT)
  689. >>> model.eval()
  690. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  691. >>> predictions = model(x)
  692. Args:
  693. weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights`, optional): The
  694. pretrained weights to use. See
  695. :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights` below for
  696. more details, and possible values. By default, no pre-trained
  697. weights are used.
  698. progress (bool, optional): If True, displays a progress bar of the
  699. download to stderr. Default is True.
  700. num_classes (int, optional): number of output classes of the model (including the background)
  701. weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
  702. pretrained weights for the backbone.
  703. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  704. final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
  705. trainable. If ``None`` is passed (the default) this value is set to 3.
  706. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  707. base class. Please refer to the `source code
  708. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  709. for more details about this class.
  710. .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights
  711. :members:
  712. """
  713. weights = LineNet_MobileNet_V3_Large_FPN_Weights.verify(weights)
  714. weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
  715. defaults = {
  716. "rpn_score_thresh": 0.05,
  717. }
  718. kwargs = {**defaults, **kwargs}
  719. return _linenet_mobilenet_v3_large_fpn(
  720. weights=weights,
  721. progress=progress,
  722. num_classes=num_classes,
  723. weights_backbone=weights_backbone,
  724. trainable_backbone_layers=trainable_backbone_layers,
  725. **kwargs,
  726. )