line_net.py 36 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919
  1. import os
  2. from typing import Any, Callable, List, Optional, Tuple, Union
  3. import torch
  4. from torch import nn
  5. from torchvision.ops import MultiScaleRoIAlign
  6. from libs.vision_libs import ops
  7. from libs.vision_libs.models import MobileNet_V3_Large_Weights, mobilenet_v3_large, EfficientNet_V2_S_Weights, \
  8. efficientnet_v2_s, detection
  9. from libs.vision_libs.models.detection.anchor_utils import AnchorGenerator
  10. from libs.vision_libs.models.detection.rpn import RPNHead, RegionProposalNetwork
  11. from libs.vision_libs.models.detection.ssdlite import _mobilenet_extractor
  12. from libs.vision_libs.models.detection.transform import GeneralizedRCNNTransform
  13. from libs.vision_libs.ops import misc as misc_nn_ops
  14. from libs.vision_libs.transforms._presets import ObjectDetection
  15. from .line_head import LineRCNNHeads
  16. from .line_predictor import LineRCNNPredictor
  17. from libs.vision_libs.models._api import register_model, Weights, WeightsEnum
  18. from libs.vision_libs.models._meta import _COCO_PERSON_CATEGORIES, _COCO_PERSON_KEYPOINT_NAMES, _COCO_CATEGORIES
  19. from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
  20. from libs.vision_libs.models.resnet import resnet50, ResNet50_Weights, ResNet18_Weights, resnet18
  21. from libs.vision_libs.models.detection._utils import overwrite_eps
  22. from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers, \
  23. BackboneWithFPN
  24. from libs.vision_libs.models.detection.faster_rcnn import FasterRCNN, TwoMLPHead, FastRCNNPredictor
  25. from .roi_heads import RoIHeads
  26. from .trainer import Trainer
  27. from ..base import backbone_factory
  28. from ..base.base_detection_net import BaseDetectionNet
  29. import torch.nn.functional as F
  30. from .predict import Predict1, Predict
  31. from ..config.config_tool import read_yaml
  32. FEATURE_DIM = 8
  33. device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  34. __all__ = [
  35. "LineNet",
  36. "LineNet_ResNet50_FPN_Weights",
  37. "LineNet_ResNet50_FPN_V2_Weights",
  38. "LineNet_MobileNet_V3_Large_FPN_Weights",
  39. "LineNet_MobileNet_V3_Large_320_FPN_Weights",
  40. "linenet_resnet50_fpn",
  41. "linenet_resnet50_fpn_v2",
  42. "linenet_mobilenet_v3_large_fpn",
  43. "linenet_mobilenet_v3_large_320_fpn",
  44. ]
  45. def _default_anchorgen():
  46. anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
  47. aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
  48. return AnchorGenerator(anchor_sizes, aspect_ratios)
  49. class LineNet(BaseDetectionNet):
  50. # def __init__(self, cfg, **kwargs):
  51. # cfg = read_yaml(cfg)
  52. # self.cfg=cfg
  53. # backbone = cfg['backbone']
  54. # print(f'LineNet Backbone:{backbone}')
  55. # num_classes = cfg['num_classes']
  56. #
  57. # if backbone == 'resnet50_fpn':
  58. # backbone=backbone_factory.get_resnet50_fpn()
  59. # print(f'out_chanenels:{backbone.out_channels}')
  60. # elif backbone== 'mobilenet_v3_large_fpn':
  61. # backbone=backbone_factory.get_mobilenet_v3_large_fpn()
  62. # elif backbone=='resnet18_fpn':
  63. # backbone=backbone_factory.get_resnet18_fpn()
  64. #
  65. # self.__construct__(backbone=backbone, num_classes=num_classes, **kwargs)
  66. def __init__(
  67. self,
  68. backbone,
  69. num_classes=None,
  70. # transform parameters
  71. min_size=512,
  72. max_size=1333,
  73. image_mean=None,
  74. image_std=None,
  75. # RPN parameters
  76. rpn_anchor_generator=None,
  77. rpn_head=None,
  78. rpn_pre_nms_top_n_train=2000,
  79. rpn_pre_nms_top_n_test=1000,
  80. rpn_post_nms_top_n_train=2000,
  81. rpn_post_nms_top_n_test=1000,
  82. rpn_nms_thresh=0.7,
  83. rpn_fg_iou_thresh=0.7,
  84. rpn_bg_iou_thresh=0.3,
  85. rpn_batch_size_per_image=256,
  86. rpn_positive_fraction=0.5,
  87. rpn_score_thresh=0.0,
  88. # Box parameters
  89. box_roi_pool=None,
  90. box_head=None,
  91. box_predictor=None,
  92. box_score_thresh=0.05,
  93. box_nms_thresh=0.5,
  94. box_detections_per_img=100,
  95. box_fg_iou_thresh=0.5,
  96. box_bg_iou_thresh=0.5,
  97. box_batch_size_per_image=512,
  98. box_positive_fraction=0.25,
  99. bbox_reg_weights=None,
  100. # line parameters
  101. line_head=None,
  102. line_predictor=None,
  103. **kwargs,
  104. ):
  105. if not hasattr(backbone, "out_channels"):
  106. raise ValueError(
  107. "backbone should contain an attribute out_channels "
  108. "specifying the number of output channels (assumed to be the "
  109. "same for all the levels)"
  110. )
  111. if not isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))):
  112. raise TypeError(
  113. f"rpn_anchor_generator should be of type AnchorGenerator or None instead of {type(rpn_anchor_generator)}"
  114. )
  115. if not isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))):
  116. raise TypeError(
  117. f"box_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(box_roi_pool)}"
  118. )
  119. if num_classes is not None:
  120. if box_predictor is not None:
  121. raise ValueError("num_classes should be None when box_predictor is specified")
  122. else:
  123. if box_predictor is None:
  124. raise ValueError("num_classes should not be None when box_predictor is not specified")
  125. out_channels = backbone.out_channels
  126. # cfg = read_yaml(cfg)
  127. # self.cfg=cfg
  128. if line_head is None:
  129. num_class = 5
  130. line_head = LineRCNNHeads(out_channels, num_class)
  131. if line_predictor is None:
  132. line_predictor = LineRCNNPredictor()
  133. if rpn_anchor_generator is None:
  134. rpn_anchor_generator = _default_anchorgen()
  135. if rpn_head is None:
  136. rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])
  137. rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
  138. rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
  139. rpn = RegionProposalNetwork(
  140. rpn_anchor_generator,
  141. rpn_head,
  142. rpn_fg_iou_thresh,
  143. rpn_bg_iou_thresh,
  144. rpn_batch_size_per_image,
  145. rpn_positive_fraction,
  146. rpn_pre_nms_top_n,
  147. rpn_post_nms_top_n,
  148. rpn_nms_thresh,
  149. score_thresh=rpn_score_thresh,
  150. )
  151. if box_roi_pool is None:
  152. box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)
  153. if box_head is None:
  154. resolution = box_roi_pool.output_size[0]
  155. representation_size = 1024
  156. box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
  157. if box_predictor is None:
  158. representation_size = 1024
  159. box_predictor = BoxPredictor(representation_size, num_classes)
  160. roi_heads = RoIHeads(
  161. # Box
  162. box_roi_pool,
  163. box_head,
  164. box_predictor,
  165. line_head,
  166. line_predictor,
  167. box_fg_iou_thresh,
  168. box_bg_iou_thresh,
  169. box_batch_size_per_image,
  170. box_positive_fraction,
  171. bbox_reg_weights,
  172. box_score_thresh,
  173. box_nms_thresh,
  174. box_detections_per_img,
  175. )
  176. if image_mean is None:
  177. image_mean = [0.485, 0.456, 0.406]
  178. if image_std is None:
  179. image_std = [0.229, 0.224, 0.225]
  180. transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)
  181. super().__init__(backbone, rpn, roi_heads, transform)
  182. self.roi_heads = roi_heads
  183. # self.roi_heads.line_head = line_head
  184. # self.roi_heads.line_predictor = line_predictor
  185. def train_by_cfg(self, cfg):
  186. # cfg = read_yaml(cfg)
  187. self.trainer = Trainer()
  188. self.trainer.train_cfg(model=self, cfg=cfg)
  189. def load_best_model(self,model, save_path, device='cuda'):
  190. if os.path.exists(save_path):
  191. checkpoint = torch.load(save_path, map_location=device)
  192. model.load_state_dict(checkpoint['model_state_dict'])
  193. # if optimizer is not None:
  194. # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  195. # epoch = checkpoint['epoch']
  196. # loss = checkpoint['loss']
  197. # print(f"Loaded best model from {save_path} at epoch {epoch} with loss {loss:.4f}")
  198. print(f"Loaded model from {save_path}")
  199. else:
  200. print(f"No saved model found at {save_path}")
  201. return model
  202. # 加载权重和推理一起
  203. def predict(self, pt_path, model, img_path, type=0, threshold=0.5, save_path=None, show=False):
  204. self.predict = Predict(pt_path, model, img_path, type, threshold, save_path, show)
  205. self.predict.run()
  206. # 不加载权重
  207. def predict1(self, model, img_path, type=0, threshold=0.5, save_path=None, show=False):
  208. self.predict = Predict1(model, img_path, type, threshold, save_path, show)
  209. self.predict.run()
  210. class TwoMLPHead(nn.Module):
  211. """
  212. Standard heads for FPN-based models
  213. Args:
  214. in_channels (int): number of input channels
  215. representation_size (int): size of the intermediate representation
  216. """
  217. def __init__(self, in_channels, representation_size):
  218. super().__init__()
  219. self.fc6 = nn.Linear(in_channels, representation_size)
  220. self.fc7 = nn.Linear(representation_size, representation_size)
  221. def forward(self, x):
  222. x = x.flatten(start_dim=1)
  223. x = F.relu(self.fc6(x))
  224. x = F.relu(self.fc7(x))
  225. return x
  226. class LineNetConvFCHead(nn.Sequential):
  227. def __init__(
  228. self,
  229. input_size: Tuple[int, int, int],
  230. conv_layers: List[int],
  231. fc_layers: List[int],
  232. norm_layer: Optional[Callable[..., nn.Module]] = None,
  233. ):
  234. """
  235. Args:
  236. input_size (Tuple[int, int, int]): the input size in CHW format.
  237. conv_layers (list): feature dimensions of each Convolution layer
  238. fc_layers (list): feature dimensions of each FCN layer
  239. norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
  240. """
  241. in_channels, in_height, in_width = input_size
  242. blocks = []
  243. previous_channels = in_channels
  244. for current_channels in conv_layers:
  245. blocks.append(misc_nn_ops.Conv2dNormActivation(previous_channels, current_channels, norm_layer=norm_layer))
  246. previous_channels = current_channels
  247. blocks.append(nn.Flatten())
  248. previous_channels = previous_channels * in_height * in_width
  249. for current_channels in fc_layers:
  250. blocks.append(nn.Linear(previous_channels, current_channels))
  251. blocks.append(nn.ReLU(inplace=True))
  252. previous_channels = current_channels
  253. super().__init__(*blocks)
  254. for layer in self.modules():
  255. if isinstance(layer, nn.Conv2d):
  256. nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
  257. if layer.bias is not None:
  258. nn.init.zeros_(layer.bias)
  259. class BoxPredictor(nn.Module):
  260. """
  261. Standard classification + bounding box regression layers
  262. for Fast R-CNN.
  263. Args:
  264. in_channels (int): number of input channels
  265. num_classes (int): number of output classes (including background)
  266. """
  267. def __init__(self, in_channels, num_classes):
  268. super().__init__()
  269. self.cls_score = nn.Linear(in_channels, num_classes)
  270. self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
  271. def forward(self, x):
  272. if x.dim() == 4:
  273. torch._assert(
  274. list(x.shape[2:]) == [1, 1],
  275. f"x has the wrong shape, expecting the last two dimensions to be [1,1] instead of {list(x.shape[2:])}",
  276. )
  277. x = x.flatten(start_dim=1)
  278. scores = self.cls_score(x)
  279. bbox_deltas = self.bbox_pred(x)
  280. return scores, bbox_deltas
  281. _COMMON_META = {
  282. "categories": _COCO_CATEGORIES,
  283. "min_size": (1, 1),
  284. }
  285. def create_efficientnetv2_backbone(name='efficientnet_v2_s', pretrained=True):
  286. # 加载EfficientNetV2模型
  287. if name == 'efficientnet_v2_s':
  288. weights = EfficientNet_V2_S_Weights.IMAGENET1K_V1 if pretrained else None
  289. backbone = efficientnet_v2_s(weights=weights).features
  290. # 定义返回的层索引和名称
  291. return_layers = {"2": "0", "3": "1", "4": "2", "5": "3"}
  292. # 获取每个层输出通道数
  293. in_channels_list = []
  294. for layer_idx in [2, 3, 4, 5]:
  295. module = backbone[layer_idx]
  296. if hasattr(module, 'out_channels'):
  297. in_channels_list.append(module.out_channels)
  298. elif hasattr(module[-1], 'out_channels'):
  299. # 如果module本身没有out_channels,检查最后一个子模块
  300. in_channels_list.append(module[-1].out_channels)
  301. else:
  302. raise ValueError(f"Cannot determine out_channels for layer {layer_idx}")
  303. # 使用BackboneWithFPN包装backbone
  304. backbone_with_fpn = BackboneWithFPN(
  305. backbone=backbone,
  306. return_layers=return_layers,
  307. in_channels_list=in_channels_list,
  308. out_channels=256
  309. )
  310. return backbone_with_fpn
  311. def get_line_net_efficientnetv2(num_classes, pretrained_backbone=True):
  312. # 创建EfficientNetV2 backbone
  313. backbone = create_efficientnetv2_backbone(pretrained=pretrained_backbone)
  314. # 确认 backbone 输出特征图数量
  315. # with torch.no_grad():
  316. # images = torch.rand(1,3, 600, 800)
  317. # features = backbone(images)
  318. # featmap_names = list(features.keys())
  319. # print("Feature map names:", featmap_names) # 例如 ['0', '1', '2', '3']
  320. # 根据实际特征层数量设置 anchors
  321. # num_levels = len(featmap_names)
  322. num_levels=5
  323. featmap_names= ['0', '1', '2', '3', 'pool']
  324. anchor_sizes = tuple((int(16 * 2 ** i),) for i in range(num_levels)) # 自动生成不同大小
  325. aspect_ratios = ((0.5, 1.0, 2.0),) * num_levels # 所有层共享相同比例
  326. anchor_generator = AnchorGenerator(
  327. sizes=anchor_sizes,
  328. aspect_ratios=aspect_ratios
  329. )
  330. # ROI Pooling
  331. roi_pooler = MultiScaleRoIAlign(
  332. featmap_names=featmap_names,
  333. output_size=7,
  334. sampling_ratio=2
  335. )
  336. # 构建模型
  337. model = LineNet(
  338. backbone=backbone,
  339. num_classes=num_classes,
  340. rpn_anchor_generator=anchor_generator,
  341. box_roi_pool=roi_pooler
  342. )
  343. return model
  344. class LineNet_ResNet50_FPN_Weights(WeightsEnum):
  345. COCO_V1 = Weights(
  346. url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth",
  347. transforms=ObjectDetection,
  348. meta={
  349. **_COMMON_META,
  350. "num_params": 41755286,
  351. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-resnet-50-fpn",
  352. "_metrics": {
  353. "COCO-val2017": {
  354. "box_map": 37.0,
  355. }
  356. },
  357. "_ops": 134.38,
  358. "_file_size": 159.743,
  359. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  360. },
  361. )
  362. DEFAULT = COCO_V1
  363. class LineNet_ResNet50_FPN_V2_Weights(WeightsEnum):
  364. COCO_V1 = Weights(
  365. url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth",
  366. transforms=ObjectDetection,
  367. meta={
  368. **_COMMON_META,
  369. "num_params": 43712278,
  370. "recipe": "https://github.com/pytorch/vision/pull/5763",
  371. "_metrics": {
  372. "COCO-val2017": {
  373. "box_map": 46.7,
  374. }
  375. },
  376. "_ops": 280.371,
  377. "_file_size": 167.104,
  378. "_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
  379. },
  380. )
  381. DEFAULT = COCO_V1
  382. class LineNet_MobileNet_V3_Large_FPN_Weights(WeightsEnum):
  383. COCO_V1 = Weights(
  384. url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth",
  385. transforms=ObjectDetection,
  386. meta={
  387. **_COMMON_META,
  388. "num_params": 19386354,
  389. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-fpn",
  390. "_metrics": {
  391. "COCO-val2017": {
  392. "box_map": 32.8,
  393. }
  394. },
  395. "_ops": 4.494,
  396. "_file_size": 74.239,
  397. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  398. },
  399. )
  400. DEFAULT = COCO_V1
  401. class LineNet_MobileNet_V3_Large_320_FPN_Weights(WeightsEnum):
  402. COCO_V1 = Weights(
  403. url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_320_fpn-907ea3f9.pth",
  404. transforms=ObjectDetection,
  405. meta={
  406. **_COMMON_META,
  407. "num_params": 19386354,
  408. "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-320-fpn",
  409. "_metrics": {
  410. "COCO-val2017": {
  411. "box_map": 22.8,
  412. }
  413. },
  414. "_ops": 0.719,
  415. "_file_size": 74.239,
  416. "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
  417. },
  418. )
  419. DEFAULT = COCO_V1
  420. @register_model()
  421. @handle_legacy_interface(
  422. weights=("pretrained", LineNet_ResNet50_FPN_Weights.COCO_V1),
  423. weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
  424. )
  425. def linenet_resnet18_fpn(
  426. *,
  427. weights: Optional[LineNet_ResNet50_FPN_Weights] = None,
  428. progress: bool = True,
  429. num_classes: Optional[int] = None,
  430. weights_backbone: Optional[ResNet18_Weights] = ResNet18_Weights.IMAGENET1K_V1,
  431. trainable_backbone_layers: Optional[int] = None,
  432. **kwargs: Any,
  433. ) -> LineNet:
  434. # weights = LineNet_ResNet50_FPN_Weights.verify(weights)
  435. # weights_backbone = ResNet50_Weights.verify(weights_backbone)
  436. if weights is not None:
  437. weights_backbone = None
  438. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  439. elif num_classes is None:
  440. num_classes = 91
  441. if weights_backbone is not None:
  442. print(f'resnet50 weights is not None')
  443. is_trained = weights is not None or weights_backbone is not None
  444. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
  445. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  446. backbone = resnet18(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
  447. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
  448. model = LineNet(backbone, num_classes=num_classes, **kwargs)
  449. if weights is not None:
  450. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  451. if weights == LineNet_ResNet50_FPN_Weights.COCO_V1:
  452. overwrite_eps(model, 0.0)
  453. return model
  454. def linenet_resnet50_fpn(
  455. *,
  456. weights: Optional[LineNet_ResNet50_FPN_Weights] = None,
  457. progress: bool = True,
  458. num_classes: Optional[int] = None,
  459. weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
  460. trainable_backbone_layers: Optional[int] = None,
  461. **kwargs: Any,
  462. ) -> LineNet:
  463. """
  464. Faster R-CNN model with a ResNet-50-FPN backbone from the `Faster R-CNN: Towards Real-Time Object
  465. Detection with Region Proposal Networks <https://arxiv.org/abs/1506.01497>`__
  466. paper.
  467. .. betastatus:: detection module
  468. The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  469. image, and should be in ``0-1`` range. Different images can have different sizes.
  470. The behavior of the model changes depending on if it is in training or evaluation mode.
  471. During training, the model expects both the input tensors and a targets (list of dictionary),
  472. containing:
  473. - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  474. ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
  475. - labels (``Int64Tensor[N]``): the class label for each ground-truth box
  476. The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
  477. losses for both the RPN and the R-CNN.
  478. During inference, the model requires only the input tensors, and returns the post-processed
  479. predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
  480. follows, where ``N`` is the number of detections:
  481. - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
  482. ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
  483. - labels (``Int64Tensor[N]``): the predicted labels for each detection
  484. - scores (``Tensor[N]``): the scores of each detection
  485. For more details on the output, you may refer to :ref:`instance_seg_output`.
  486. Faster R-CNN is exportable to ONNX for a fixed batch size with inputs images of fixed size.
  487. Example::
  488. >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
  489. >>> # For training
  490. >>> images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
  491. >>> boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]
  492. >>> labels = torch.randint(1, 91, (4, 11))
  493. >>> images = list(image for image in images)
  494. >>> targets = []
  495. >>> for i in range(len(images)):
  496. >>> d = {}
  497. >>> d['boxes'] = boxes[i]
  498. >>> d['labels'] = labels[i]
  499. >>> targets.append(d)
  500. >>> output = model(images, targets)
  501. >>> # For inference
  502. >>> model.eval()
  503. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  504. >>> predictions = model(x)
  505. >>>
  506. >>> # optionally, if you want to export the model to ONNX:
  507. >>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)
  508. Args:
  509. weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights`, optional): The
  510. pretrained weights to use. See
  511. :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights` below for
  512. more details, and possible values. By default, no pre-trained
  513. weights are used.
  514. progress (bool, optional): If True, displays a progress bar of the
  515. download to stderr. Default is True.
  516. num_classes (int, optional): number of output classes of the model (including the background)
  517. weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
  518. pretrained weights for the backbone.
  519. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  520. final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
  521. trainable. If ``None`` is passed (the default) this value is set to 3.
  522. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  523. base class. Please refer to the `source code
  524. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  525. for more details about this class.
  526. .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights
  527. :members:
  528. """
  529. weights = LineNet_ResNet50_FPN_Weights.verify(weights)
  530. weights_backbone = ResNet50_Weights.verify(weights_backbone)
  531. if weights is not None:
  532. weights_backbone = None
  533. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  534. elif num_classes is None:
  535. num_classes = 91
  536. if weights_backbone is not None:
  537. print(f'resnet50 weights is not None')
  538. is_trained = weights is not None or weights_backbone is not None
  539. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
  540. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  541. backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
  542. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
  543. model = LineNet(backbone, num_classes=num_classes, **kwargs)
  544. if weights is not None:
  545. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  546. if weights == LineNet_ResNet50_FPN_Weights.COCO_V1:
  547. overwrite_eps(model, 0.0)
  548. return model
  549. @register_model()
  550. @handle_legacy_interface(
  551. weights=("pretrained", LineNet_ResNet50_FPN_V2_Weights.COCO_V1),
  552. weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
  553. )
  554. def linenet_resnet50_fpn_v2(
  555. *,
  556. weights: Optional[LineNet_ResNet50_FPN_V2_Weights] = None,
  557. progress: bool = True,
  558. num_classes: Optional[int] = None,
  559. weights_backbone: Optional[ResNet50_Weights] = None,
  560. trainable_backbone_layers: Optional[int] = None,
  561. **kwargs: Any,
  562. ) -> LineNet:
  563. """
  564. Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone from `Benchmarking Detection
  565. Transfer Learning with Vision Transformers <https://arxiv.org/abs/2111.11429>`__ paper.
  566. .. betastatus:: detection module
  567. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  568. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  569. details.
  570. Args:
  571. weights (:class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights`, optional): The
  572. pretrained weights to use. See
  573. :class:`~torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights` below for
  574. more details, and possible values. By default, no pre-trained
  575. weights are used.
  576. progress (bool, optional): If True, displays a progress bar of the
  577. download to stderr. Default is True.
  578. num_classes (int, optional): number of output classes of the model (including the background)
  579. weights_backbone (:class:`~torchvision.models.ResNet50_Weights`, optional): The
  580. pretrained weights for the backbone.
  581. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  582. final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
  583. trainable. If ``None`` is passed (the default) this value is set to 3.
  584. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  585. base class. Please refer to the `source code
  586. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  587. for more details about this class.
  588. .. autoclass:: torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights
  589. :members:
  590. """
  591. weights = LineNet_ResNet50_FPN_V2_Weights.verify(weights)
  592. weights_backbone = ResNet50_Weights.verify(weights_backbone)
  593. if weights is not None:
  594. weights_backbone = None
  595. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  596. elif num_classes is None:
  597. num_classes = 91
  598. is_trained = weights is not None or weights_backbone is not None
  599. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
  600. backbone = resnet50(weights=weights_backbone, progress=progress)
  601. backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
  602. rpn_anchor_generator = _default_anchorgen()
  603. rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
  604. box_head = LineNetConvFCHead(
  605. (backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
  606. )
  607. model = LineNet(
  608. backbone,
  609. num_classes=num_classes,
  610. rpn_anchor_generator=rpn_anchor_generator,
  611. rpn_head=rpn_head,
  612. box_head=box_head,
  613. **kwargs,
  614. )
  615. if weights is not None:
  616. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  617. return model
  618. def _linenet_mobilenet_v3_large_fpn(
  619. *,
  620. weights: Optional[Union[LineNet_MobileNet_V3_Large_FPN_Weights, LineNet_MobileNet_V3_Large_320_FPN_Weights]],
  621. progress: bool,
  622. num_classes: Optional[int],
  623. weights_backbone: Optional[MobileNet_V3_Large_Weights],
  624. trainable_backbone_layers: Optional[int],
  625. **kwargs: Any,
  626. ) -> LineNet:
  627. if weights is not None:
  628. weights_backbone = None
  629. num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
  630. elif num_classes is None:
  631. num_classes = 91
  632. is_trained = weights is not None or weights_backbone is not None
  633. trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 6, 3)
  634. norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
  635. backbone = mobilenet_v3_large(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
  636. backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
  637. anchor_sizes = (
  638. (
  639. 32,
  640. 64,
  641. 128,
  642. 256,
  643. 512,
  644. ),
  645. ) * 3
  646. aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
  647. model = LineNet(
  648. backbone, num_classes, rpn_anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs
  649. )
  650. if weights is not None:
  651. model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
  652. return model
  653. @register_model()
  654. @handle_legacy_interface(
  655. weights=("pretrained", LineNet_MobileNet_V3_Large_320_FPN_Weights.COCO_V1),
  656. weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
  657. )
  658. def linenet_mobilenet_v3_large_320_fpn(
  659. *,
  660. weights: Optional[LineNet_MobileNet_V3_Large_320_FPN_Weights] = None,
  661. progress: bool = True,
  662. num_classes: Optional[int] = None,
  663. weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
  664. trainable_backbone_layers: Optional[int] = None,
  665. **kwargs: Any,
  666. ) -> LineNet:
  667. """
  668. Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
  669. .. betastatus:: detection module
  670. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  671. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  672. details.
  673. Example::
  674. >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights=FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT)
  675. >>> model.eval()
  676. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  677. >>> predictions = model(x)
  678. Args:
  679. weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights`, optional): The
  680. pretrained weights to use. See
  681. :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights` below for
  682. more details, and possible values. By default, no pre-trained
  683. weights are used.
  684. progress (bool, optional): If True, displays a progress bar of the
  685. download to stderr. Default is True.
  686. num_classes (int, optional): number of output classes of the model (including the background)
  687. weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
  688. pretrained weights for the backbone.
  689. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  690. final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
  691. trainable. If ``None`` is passed (the default) this value is set to 3.
  692. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  693. base class. Please refer to the `source code
  694. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  695. for more details about this class.
  696. .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
  697. :members:
  698. """
  699. weights = LineNet_MobileNet_V3_Large_320_FPN_Weights.verify(weights)
  700. weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
  701. defaults = {
  702. "min_size": 320,
  703. "max_size": 640,
  704. "rpn_pre_nms_top_n_test": 150,
  705. "rpn_post_nms_top_n_test": 150,
  706. "rpn_score_thresh": 0.05,
  707. }
  708. kwargs = {**defaults, **kwargs}
  709. return _linenet_mobilenet_v3_large_fpn(
  710. weights=weights,
  711. progress=progress,
  712. num_classes=num_classes,
  713. weights_backbone=weights_backbone,
  714. trainable_backbone_layers=trainable_backbone_layers,
  715. **kwargs,
  716. )
  717. @register_model()
  718. @handle_legacy_interface(
  719. weights=("pretrained", LineNet_MobileNet_V3_Large_FPN_Weights.COCO_V1),
  720. weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
  721. )
  722. def linenet_mobilenet_v3_large_fpn(
  723. *,
  724. weights: Optional[LineNet_MobileNet_V3_Large_FPN_Weights] = None,
  725. progress: bool = True,
  726. num_classes: Optional[int] = None,
  727. weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
  728. trainable_backbone_layers: Optional[int] = None,
  729. **kwargs: Any,
  730. ) -> LineNet:
  731. """
  732. Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
  733. .. betastatus:: detection module
  734. It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
  735. :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
  736. details.
  737. Example::
  738. >>> model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT)
  739. >>> model.eval()
  740. >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
  741. >>> predictions = model(x)
  742. Args:
  743. weights (:class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights`, optional): The
  744. pretrained weights to use. See
  745. :class:`~torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights` below for
  746. more details, and possible values. By default, no pre-trained
  747. weights are used.
  748. progress (bool, optional): If True, displays a progress bar of the
  749. download to stderr. Default is True.
  750. num_classes (int, optional): number of output classes of the model (including the background)
  751. weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
  752. pretrained weights for the backbone.
  753. trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from
  754. final block. Valid values are between 0 and 6, with 6 meaning all backbone layers are
  755. trainable. If ``None`` is passed (the default) this value is set to 3.
  756. **kwargs: parameters passed to the ``torchvision.models.detection.faster_rcnn.FasterRCNN``
  757. base class. Please refer to the `source code
  758. <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py>`_
  759. for more details about this class.
  760. .. autoclass:: torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights
  761. :members:
  762. """
  763. weights = LineNet_MobileNet_V3_Large_FPN_Weights.verify(weights)
  764. weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)
  765. defaults = {
  766. "rpn_score_thresh": 0.05,
  767. }
  768. kwargs = {**defaults, **kwargs}
  769. return _linenet_mobilenet_v3_large_fpn(
  770. weights=weights,
  771. progress=progress,
  772. num_classes=num_classes,
  773. weights_backbone=weights_backbone,
  774. trainable_backbone_layers=trainable_backbone_layers,
  775. **kwargs,
  776. )