roi_heads.py

from typing import Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops, roi_align

from libs.vision_libs.models.detection import _utils as det_utils


### compute the multi-head losses
def line_loss():
    pass
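

# NOTE: line_loss above is a stub for the line-detection branch of the multi-task
# loss; it is not implemented and not called anywhere in this file yet.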


def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
    # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    """
    Computes the loss for Faster R-CNN.

    Args:
        class_logits (Tensor)
        box_regression (Tensor)
        labels (list[BoxList])
        regression_targets (Tensor)

    Returns:
        classification_loss (Tensor)
        box_loss (Tensor)
    """

    labels = torch.cat(labels, dim=0)
    regression_targets = torch.cat(regression_targets, dim=0)

    classification_loss = F.cross_entropy(class_logits, labels)

    # get indices that correspond to the regression targets for
    # the corresponding ground truth labels, to be used with
    # advanced indexing
    sampled_pos_inds_subset = torch.where(labels > 0)[0]
    labels_pos = labels[sampled_pos_inds_subset]
    N, num_classes = class_logits.shape
    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)

    box_loss = F.smooth_l1_loss(
        box_regression[sampled_pos_inds_subset, labels_pos],
        regression_targets[sampled_pos_inds_subset],
        beta=1 / 9,
        reduction="sum",
    )
    box_loss = box_loss / labels.numel()

    return classification_loss, box_loss
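
# Shape reference for fastrcnn_loss (inferred from the code above):
#   class_logits:       [num_sampled_rois, num_classes]
#   box_regression:     [num_sampled_rois, num_classes * 4]
#   labels:             per-image list, concatenated to [num_sampled_rois]
#   regression_targets: per-image list, concatenated to [num_sampled_rois, 4]
# The box loss is summed over positive samples only and normalized by the total
# number of sampled rois (labels.numel()), not by the number of positives.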


def maskrcnn_inference(x, labels):
    # type: (Tensor, List[Tensor]) -> List[Tensor]
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the masks in the mask field of the BoxList.

    Args:
        x (Tensor): the mask logits
        labels (list[BoxList]): bounding boxes that are used as
            reference, one for each image

    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field mask
    """
    mask_prob = x.sigmoid()

    # select masks corresponding to the predicted classes
    num_masks = x.shape[0]
    boxes_per_image = [label.shape[0] for label in labels]
    labels = torch.cat(labels)
    index = torch.arange(num_masks, device=labels.device)
    mask_prob = mask_prob[index, labels][:, None]
    mask_prob = mask_prob.split(boxes_per_image, dim=0)

    return mask_prob
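
# maskrcnn_inference returns one tensor per image with shape [num_boxes, 1, M, M],
# holding per-pixel mask probabilities for each box's predicted class.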


def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    # type: (Tensor, Tensor, Tensor, int) -> Tensor
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]


def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    """
    Args:
        proposals (list[BoxList])
        mask_logits (Tensor)
        targets (list[BoxList])

    Return:
        mask_loss (Tensor): scalar tensor containing the loss
    """

    discretization_size = mask_logits.shape[-1]
    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
    mask_targets = [
        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    ]

    labels = torch.cat(labels, dim=0)
    mask_targets = torch.cat(mask_targets, dim=0)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if mask_targets.numel() == 0:
        return mask_logits.sum() * 0

    mask_loss = F.binary_cross_entropy_with_logits(
        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
    )
    return mask_loss
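
# maskrcnn_loss is a binary cross-entropy over the M x M mask logits of the
# ground-truth class only, with targets produced by project_masks_on_boxes.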


def keypoints_to_heatmap(keypoints, rois, heatmap_size):
    # type: (Tensor, Tensor, int) -> Tuple[Tensor, Tensor]
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()
    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid
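
# keypoints_to_heatmap encodes each visible, in-bounds keypoint as a flattened
# index (y * heatmap_size + x) into its ROI's heatmap grid; these indices are
# consumed as cross-entropy targets by keypointrcnn_loss below.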


def _onnx_heatmaps_to_keypoints(
    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
):
    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)

    width_correction = widths_i / roi_map_width
    height_correction = heights_i / roi_map_height

    roi_map = F.interpolate(
        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
    )[:, 0]

    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

    x_int = pos % w
    y_int = (pos - x_int) // w

    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
        dtype=torch.float32
    )
    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
        dtype=torch.float32
    )

    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
    xy_preds_i = torch.stack(
        [
            xy_preds_i_0.to(dtype=torch.float32),
            xy_preds_i_1.to(dtype=torch.float32),
            xy_preds_i_2.to(dtype=torch.float32),
        ],
        0,
    )

    # TODO: simplify when indexing without rank will be supported by ONNX
    base = num_keypoints * num_keypoints + num_keypoints + 1
    ind = torch.arange(num_keypoints)
    ind = ind.to(dtype=torch.int64) * base
    end_scores_i = (
        roi_map.index_select(1, y_int.to(dtype=torch.int64))
        .index_select(2, x_int.to(dtype=torch.int64))
        .view(-1)
        .index_select(0, ind.to(dtype=torch.int64))
    )

    return xy_preds_i, end_scores_i


@torch.jit._script_if_tracing
def _onnx_heatmaps_to_keypoints_loop(
    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
):
    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)

    for i in range(int(rois.size(0))):
        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
        )
        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
        end_scores = torch.cat(
            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
        )
    return xy_preds, end_scores


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps. Returns xy_preds with
    shape (#rois, #keypoints, 3), where the last dimension holds (x, y, 1) for
    each keypoint, and end_scores with shape (#rois, #keypoints) holding the
    heatmap score at each predicted location.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps,
            rois,
            widths_ceil,
            heights_ceil,
            widths,
            heights,
            offset_x,
            offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
        )
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = F.interpolate(
            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
        )[:, 0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
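
# heatmaps_to_keypoints upsamples each ROI's K heatmaps to the ROI's pixel size
# (bicubic), takes the per-keypoint argmax as the (x, y) location in image
# coordinates, and reports the heatmap value at that location as the score.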


def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = keypoint_logits.shape
    if H != W:
        raise ValueError(
            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    keypoint_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if keypoint_targets.numel() == 0 or len(valid) == 0:
        return keypoint_logits.sum() * 0

    keypoint_logits = keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
    return keypoint_loss
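
# The keypoint loss is a softmax cross-entropy over the H * W heatmap locations,
# computed only for keypoints marked valid (visible and inside their ROI).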


def keypointrcnn_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    kp_probs = []
    kp_scores = []

    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)

    for xx, bb in zip(x2, boxes):
        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
        kp_probs.append(kp_prob)
        kp_scores.append(scores)

    return kp_probs, kp_scores


def _onnx_expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half = w_half.to(dtype=torch.float32) * scale
    h_half = h_half.to(dtype=torch.float32) * scale

    boxes_exp0 = x_c - w_half
    boxes_exp1 = y_c - h_half
    boxes_exp2 = x_c + w_half
    boxes_exp3 = y_c + h_half
    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
    return boxes_exp


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


@torch.jit.unused
def expand_masks_tracing_scale(M, padding):
    # type: (int, int) -> float
    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)


def expand_masks(mask, padding):
    # type: (Tensor, int) -> Tuple[Tensor, float]
    M = mask.shape[-1]
    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
        scale = expand_masks_tracing_scale(M, padding)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = F.pad(mask, (padding,) * 4)
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w):
    # type: (Tensor, Tensor, int, int) -> Tensor
    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
    return im_mask


def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
    one = torch.ones(1, dtype=torch.int64)
    zero = torch.zeros(1, dtype=torch.int64)

    w = box[2] - box[0] + one
    h = box[3] - box[1] + one
    w = torch.max(torch.cat((w, one)))
    h = torch.max(torch.cat((h, one)))

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))

    # Resize mask
    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))

    unpaded_im_mask = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]

    # TODO : replace below with a dynamic padding when support is added in ONNX
    # pad y
    zeros_y0 = torch.zeros(y_0, unpaded_im_mask.size(1))
    zeros_y1 = torch.zeros(im_h - y_1, unpaded_im_mask.size(1))
    concat_0 = torch.cat((zeros_y0, unpaded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
    # pad x
    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
    return im_mask


@torch.jit._script_if_tracing
def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
    res_append = torch.zeros(0, im_h, im_w)
    for i in range(masks.size(0)):
        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
        mask_res = mask_res.unsqueeze(0)
        res_append = torch.cat((res_append, mask_res))
    return res_append


def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
        )[:, None]
    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
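
# paste_masks_in_image resizes each fixed-size mask to its (slightly expanded)
# box and pastes it into a full-image canvas, returning a tensor of shape
# [num_masks, 1, im_h, im_w].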


class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        line_head,
        line_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.line_head = line_head
        self.line_predictor = line_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_line(self):
        # mirrors has_mask/has_keypoint: the line branch in forward() only runs
        # when both the line head and the line predictor are configured
        if self.line_head is None:
            return False
        if self.line_predictor is None:
            return False
        return True

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # Label background (below the low threshold)
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # Label ignore proposals (between low and high thresholds)
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        if targets is None:
            raise ValueError("targets should not be None")
        if not all(["boxes" in t for t in targets]):
            raise ValueError("Every element of targets should have a boxes key")
        if not all(["labels" in t for t in targets]):
            raise ValueError("Every element of targets should have a labels key")
        if self.has_mask():
            if not all(["masks" in t for t in targets]):
                raise ValueError("Every element of targets should have a masks key")

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        if targets is None:
            raise ValueError("targets should not be None")
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append ground-truth bboxes to proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # get matching gt indices for each proposal
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample a fixed proportion of positive-negative proposals
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            img_sampled_inds = sampled_inds[img_id]
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            labels[img_id] = labels[img_id][img_sampled_inds]
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])

        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels
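
    # postprocess_detections yields, per image, the final "boxes" / "scores" /
    # "labels" after removing the background column, thresholding scores,
    # dropping tiny boxes, per-class NMS, and keeping at most
    # self.detections_per_img results.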

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (Dict[str, Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                if not t["boxes"].dtype in floating_point_types:
                    raise TypeError(f"target boxes must be of float type, instead got {t['boxes'].dtype}")
                if not t["labels"].dtype == torch.int64:
                    raise TypeError(f"target labels must be of int64 type, instead got {t['labels'].dtype}")
                if self.has_keypoint():
                    if not t["keypoints"].dtype == torch.float32:
                        raise TypeError(f"target keypoints must be of float type, instead got {t['keypoints'].dtype}")

        if self.training:
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            labels = None
            regression_targets = None
            matched_idxs = None

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        box_features = self.box_head(box_features)
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        if self.training:
            if labels is None:
                raise ValueError("labels cannot be None")
            if regression_targets is None:
                raise ValueError("regression_targets cannot be None")
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": labels[i],
                        "scores": scores[i],
                    }
                )

        if self.has_line():
            line_features = self.line_head(features)
            _ = self.line_predictor(line_features)
            ### line_loss(multitasklearner)
            ### infer
            pass

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                if targets is None or pos_matched_idxs is None or mask_logits is None:
                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if (
            self.keypoint_roi_pool is not None
            and self.keypoint_head is not None
            and self.keypoint_predictor is not None
        ):
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            keypoint_features = self.keypoint_head(keypoint_features)
            keypoint_logits = self.keypoint_predictor(keypoint_features)

            loss_keypoint = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                if keypoint_logits is None or keypoint_proposals is None:
                    raise ValueError(
                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
                    )

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps
            losses.update(loss_keypoint)

        return result, losses
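

# Minimal usage sketch (assumption: a torchvision-style GeneralizedRCNN wiring;
# `backbone`, `rpn`, `images`, and `targets` below are hypothetical names, not
# defined in this file):
#
#   features = backbone(images.tensors)          # Dict[str, Tensor], e.g. from an FPN
#   proposals, proposal_losses = rpn(images, features, targets)
#   detections, detector_losses = roi_heads(features, proposals, images.image_sizes, targets)
#   # training:  detector_losses contains loss_classifier / loss_box_reg (+ loss_mask, loss_keypoint)
#   # inference: detections is a list of dicts with "boxes", "labels", "scores" (+ "masks", "keypoints")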