roi_head.py
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops, roi_align

from . import _utils as det_utils

def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
    # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    """
    Computes the loss for Faster R-CNN.

    Args:
        class_logits (Tensor): predicted class scores for all sampled proposals
        box_regression (Tensor): predicted box deltas, one set per class
        labels (list[Tensor]): ground-truth class labels, one tensor per image
        regression_targets (list[Tensor]): encoded box targets, one tensor per image

    Returns:
        classification_loss (Tensor)
        box_loss (Tensor)
    """
    labels = torch.cat(labels, dim=0)
    regression_targets = torch.cat(regression_targets, dim=0)

    classification_loss = F.cross_entropy(class_logits, labels)

    # get indices that correspond to the regression targets for
    # the corresponding ground truth labels, to be used with
    # advanced indexing
    sampled_pos_inds_subset = torch.where(labels > 0)[0]
    labels_pos = labels[sampled_pos_inds_subset]
    N, num_classes = class_logits.shape
    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)

    box_loss = F.smooth_l1_loss(
        box_regression[sampled_pos_inds_subset, labels_pos],
        regression_targets[sampled_pos_inds_subset],
        beta=1 / 9,
        reduction="sum",
    )
    box_loss = box_loss / labels.numel()

    return classification_loss, box_loss
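

# Illustrative shape check (a sketch, not part of the original module): with
# 2 images, 4 sampled proposals each, and 3 classes (background + 2), the
# loss expects flattened logits plus per-image label/target lists.
#
#   >>> class_logits = torch.randn(8, 3)
#   >>> box_regression = torch.randn(8, 3 * 4)
#   >>> labels = [torch.tensor([0, 1, 2, 0]), torch.tensor([1, 0, 0, 2])]
#   >>> regression_targets = [torch.randn(4, 4), torch.randn(4, 4)]
#   >>> cls_loss, box_loss = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)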


def maskrcnn_inference(x, labels):
    # type: (Tensor, List[Tensor]) -> List[Tensor]
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the masks, one tensor per image.

    Args:
        x (Tensor): the mask logits
        labels (list[Tensor]): predicted labels of the boxes that are used as
            reference, one tensor per image

    Returns:
        results (list[Tensor]): one tensor of mask probabilities per image
    """
    mask_prob = x.sigmoid()

    # select masks corresponding to the predicted classes
    num_masks = x.shape[0]
    boxes_per_image = [label.shape[0] for label in labels]
    labels = torch.cat(labels)
    index = torch.arange(num_masks, device=labels.device)
    mask_prob = mask_prob[index, labels][:, None]
    mask_prob = mask_prob.split(boxes_per_image, dim=0)
    return mask_prob
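

# Sketch of the expected shapes (illustrative values): 5 detections across
# 2 images (3 + 2), 28x28 mask logits over 4 classes; the per-class logits
# collapse to one probability map per detection.
#
#   >>> x = torch.randn(5, 4, 28, 28)
#   >>> labels = [torch.tensor([1, 2, 3]), torch.tensor([1, 1])]
#   >>> probs = maskrcnn_inference(x, labels)
#   >>> [p.shape for p in probs]
#   [torch.Size([3, 1, 28, 28]), torch.Size([2, 1, 28, 28])]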


def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    # type: (Tensor, Tensor, Tensor, int) -> Tensor
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]
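

# Sketch (assumed toy sizes): two 32x32 full-image GT masks, two boxes, each
# box matched to a GT mask by index; the result is one MxM target per box.
#
#   >>> gt_masks = torch.zeros(2, 32, 32)
#   >>> boxes = torch.tensor([[0.0, 0.0, 16.0, 16.0], [8.0, 8.0, 24.0, 24.0]])
#   >>> matched_idxs = torch.tensor([0, 1])
#   >>> project_masks_on_boxes(gt_masks, boxes, matched_idxs, 28).shape
#   torch.Size([2, 28, 28])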


def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    """
    Args:
        mask_logits (Tensor)
        proposals (list[Tensor])
        gt_masks (list[Tensor])
        gt_labels (list[Tensor])
        mask_matched_idxs (list[Tensor])

    Returns:
        mask_loss (Tensor): scalar tensor containing the loss
    """
    discretization_size = mask_logits.shape[-1]
    # print(f'mask_logits:{mask_logits},gt_masks:{gt_masks},,gt_labels:{gt_labels}]')
    # print(f'mask discretization_size:{discretization_size}')
    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
    # print(f'mask labels:{labels}')
    mask_targets = [
        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    ]

    labels = torch.cat(labels, dim=0)
    # print(f'mask labels1:{labels}')
    mask_targets = torch.cat(mask_targets, dim=0)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if mask_targets.numel() == 0:
        return mask_logits.sum() * 0
    # print(f'mask_targets:{mask_targets.shape},mask_logits:{mask_logits.shape}')
    # print(f'mask_targets:{mask_targets}')
    mask_loss = F.binary_cross_entropy_with_logits(
        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
    )
    # print(f'mask_loss:{mask_loss}')
    return mask_loss
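

# Shape sketch (illustrative): 3 positive proposals over 2 images, 4 classes,
# 28x28 mask logits; gt_masks are full-image binary masks, one per GT object.
#
#   >>> mask_logits = torch.randn(3, 4, 28, 28)
#   >>> proposals = [torch.tensor([[0.0, 0.0, 16.0, 16.0], [4.0, 4.0, 20.0, 20.0]]),
#   ...              torch.tensor([[2.0, 2.0, 18.0, 18.0]])]
#   >>> gt_masks = [torch.zeros(1, 32, 32), torch.zeros(1, 32, 32)]
#   >>> gt_labels = [torch.tensor([1]), torch.tensor([2])]
#   >>> idxs = [torch.tensor([0, 0]), torch.tensor([0])]
#   >>> loss = maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, idxs)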


def keypoints_to_heatmap(keypoints, rois, heatmap_size):
    # type: (Tensor, Tensor, int) -> Tuple[Tensor, Tensor]
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()

    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid
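

# Worked example (toy numbers): one roi spanning [0, 0, 56, 56] and one
# visible keypoint at (28, 28) map onto a 14x14 grid as the linear index
# 7 * 14 + 7 = 105, since the scale is 14 / 56 = 0.25.
#
#   >>> rois = torch.tensor([[0.0, 0.0, 56.0, 56.0]])
#   >>> keypoints = torch.tensor([[[28.0, 28.0, 1.0]]])
#   >>> keypoints_to_heatmap(keypoints, rois, 14)
#   (tensor([[105]]), tensor([[1]]))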


def _onnx_heatmaps_to_keypoints(
    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
):
    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)

    width_correction = widths_i / roi_map_width
    height_correction = heights_i / roi_map_height

    roi_map = F.interpolate(
        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
    )[:, 0]

    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

    x_int = pos % w
    y_int = (pos - x_int) // w

    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
        dtype=torch.float32
    )
    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
        dtype=torch.float32
    )

    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
    xy_preds_i = torch.stack(
        [
            xy_preds_i_0.to(dtype=torch.float32),
            xy_preds_i_1.to(dtype=torch.float32),
            xy_preds_i_2.to(dtype=torch.float32),
        ],
        0,
    )

    # TODO: simplify when indexing without rank will be supported by ONNX
    base = num_keypoints * num_keypoints + num_keypoints + 1
    ind = torch.arange(num_keypoints)
    ind = ind.to(dtype=torch.int64) * base
    end_scores_i = (
        roi_map.index_select(1, y_int.to(dtype=torch.int64))
        .index_select(2, x_int.to(dtype=torch.int64))
        .view(-1)
        .index_select(0, ind.to(dtype=torch.int64))
    )

    return xy_preds_i, end_scores_i


@torch.jit._script_if_tracing
def _onnx_heatmaps_to_keypoints_loop(
    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
):
    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)

    for i in range(int(rois.size(0))):
        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
        )
        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
        end_scores = torch.cat(
            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
        )
    return xy_preds, end_scores


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps.

    Returns a tensor of shape (#rois, #keypoints, 3) holding (x, y, visibility)
    for each keypoint, together with a tensor of shape (#rois, #keypoints)
    holding the heatmap score at each predicted location.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps,
            rois,
            widths_ceil,
            heights_ceil,
            widths,
            heights,
            offset_x,
            offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
        )
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = F.interpolate(
            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
        )[:, 0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
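

# Shape sketch (illustrative values): 2 rois and 17 keypoint heatmaps at
# 56x56 resolution, as used by the standard Keypoint R-CNN head.
#
#   >>> maps = torch.randn(2, 17, 56, 56)
#   >>> rois = torch.tensor([[0.0, 0.0, 100.0, 50.0], [10.0, 10.0, 60.0, 90.0]])
#   >>> xy, scores = heatmaps_to_keypoints(maps, rois)
#   >>> xy.shape, scores.shape
#   (torch.Size([2, 17, 3]), torch.Size([2, 17]))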


def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = keypoint_logits.shape
    if H != W:
        raise ValueError(
            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    keypoint_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if keypoint_targets.numel() == 0 or len(valid) == 0:
        return keypoint_logits.sum() * 0

    keypoint_logits = keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
    return keypoint_loss
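

# Sketch (toy sizes): 2 positive proposals in one image, 17 keypoints,
# 56x56 keypoint logits; gt keypoints are (x, y, visibility) triples kept
# strictly inside both boxes so every target cell is valid.
#
#   >>> keypoint_logits = torch.randn(2, 17, 56, 56)
#   >>> proposals = [torch.tensor([[0.0, 0.0, 56.0, 56.0], [10.0, 10.0, 40.0, 40.0]])]
#   >>> gt_keypoints = [torch.rand(1, 17, 3) * 30 + 10]
#   >>> matched_idxs = [torch.tensor([0, 0])]
#   >>> loss = keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, matched_idxs)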


def keypointrcnn_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    print(f'x:{x.shape}')
    kp_probs = []
    kp_scores = []

    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)
    print(f'x2:{x2}')

    for xx, bb in zip(x2, boxes):
        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
        kp_probs.append(kp_prob)
        kp_scores.append(scores)

    return kp_probs, kp_scores


def _onnx_expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half = w_half.to(dtype=torch.float32) * scale
    h_half = h_half.to(dtype=torch.float32) * scale

    boxes_exp0 = x_c - w_half
    boxes_exp1 = y_c - h_half
    boxes_exp2 = x_c + w_half
    boxes_exp3 = y_c + h_half
    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
    return boxes_exp


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp
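

# Worked example: scaling about the box center by 1.25 grows a 10x10 box at
# (10, 10)-(20, 20) to (8.75, 8.75)-(21.25, 21.25), since each half-extent
# goes from 5 to 6.25 around the center (15, 15).
#
#   >>> expand_boxes(torch.tensor([[10.0, 10.0, 20.0, 20.0]]), 1.25)
#   tensor([[ 8.7500,  8.7500, 21.2500, 21.2500]])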


@torch.jit.unused
def expand_masks_tracing_scale(M, padding):
    # type: (int, int) -> float
    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)


def expand_masks(mask, padding):
    # type: (Tensor, int) -> Tuple[Tensor, float]
    M = mask.shape[-1]
    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
        scale = expand_masks_tracing_scale(M, padding)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = F.pad(mask, (padding,) * 4)
    return padded_mask, scale
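

# Sketch: padding a 28x28 mask by 1 pixel on each side yields a 30x30 mask
# and the matching box scale factor (28 + 2) / 28.
#
#   >>> mask = torch.zeros(1, 1, 28, 28)
#   >>> padded, scale = expand_masks(mask, padding=1)
#   >>> padded.shape, round(scale, 4)
#   (torch.Size([1, 1, 30, 30]), 1.0714)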


def paste_mask_in_image(mask, box, im_h, im_w):
    # type: (Tensor, Tensor, int, int) -> Tensor
    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
    return im_mask


def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
    one = torch.ones(1, dtype=torch.int64)
    zero = torch.zeros(1, dtype=torch.int64)

    w = box[2] - box[0] + one
    h = box[3] - box[1] + one
    w = torch.max(torch.cat((w, one)))
    h = torch.max(torch.cat((h, one)))

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))

    # Resize mask
    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))

    unpadded_im_mask = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]

    # TODO : replace below with a dynamic padding when support is added in ONNX
    # pad y
    zeros_y0 = torch.zeros(y_0, unpadded_im_mask.size(1))
    zeros_y1 = torch.zeros(im_h - y_1, unpadded_im_mask.size(1))
    concat_0 = torch.cat((zeros_y0, unpadded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]

    # pad x
    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
    return im_mask


@torch.jit._script_if_tracing
def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
    res_append = torch.zeros(0, im_h, im_w)
    for i in range(masks.size(0)):
        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
        mask_res = mask_res.unsqueeze(0)
        res_append = torch.cat((res_append, mask_res))
    return res_append


def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
        )[:, None]
    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
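

# Sketch (illustrative sizes): pasting two 28x28 mask predictions into a
# 480x640 image yields one full-image mask per box, with a singleton channel.
#
#   >>> masks = torch.rand(2, 1, 28, 28)
#   >>> boxes = torch.tensor([[10.0, 10.0, 100.0, 120.0], [50.0, 40.0, 200.0, 220.0]])
#   >>> paste_masks_in_image(masks, boxes, (480, 640)).shape
#   torch.Size([2, 1, 480, 640])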


class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):
            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # Label background (below the low threshold)
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # Label ignore proposals (between low and high thresholds)
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        if targets is None:
            raise ValueError("targets should not be None")
        if not all(["boxes" in t for t in targets]):
            raise ValueError("Every element of targets should have a boxes key")
        if not all(["labels" in t for t in targets]):
            raise ValueError("Every element of targets should have a labels key")
        if self.has_mask():
            if not all(["masks" in t for t in targets]):
                raise ValueError("Every element of targets should have a masks key")

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        if targets is None:
            raise ValueError("targets should not be None")
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append ground-truth bboxes to proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # get matching gt indices for each proposal
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample a fixed proportion of positive-negative proposals
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            img_sampled_inds = sampled_inds[img_id]
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            labels[img_id] = labels[img_id][img_sampled_inds]
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])

        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)

            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels
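
    # Per-class NMS sketch (illustrative, calling torchvision's batched_nms
    # directly): boxes sharing a label suppress each other when their IoU
    # exceeds the threshold. Here the two boxes overlap at IoU ~ 0.68, so at a
    # 0.5 threshold only the higher-scoring one survives.
    #
    #   >>> boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0]])
    #   >>> scores = torch.tensor([0.9, 0.8])
    #   >>> labels = torch.tensor([1, 1])
    #   >>> box_ops.batched_nms(boxes, scores, labels, 0.5)
    #   tensor([0])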

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (Dict[str, Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                if not t["boxes"].dtype in floating_point_types:
                    raise TypeError(f"target boxes must be of float type, instead got {t['boxes'].dtype}")
                if not t["labels"].dtype == torch.int64:
                    raise TypeError(f"target labels must be of int64 type, instead got {t['labels'].dtype}")
                if self.has_keypoint():
                    if not t["keypoints"].dtype == torch.float32:
                        raise TypeError(f"target keypoints must be of float type, instead got {t['keypoints'].dtype}")

        if self.training:
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            labels = None
            regression_targets = None
            matched_idxs = None

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        box_features = self.box_head(box_features)
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        if self.training:
            if labels is None:
                raise ValueError("labels cannot be None")
            if regression_targets is None:
                raise ValueError("regression_targets cannot be None")
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": labels[i],
                        "scores": scores[i],
                    }
                )

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                if targets is None or pos_matched_idxs is None or mask_logits is None:
                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if (
            self.keypoint_roi_pool is not None
            and self.keypoint_head is not None
            and self.keypoint_predictor is not None
        ):
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            # tmp = keypoint_features[0][0]
            # plt.imshow(tmp.detach().numpy())
            print(f'keypoint_features from roi_pool:{keypoint_features.shape}')
            keypoint_features = self.keypoint_head(keypoint_features)
            print(f'keypoint_features:{keypoint_features.shape}')
            tmp = keypoint_features[0][0]
            plt.imshow(tmp.detach().numpy())
            keypoint_logits = self.keypoint_predictor(keypoint_features)
            print(f'keypoint_logits:{keypoint_logits.shape}')
            """
            hook up wirenet here
            """
            loss_keypoint = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                if keypoint_logits is None or keypoint_proposals is None:
                    raise ValueError(
                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
                    )

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps

            losses.update(loss_keypoint)

        return result, losses
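

# End-to-end usage sketch (hedged: RoIHeads is normally constructed and called
# inside torchvision's GeneralizedRCNN, e.g. by maskrcnn_resnet50_fpn, rather
# than by hand). At inference time the head consumes a dict of FPN feature
# maps, the RPN proposals, and the per-image sizes, and returns per-image
# detection dicts plus a loss dict (empty in eval mode):
#
#   >>> # detections, losses = roi_heads(features, proposals, image_shapes)
#   >>> # detections[0].keys() -> dict_keys(['boxes', 'labels', 'scores', ...])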