# roi_heads.py
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor

import libs.vision_libs.models.detection._utils as det_utils
from libs.vision_libs.ops import boxes as box_ops, roi_align


def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
    # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    """
    Computes the loss for Faster R-CNN.

    Args:
        class_logits (Tensor): predicted class scores for every sampled proposal
        box_regression (Tensor): predicted per-class box deltas for every sampled proposal
        labels (list[Tensor]): ground-truth class assigned to each proposal, per image
        regression_targets (list[Tensor]): encoded box regression targets, per image

    Returns:
        classification_loss (Tensor)
        box_loss (Tensor)
    """
    labels = torch.cat(labels, dim=0)
    regression_targets = torch.cat(regression_targets, dim=0)

    classification_loss = F.cross_entropy(class_logits, labels)

    # get indices that correspond to the regression targets for
    # the corresponding ground truth labels, to be used with
    # advanced indexing
    sampled_pos_inds_subset = torch.where(labels > 0)[0]
    labels_pos = labels[sampled_pos_inds_subset]
    N, num_classes = class_logits.shape
    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)

    box_loss = F.smooth_l1_loss(
        box_regression[sampled_pos_inds_subset, labels_pos],
        regression_targets[sampled_pos_inds_subset],
        beta=1 / 9,
        reduction="sum",
    )
    box_loss = box_loss / labels.numel()

    return classification_loss, box_loss
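

# Usage sketch (not part of the original file): a minimal, self-contained call to
# fastrcnn_loss with dummy tensors. The shapes are illustrative assumptions
# (2 images, 4 sampled proposals each, 3 classes including background), not values
# fixed by this module.
def _example_fastrcnn_loss():
    num_classes = 3
    class_logits = torch.randn(8, num_classes)          # (total sampled proposals, num_classes)
    box_regression = torch.randn(8, num_classes * 4)    # per-class box deltas
    labels = [torch.tensor([0, 1, 2, 0]), torch.tensor([1, 0, 0, 2])]  # per-image class labels
    regression_targets = [torch.randn(4, 4), torch.randn(4, 4)]        # per-image encoded targets
    return fastrcnn_loss(class_logits, box_regression, labels, regression_targets)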


def maskrcnn_inference(x, labels):
    # type: (Tensor, List[Tensor]) -> List[Tensor]
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the masks in the mask field of the BoxList.

    Args:
        x (Tensor): the mask logits
        labels (list[BoxList]): bounding boxes that are used as
            reference, one for each image

    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field mask
    """
    mask_prob = x.sigmoid()

    # select masks corresponding to the predicted classes
    num_masks = x.shape[0]
    boxes_per_image = [label.shape[0] for label in labels]
    labels = torch.cat(labels)
    index = torch.arange(num_masks, device=labels.device)
    mask_prob = mask_prob[index, labels][:, None]
    mask_prob = mask_prob.split(boxes_per_image, dim=0)

    return mask_prob
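

# Usage sketch (illustrative assumption, not mandated by this module): maskrcnn_inference
# takes the raw mask logits for all detections of a batch stacked along dim 0, plus the
# predicted labels per image, and returns one (num_dets, 1, M, M) probability tensor per image.
def _example_maskrcnn_inference():
    num_classes, M = 3, 28
    mask_logits = torch.randn(5, num_classes, M, M)            # 5 detections in total
    labels = [torch.tensor([1, 2]), torch.tensor([1, 1, 2])]   # 2 + 3 detections per image
    return maskrcnn_inference(mask_logits, labels)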


def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    # type: (Tensor, Tensor, Tensor, int) -> Tensor
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]
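
# Note on the implementation above: roi_align expects rois in the format
# (batch_index, x1, y1, x2, y2). Here each ground-truth mask tensor acts as its own
# "batch", so matched_idxs is prepended as the batch-index column and the aligned
# output is a (num_proposals, M, M) target grid for the mask loss.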


def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    """
    Args:
        mask_logits (Tensor)
        proposals (list[BoxList])
        gt_masks (list[Tensor])
        gt_labels (list[Tensor])
        mask_matched_idxs (list[Tensor])

    Returns:
        mask_loss (Tensor): scalar tensor containing the loss
    """
    discretization_size = mask_logits.shape[-1]
    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
    mask_targets = [
        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    ]

    labels = torch.cat(labels, dim=0)
    mask_targets = torch.cat(mask_targets, dim=0)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if mask_targets.numel() == 0:
        return mask_logits.sum() * 0

    mask_loss = F.binary_cross_entropy_with_logits(
        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
    )
    return mask_loss
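
# Note: only the mask channel belonging to each proposal's matched ground-truth class
# enters the binary cross-entropy above; the remaining class channels of mask_logits
# receive no gradient from this loss.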


def keypoints_to_heatmap(keypoints, rois, heatmap_size):
    # type: (Tensor, Tensor, int) -> Tuple[Tensor, Tensor]
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()

    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid
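

# Worked example (illustrative assumption, not from the original file): a keypoint at
# (30, 10) inside the roi (0, 0, 56, 56) with heatmap_size=56 maps to the discrete cell
# (x=30, y=10), i.e. linear index y * 56 + x = 590, and is marked valid because it lies
# inside the roi and has visibility > 0.
def _example_keypoints_to_heatmap():
    rois = torch.tensor([[0.0, 0.0, 56.0, 56.0]])
    keypoints = torch.tensor([[[30.0, 10.0, 1.0]]])  # (num_rois, num_keypoints, 3)
    return keypoints_to_heatmap(keypoints, rois, 56)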


def _onnx_heatmaps_to_keypoints(
    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
):
    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)

    width_correction = widths_i / roi_map_width
    height_correction = heights_i / roi_map_height

    roi_map = F.interpolate(
        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
    )[:, 0]

    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

    x_int = pos % w
    y_int = (pos - x_int) // w

    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
        dtype=torch.float32
    )
    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
        dtype=torch.float32
    )

    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
    xy_preds_i = torch.stack(
        [
            xy_preds_i_0.to(dtype=torch.float32),
            xy_preds_i_1.to(dtype=torch.float32),
            xy_preds_i_2.to(dtype=torch.float32),
        ],
        0,
    )

    # TODO: simplify when indexing without rank will be supported by ONNX
    base = num_keypoints * num_keypoints + num_keypoints + 1
    ind = torch.arange(num_keypoints)
    ind = ind.to(dtype=torch.int64) * base
    end_scores_i = (
        roi_map.index_select(1, y_int.to(dtype=torch.int64))
        .index_select(2, x_int.to(dtype=torch.int64))
        .view(-1)
        .index_select(0, ind.to(dtype=torch.int64))
    )

    return xy_preds_i, end_scores_i


@torch.jit._script_if_tracing
def _onnx_heatmaps_to_keypoints_loop(
    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
):
    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)

    for i in range(int(rois.size(0))):
        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
        )
        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
        end_scores = torch.cat(
            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
        )
    return xy_preds, end_scores


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps.

    Returns ``xy_preds`` of shape (#rois, #keypoints, 3), where the last dimension holds
    (x, y, visibility) for each keypoint, and ``end_scores`` of shape (#rois, #keypoints)
    holding the heatmap score at each predicted location.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps,
            rois,
            widths_ceil,
            heights_ceil,
            widths,
            heights,
            offset_x,
            offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
        )
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = F.interpolate(
            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
        )[:, 0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
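

# Usage sketch (shapes are illustrative assumptions): heatmaps_to_keypoints is the
# approximate inverse of keypoints_to_heatmap — each per-roi heatmap is resampled to
# the roi's size, the argmax cell is taken per keypoint, and the cell centre is mapped
# back to image coordinates.
def _example_heatmaps_to_keypoints():
    maps = torch.randn(2, 17, 56, 56)                     # 2 rois, 17 keypoints
    rois = torch.tensor([[0.0, 0.0, 100.0, 50.0], [10.0, 10.0, 60.0, 90.0]])
    xy_preds, scores = heatmaps_to_keypoints(maps, rois)  # (2, 17, 3), (2, 17)
    return xy_preds, scores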


def lines_point_pair_loss(line_logits, proposals, gt_lines, line_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = line_logits.shape
    if H != W:
        raise ValueError(
            f"line_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_lines, line_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    line_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if line_targets.numel() == 0 or len(valid) == 0:
        return line_logits.sum() * 0

    line_logits = line_logits.view(N * K, H * W)

    line_loss = F.cross_entropy(line_logits[valid], line_targets[valid])
    return line_loss
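
# Note: the line head reuses the keypoint machinery — each line is described by endpoint
# coordinates in the same (x, y, visibility) layout as keypoints, targets are encoded with
# keypoints_to_heatmap, and the loss is a cross-entropy over the flattened H * W heatmap
# cells of the predicted endpoint locations.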


def line_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    line_probs = []
    line_scores = []
    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)

    for xx, bb in zip(x2, boxes):
        line_prob, scores = heatmaps_to_keypoints(xx, bb)
        line_probs.append(line_prob)
        line_scores.append(scores)

    return line_probs, line_scores


def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = keypoint_logits.shape
    if H != W:
        raise ValueError(
            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    keypoint_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if keypoint_targets.numel() == 0 or len(valid) == 0:
        return keypoint_logits.sum() * 0

    keypoint_logits = keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
    return keypoint_loss


def keypointrcnn_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    kp_probs = []
    kp_scores = []
    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)

    for xx, bb in zip(x2, boxes):
        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
        kp_probs.append(kp_prob)
        kp_scores.append(scores)

    return kp_probs, kp_scores


def _onnx_expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half = w_half.to(dtype=torch.float32) * scale
    h_half = h_half.to(dtype=torch.float32) * scale

    boxes_exp0 = x_c - w_half
    boxes_exp1 = y_c - h_half
    boxes_exp2 = x_c + w_half
    boxes_exp3 = y_c + h_half
    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
    return boxes_exp


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


@torch.jit.unused
def expand_masks_tracing_scale(M, padding):
    # type: (int, int) -> float
    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)


def expand_masks(mask, padding):
    # type: (Tensor, int) -> Tuple[Tensor, float]
    M = mask.shape[-1]
    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
        scale = expand_masks_tracing_scale(M, padding)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = F.pad(mask, (padding,) * 4)
    return padded_mask, scale
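
# Worked example: with the typical mask size M = 28 and padding = 1, expand_masks pads
# each 28x28 mask to 30x30 and returns scale = (28 + 2 * 1) / 28 ≈ 1.071; expand_boxes
# then enlarges every box by the same factor so that the padded mask and the box still
# cover the same image region when the mask is pasted back.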


def paste_mask_in_image(mask, box, im_h, im_w):
    # type: (Tensor, Tensor, int, int) -> Tensor
    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
    return im_mask


def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
    one = torch.ones(1, dtype=torch.int64)
    zero = torch.zeros(1, dtype=torch.int64)

    w = box[2] - box[0] + one
    h = box[3] - box[1] + one
    w = torch.max(torch.cat((w, one)))
    h = torch.max(torch.cat((h, one)))

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))

    # Resize mask
    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))

    unpadded_im_mask = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]

    # TODO : replace below with a dynamic padding when support is added in ONNX
    # pad y
    zeros_y0 = torch.zeros(y_0, unpadded_im_mask.size(1))
    zeros_y1 = torch.zeros(im_h - y_1, unpadded_im_mask.size(1))
    concat_0 = torch.cat((zeros_y0, unpadded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
    # pad x
    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
    return im_mask


@torch.jit._script_if_tracing
def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
    res_append = torch.zeros(0, im_h, im_w)
    for i in range(masks.size(0)):
        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
        mask_res = mask_res.unsqueeze(0)
        res_append = torch.cat((res_append, mask_res))
    return res_append


def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
        )[:, None]
    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
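

# Usage sketch (illustrative shapes, not mandated by this module): pasting two 28x28
# mask probability maps into a 100x120 image given their boxes in (x1, y1, x2, y2)
# image coordinates; the result is a (num_masks, 1, H, W) tensor.
def _example_paste_masks_in_image():
    masks = torch.rand(2, 1, 28, 28)
    boxes = torch.tensor([[10.0, 10.0, 40.0, 60.0], [50.0, 20.0, 110.0, 90.0]])
    return paste_masks_in_image(masks, boxes, (100, 120), padding=1)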


class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Line
        line_roi_pool=None,
        line_head=None,
        line_predictor=None,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.line_roi_pool = line_roi_pool
        self.line_head = line_head
        self.line_predictor = line_predictor

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def has_line(self):
        if self.line_roi_pool is None:
            return False
        if self.line_head is None:
            return False
        if self.line_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # Label background (below the low threshold)
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # Label ignore proposals (between low and high thresholds)
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        if targets is None:
            raise ValueError("targets should not be None")
        if not all(["boxes" in t for t in targets]):
            raise ValueError("Every element of targets should have a boxes key")
        if not all(["labels" in t for t in targets]):
            raise ValueError("Every element of targets should have a labels key")
        if self.has_mask():
            if not all(["masks" in t for t in targets]):
                raise ValueError("Every element of targets should have a masks key")

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        if targets is None:
            raise ValueError("targets should not be None")
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append ground-truth bboxes to proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # get matching gt indices for each proposal
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample a fixed proportion of positive-negative proposals
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            img_sampled_inds = sampled_inds[img_id]
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            labels[img_id] = labels[img_id][img_sampled_inds]
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])

        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (Dict[str, Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                if t["boxes"].dtype not in floating_point_types:
                    raise TypeError(f"target boxes must be of float type, instead got {t['boxes'].dtype}")
                if t["labels"].dtype != torch.int64:
                    raise TypeError(f"target labels must be of int64 type, instead got {t['labels'].dtype}")
                if self.has_keypoint():
                    if t["keypoints"].dtype != torch.float32:
                        raise TypeError(f"target keypoints must be of float type, instead got {t['keypoints'].dtype}")

        if self.training:
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            if targets is not None:
                # evaluation with ground truth available: sample proposals as in training
                # so that validation losses can be reported alongside the detections
                proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
            else:
                labels = None
                regression_targets = None
                matched_idxs = None

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        box_features = self.box_head(box_features)
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        # _, C, H, W = features['0'].shape  # ignore batch_size, we only care about C, H, W
        if self.training:
            if labels is None:
                raise ValueError("labels cannot be None")
            if regression_targets is None:
                raise ValueError("regression_targets cannot be None")
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            if targets is not None:
                loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
                losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
            # keep the per-proposal `labels` from select_training_samples intact: the
            # loss branches below still need them when targets are provided at eval time
            boxes, scores, pred_labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": pred_labels[i],
                        "scores": scores[i],
                    }
                )

        if self.has_line():
            line_proposals = [p["boxes"] for p in result]
            # if line_proposals is None or len(line_proposals) == 0:
            #     # return empty features here or skip this part of the computation
            #     return torch.empty(0, C, H, W).to(features['0'].device)
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                line_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    line_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                if targets is not None:
                    # evaluation with ground truth: rebuild the positive proposals so the
                    # line loss can be computed exactly as in training
                    num_images = len(proposals)
                    line_proposals = []
                    pos_matched_idxs = []
                    if matched_idxs is None:
                        raise ValueError("matched_idxs should not be None when targets are provided")
                    for img_id in range(num_images):
                        pos = torch.where(labels[img_id] > 0)[0]
                        line_proposals.append(proposals[img_id][pos])
                        pos_matched_idxs.append(matched_idxs[img_id][pos])
                else:
                    pos_matched_idxs = None

            line_features = self.line_roi_pool(features, line_proposals, image_shapes)
            line_features = self.line_head(line_features)
            line_logits = self.line_predictor(line_features)

            loss_line = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")
                gt_lines = [t["lines"] for t in targets]
                rcnn_loss_line = lines_point_pair_loss(line_logits, line_proposals, gt_lines, pos_matched_idxs)
                loss_line = {"loss_line": rcnn_loss_line}
            else:
                if targets is not None:
                    gt_lines = [t["lines"] for t in targets]
                    rcnn_loss_line = lines_point_pair_loss(line_logits, line_proposals, gt_lines, pos_matched_idxs)
                    loss_line = {"loss_line": rcnn_loss_line}
                else:
                    if line_logits is None or line_proposals is None:
                        raise ValueError(
                            "both line_logits and line_proposals should not be None when not in training mode"
                        )
                    line_probs, line_scores = line_inference(line_logits, line_proposals)
                    for line_prob, ls, r in zip(line_probs, line_scores, result):
                        r["lines"] = line_prob
                        r["lines_scores"] = ls
            losses.update(loss_line)

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                if targets is None or pos_matched_idxs is None or mask_logits is None:
                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if self.has_keypoint():
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            keypoint_features = self.keypoint_head(keypoint_features)
            keypoint_logits = self.keypoint_predictor(keypoint_features)

            loss_keypoint = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                if keypoint_logits is None or keypoint_proposals is None:
                    raise ValueError(
                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
                    )

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps
            losses.update(loss_keypoint)

        return result, losses
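

# Construction sketch (assumptions flagged inline): RoIHeads is normally wired up by a
# detection model builder. The sketch below shows the minimal box branch only, using the
# standard torchvision components (MultiScaleRoIAlign, TwoMLPHead, FastRCNNPredictor);
# in this repository the equivalents would come from libs.vision_libs instead, and the
# line/mask/keypoint heads would be passed in addition.
def _example_build_roi_heads(num_classes=3, out_channels=256):
    from torchvision.ops import MultiScaleRoIAlign  # assumed torchvision equivalent
    from torchvision.models.detection.faster_rcnn import TwoMLPHead, FastRCNNPredictor

    box_roi_pool = MultiScaleRoIAlign(featmap_names=["0"], output_size=7, sampling_ratio=2)
    resolution = box_roi_pool.output_size[0]
    representation_size = 1024
    box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
    box_predictor = FastRCNNPredictor(representation_size, num_classes)
    return RoIHeads(
        box_roi_pool,
        box_head,
        box_predictor,
        fg_iou_thresh=0.5,
        bg_iou_thresh=0.5,
        batch_size_per_image=512,
        positive_fraction=0.25,
        bbox_reg_weights=None,
        score_thresh=0.05,
        nms_thresh=0.5,
        detections_per_img=100,
    )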