roi_heads.py

from typing import Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops, roi_align

from libs.vision_libs.models.detection import _utils as det_utils
from collections import OrderedDict


def l2loss(input, target):
    return ((target - input) ** 2).mean(2).mean(1)


def cross_entropy_loss(logits, positive):
    nlogp = -F.log_softmax(logits, dim=0)
    return (positive * nlogp[1] + (1 - positive) * nlogp[0]).mean(2).mean(1)


def sigmoid_l1_loss(logits, target, offset=0.0, mask=None):
    logp = torch.sigmoid(logits) + offset
    loss = torch.abs(logp - target)
    if mask is not None:
        w = mask.mean(2, True).mean(1, True)
        w[w == 0] = 1
        loss = loss * (mask / w)
    return loss.mean(2).mean(1)
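

# A minimal shape sketch for the helpers above (all shapes are assumptions,
# taken from how line_head_loss calls them: per-pixel maps of size
# batch x H x W, reduced to one loss value per batch element):
#
#   logits = torch.randn(2, 4, 128, 128)   # (classes, batch, H, W)
#   positive = torch.rand(4, 128, 128)     # soft labels in [0, 1]
#   per_image = cross_entropy_loss(logits, positive)  # shape (4,)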


### compute the multi-head (junction / line map) losses
def line_head_loss(input_dict, outputs, feature, loss_weight, mode_train):
    # image = input_dict["image"]
    # target_b = input_dict["target_b"]
    # outputs, feature, aaa = self.backbone(image, target_b, input_dict["mode"])  # during training aaa is the loss; during validation it is the boxes
    result = {"feature": feature}
    batch, channel, row, col = outputs[0].shape
    T = input_dict["target"].copy()
    n_jtyp = T["junc_map"].shape[1]

    # switch to CNHW
    for task in ["junc_map"]:
        T[task] = T[task].permute(1, 0, 2, 3)
    for task in ["junc_offset"]:
        T[task] = T[task].permute(1, 2, 0, 3, 4)

    offset = [2, 3, 5]
    losses = []
    for stack, output in enumerate(outputs):
        output = output.transpose(0, 1).reshape([-1, batch, row, col]).contiguous()
        jmap = output[0: offset[0]].reshape(n_jtyp, 2, batch, row, col)
        lmap = output[offset[0]: offset[1]].squeeze(0)
        # print(f"lmap:{lmap.shape}")
        joff = output[offset[1]: offset[2]].reshape(n_jtyp, 2, batch, row, col)
        if stack == 0:
            result["preds"] = {
                "jmap": jmap.permute(2, 0, 1, 3, 4).softmax(2)[:, :, 1],
                "lmap": lmap.sigmoid(),
                "joff": joff.permute(2, 0, 1, 3, 4).sigmoid() - 0.5,
            }
            if not mode_train:
                return result

        L = OrderedDict()
        L["jmap"] = sum(
            cross_entropy_loss(jmap[i], T["junc_map"][i]) for i in range(n_jtyp)
        )
        L["lmap"] = (
            F.binary_cross_entropy_with_logits(lmap, T["line_map"], reduction="none")
            .mean(2)
            .mean(1)
        )
        L["joff"] = sum(
            sigmoid_l1_loss(joff[i, j], T["junc_offset"][i, j], -0.5, T["junc_map"][i])
            for i in range(n_jtyp)
            for j in range(2)
        )
        for loss_name in L:
            L[loss_name].mul_(loss_weight[loss_name])
        losses.append(L)
    result["losses"] = losses
    # result["aaa"] = aaa
    return result
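

# A minimal usage sketch (all shapes and values are assumptions for n_jtyp = 1:
# each head output carries 5 channels = 2 junction logits + 1 line logit +
# 2 offsets, and input_dict["target"] holds matching "junc_map",
# "junc_offset" and "line_map" tensors):
#
#   outputs = [torch.randn(4, 5, 128, 128)]          # one prediction stack
#   feature = torch.randn(4, 256, 128, 128)
#   target = {
#       "junc_map": torch.rand(4, 1, 128, 128),
#       "junc_offset": torch.rand(4, 1, 2, 128, 128),
#       "line_map": torch.rand(4, 128, 128),
#   }
#   weights = {"jmap": 8.0, "lmap": 0.5, "joff": 0.25}
#   out = line_head_loss({"target": target}, outputs, feature, weights, mode_train=True)
#   # out["losses"][0] is an OrderedDict of per-image "jmap"/"lmap"/"joff" losses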


# compute the line vectorizer loss
def line_vectorizer_loss(result, x, ys, idx, jcs, n_batch, ps, n_out_line, n_out_junc, loss_weight, mode_train):
    if not mode_train:
        p = torch.cat(ps)
        s = torch.sigmoid(x)
        b = s > 0.5
        lines = []
        score = []
        for i in range(n_batch):
            p0 = p[idx[i]: idx[i + 1]]
            s0 = s[idx[i]: idx[i + 1]]
            mask = b[idx[i]: idx[i + 1]]
            p0 = p0[mask]
            s0 = s0[mask]
            if len(p0) == 0:
                lines.append(torch.zeros([1, n_out_line, 2, 2], device=p.device))
                score.append(torch.zeros([1, n_out_line], device=p.device))
            else:
                arg = torch.argsort(s0, descending=True)
                p0, s0 = p0[arg], s0[arg]
                lines.append(p0[None, torch.arange(n_out_line) % len(p0)])
                score.append(s0[None, torch.arange(n_out_line) % len(s0)])
            for j in range(len(jcs[i])):
                if len(jcs[i][j]) == 0:
                    jcs[i][j] = torch.zeros([n_out_junc, 2], device=p.device)
                jcs[i][j] = jcs[i][j][
                    None, torch.arange(n_out_junc) % len(jcs[i][j])
                ]
        result["preds"]["lines"] = torch.cat(lines)
        result["preds"]["score"] = torch.cat(score)
        result["preds"]["juncs"] = torch.cat([jcs[i][0] for i in range(n_batch)])
        if len(jcs[i]) > 1:
            result["preds"]["junts"] = torch.cat(
                [jcs[i][1] for i in range(n_batch)]
            )

    # if input_dict["mode"] != "testing":
    y = torch.cat(ys)
    loss = nn.BCEWithLogitsLoss(reduction="none")
    loss = loss(x, y)
    lpos_mask, lneg_mask = y, 1 - y
    loss_lpos, loss_lneg = loss * lpos_mask, loss * lneg_mask

    def sum_batch(x):
        xs = [x[idx[i]: idx[i + 1]].sum()[None] for i in range(n_batch)]
        return torch.cat(xs)

    lpos = sum_batch(loss_lpos) / sum_batch(lpos_mask).clamp(min=1)
    lneg = sum_batch(loss_lneg) / sum_batch(lneg_mask).clamp(min=1)
    result["losses"][0]["lpos"] = lpos * loss_weight["lpos"]
    result["losses"][0]["lneg"] = lneg * loss_weight["lneg"]
    if mode_train:
        del result["preds"]
    return result


def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
    # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    """
    Computes the loss for Faster R-CNN.

    Args:
        class_logits (Tensor)
        box_regression (Tensor)
        labels (list[BoxList])
        regression_targets (Tensor)

    Returns:
        classification_loss (Tensor)
        box_loss (Tensor)
    """
    labels = torch.cat(labels, dim=0)
    regression_targets = torch.cat(regression_targets, dim=0)

    classification_loss = F.cross_entropy(class_logits, labels)

    # get indices that correspond to the regression targets for
    # the corresponding ground truth labels, to be used with
    # advanced indexing
    sampled_pos_inds_subset = torch.where(labels > 0)[0]
    labels_pos = labels[sampled_pos_inds_subset]
    N, num_classes = class_logits.shape
    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)

    box_loss = F.smooth_l1_loss(
        box_regression[sampled_pos_inds_subset, labels_pos],
        regression_targets[sampled_pos_inds_subset],
        beta=1 / 9,
        reduction="sum",
    )
    box_loss = box_loss / labels.numel()

    return classification_loss, box_loss
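

# A minimal usage sketch (shapes are assumptions based on the reshape above:
# N sampled proposals, C classes, and per-image label / regression-target lists):
#
#   class_logits = torch.randn(8, 3)         # (N, C)
#   box_regression = torch.randn(8, 3 * 4)   # (N, C * 4)
#   labels = [torch.tensor([0, 2, 1, 0]), torch.tensor([1, 0, 0, 2])]
#   regression_targets = [torch.randn(4, 4), torch.randn(4, 4)]
#   cls_loss, box_loss = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)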


def maskrcnn_inference(x, labels):
    # type: (Tensor, List[Tensor]) -> List[Tensor]
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the masks in the mask field of the BoxList.

    Args:
        x (Tensor): the mask logits
        labels (list[BoxList]): bounding boxes that are used as
            reference, one for each image

    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field mask
    """
    mask_prob = x.sigmoid()

    # select masks corresponding to the predicted classes
    num_masks = x.shape[0]
    boxes_per_image = [label.shape[0] for label in labels]
    labels = torch.cat(labels)
    index = torch.arange(num_masks, device=labels.device)
    mask_prob = mask_prob[index, labels][:, None]
    mask_prob = mask_prob.split(boxes_per_image, dim=0)

    return mask_prob


def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    # type: (Tensor, Tensor, Tensor, int) -> Tensor
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]


def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    """
    Args:
        proposals (list[BoxList])
        mask_logits (Tensor)
        targets (list[BoxList])

    Return:
        mask_loss (Tensor): scalar tensor containing the loss
    """
    discretization_size = mask_logits.shape[-1]
    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
    mask_targets = [
        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    ]

    labels = torch.cat(labels, dim=0)
    mask_targets = torch.cat(mask_targets, dim=0)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if mask_targets.numel() == 0:
        return mask_logits.sum() * 0

    mask_loss = F.binary_cross_entropy_with_logits(
        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
    )
    return mask_loss


def keypoints_to_heatmap(keypoints, rois, heatmap_size):
    # type: (Tensor, Tensor, int) -> Tuple[Tensor, Tensor]
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()

    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid


def _onnx_heatmaps_to_keypoints(
    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
):
    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)

    width_correction = widths_i / roi_map_width
    height_correction = heights_i / roi_map_height

    roi_map = F.interpolate(
        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
    )[:, 0]

    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

    x_int = pos % w
    y_int = (pos - x_int) // w

    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
        dtype=torch.float32
    )
    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
        dtype=torch.float32
    )

    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
    xy_preds_i = torch.stack(
        [
            xy_preds_i_0.to(dtype=torch.float32),
            xy_preds_i_1.to(dtype=torch.float32),
            xy_preds_i_2.to(dtype=torch.float32),
        ],
        0,
    )

    # TODO: simplify when indexing without rank will be supported by ONNX
    base = num_keypoints * num_keypoints + num_keypoints + 1
    ind = torch.arange(num_keypoints)
    ind = ind.to(dtype=torch.int64) * base
    end_scores_i = (
        roi_map.index_select(1, y_int.to(dtype=torch.int64))
        .index_select(2, x_int.to(dtype=torch.int64))
        .view(-1)
        .index_select(0, ind.to(dtype=torch.int64))
    )

    return xy_preds_i, end_scores_i


@torch.jit._script_if_tracing
def _onnx_heatmaps_to_keypoints_loop(
    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
):
    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)

    for i in range(int(rois.size(0))):
        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
        )
        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
        end_scores = torch.cat(
            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
        )
    return xy_preds, end_scores


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps.

    Returns ``xy_preds`` of shape (#rois, #keypoints, 3), holding (x, y, 1) for
    each keypoint, and ``end_scores`` of shape (#rois, #keypoints) with the
    corresponding heatmap scores.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps,
            rois,
            widths_ceil,
            heights_ceil,
            widths,
            heights,
            offset_x,
            offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
        )
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = F.interpolate(
            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
        )[:, 0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
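

# A minimal usage sketch (shapes are assumptions: 2 RoIs, 17 keypoints,
# 56x56 heatmaps, boxes in (x1, y1, x2, y2) image coordinates):
#
#   maps = torch.randn(2, 17, 56, 56)
#   rois = torch.tensor([[10.0, 20.0, 110.0, 220.0], [5.0, 5.0, 60.0, 90.0]])
#   xy_preds, scores = heatmaps_to_keypoints(maps, rois)
#   # xy_preds: (2, 17, 3), scores: (2, 17)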


def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = keypoint_logits.shape
    if H != W:
        raise ValueError(
            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    keypoint_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if keypoint_targets.numel() == 0 or len(valid) == 0:
        return keypoint_logits.sum() * 0

    keypoint_logits = keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
    return keypoint_loss


def keypointrcnn_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    kp_probs = []
    kp_scores = []

    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)

    for xx, bb in zip(x2, boxes):
        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
        kp_probs.append(kp_prob)
        kp_scores.append(scores)

    return kp_probs, kp_scores


def _onnx_expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half = w_half.to(dtype=torch.float32) * scale
    h_half = h_half.to(dtype=torch.float32) * scale

    boxes_exp0 = x_c - w_half
    boxes_exp1 = y_c - h_half
    boxes_exp2 = x_c + w_half
    boxes_exp3 = y_c + h_half
    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
    return boxes_exp


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


@torch.jit.unused
def expand_masks_tracing_scale(M, padding):
    # type: (int, int) -> float
    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)


def expand_masks(mask, padding):
    # type: (Tensor, int) -> Tuple[Tensor, float]
    M = mask.shape[-1]
    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
        scale = expand_masks_tracing_scale(M, padding)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = F.pad(mask, (padding,) * 4)
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w):
    # type: (Tensor, Tensor, int, int) -> Tensor
    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])]
    return im_mask


def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
    one = torch.ones(1, dtype=torch.int64)
    zero = torch.zeros(1, dtype=torch.int64)

    w = box[2] - box[0] + one
    h = box[3] - box[1] + one
    w = torch.max(torch.cat((w, one)))
    h = torch.max(torch.cat((h, one)))

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))

    # Resize mask
    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))

    unpaded_im_mask = mask[(y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])]

    # TODO: replace below with a dynamic padding when support is added in ONNX
    # pad y
    zeros_y0 = torch.zeros(y_0, unpaded_im_mask.size(1))
    zeros_y1 = torch.zeros(im_h - y_1, unpaded_im_mask.size(1))
    concat_0 = torch.cat((zeros_y0, unpaded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
    # pad x
    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
    return im_mask


@torch.jit._script_if_tracing
def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
    res_append = torch.zeros(0, im_h, im_w)
    for i in range(masks.size(0)):
        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
        mask_res = mask_res.unsqueeze(0)
        res_append = torch.cat((res_append, mask_res))
    return res_append


def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
        )[:, None]
    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
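

# A minimal usage sketch (shapes are assumptions: two 28x28 mask probabilities
# pasted into a 200x300 image, boxes in (x1, y1, x2, y2) image coordinates):
#
#   masks = torch.rand(2, 1, 28, 28)
#   boxes = torch.tensor([[10.0, 20.0, 80.0, 120.0], [50.0, 60.0, 150.0, 180.0]])
#   full_masks = paste_masks_in_image(masks, boxes, img_shape=(200, 300))
#   # full_masks: (2, 1, 200, 300)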


class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        line_head,
        line_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.line_head = line_head
        self.line_predictor = line_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_line(self):
        # if self.mask_roi_pool is None:
        #     return False
        if self.line_head is None:
            return False
        if self.line_predictor is None:
            return False
        return True

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # Label background (below the low threshold)
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # Label ignore proposals (between low and high thresholds)
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        if targets is None:
            raise ValueError("targets should not be None")
        if not all(["boxes" in t for t in targets]):
            raise ValueError("Every element of targets should have a boxes key")
        if not all(["labels" in t for t in targets]):
            raise ValueError("Every element of targets should have a labels key")
        if self.has_mask():
            if not all(["masks" in t for t in targets]):
                raise ValueError("Every element of targets should have a masks key")

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        if targets is None:
            raise ValueError("targets should not be None")
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append ground-truth bboxes to proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # get matching gt indices for each proposal
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample a fixed proportion of positive-negative proposals
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            img_sampled_inds = sampled_inds[img_id]
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            labels[img_id] = labels[img_id][img_sampled_inds]
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])

        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (List[Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                if not t["boxes"].dtype in floating_point_types:
                    raise TypeError(f"target boxes must be of float type, instead got {t['boxes'].dtype}")
                if not t["labels"].dtype == torch.int64:
                    raise TypeError(f"target labels must be of int64 type, instead got {t['labels'].dtype}")
                if self.has_keypoint():
                    if not t["keypoints"].dtype == torch.float32:
                        raise TypeError(f"target keypoints must be of float type, instead got {t['keypoints'].dtype}")

        if self.training:
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            labels = None
            regression_targets = None
            matched_idxs = None

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        box_features = self.box_head(box_features)
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        if self.training:
            if labels is None:
                raise ValueError("labels cannot be None")
            if regression_targets is None:
                raise ValueError("regression_targets cannot be None")
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": labels[i],
                        "scores": scores[i],
                    }
                )

        features_lcnn = features['0']
        if self.has_line():
            line_features = self.line_head(features_lcnn)
            loss_weight = {'jmap': 8.0, 'lmap': 0.5, 'joff': 0.25, 'lpos': 1, 'lneg': 1, 'boxes': 1.0}
            x, ys, idx, jcs, n_batch, ps, n_out_line, n_out_junc = self.line_predictor(
                features_lcnn)  # x, y, idx, jcs, n_batch, ps, self.n_out_line, self.n_out_junc

            # line loss (multitask learner)
            if self.training:
                head_result = line_head_loss(targets, line_features, features_lcnn, loss_weight, mode_train=True)
                line_result = line_vectorizer_loss(head_result, x, ys, idx, jcs, n_batch, ps, n_out_line, n_out_junc,
                                                   loss_weight, mode_train=True)
            else:
                head_result = line_head_loss(targets, line_features, features_lcnn, loss_weight, mode_train=False)
                line_result = line_vectorizer_loss(head_result, x, ys, idx, jcs, n_batch, ps, n_out_line, n_out_junc,
                                                   loss_weight, mode_train=False)

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                if targets is None or pos_matched_idxs is None or mask_logits is None:
                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if (
            self.keypoint_roi_pool is not None
            and self.keypoint_head is not None
            and self.keypoint_predictor is not None
        ):
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            keypoint_features = self.keypoint_head(keypoint_features)
            keypoint_logits = self.keypoint_predictor(keypoint_features)

            loss_keypoint = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                if keypoint_logits is None or keypoint_proposals is None:
                    raise ValueError(
                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
                    )

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps
            losses.update(loss_keypoint)

        return result, losses
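

# A minimal construction sketch (all names and threshold values are assumptions;
# in this project the box and line heads are supplied by the surrounding
# detector, and the mask/keypoint heads stay disabled by leaving them None):
#
#   roi_heads = RoIHeads(
#       box_roi_pool, box_head, box_predictor,
#       line_head, line_predictor,
#       fg_iou_thresh=0.5, bg_iou_thresh=0.5,
#       batch_size_per_image=512, positive_fraction=0.25,
#       bbox_reg_weights=None,
#       score_thresh=0.05, nms_thresh=0.5, detections_per_img=100,
#   )
#   # features is a dict of FPN maps containing at least the '0' level used
#   # by the line branch above
#   detections, losses = roi_heads(features, proposals, image_shapes, targets)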