# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

import contextlib
import math
import re
import time

import cv2
import numpy as np
import torch
import torch.nn.functional as F

from ultralytics.utils import LOGGER
from ultralytics.utils.metrics import batch_probiou

class Profile(contextlib.ContextDecorator):
    """
    YOLOv8 Profile class. Use as a decorator with @Profile() or as a context manager with 'with Profile():'.

    Example:
        ```python
        from ultralytics.utils.ops import Profile

        with Profile(device=device) as dt:
            pass  # slow operation here

        print(dt)  # prints "Elapsed time is 9.5367431640625e-07 s"
        ```
    """

    def __init__(self, t=0.0, device: torch.device = None):
        """
        Initialize the Profile class.

        Args:
            t (float): Initial time. Defaults to 0.0.
            device (torch.device): Device used for model inference. Defaults to None (cpu).
        """
        self.t = t
        self.device = device
        self.cuda = bool(device and str(device).startswith("cuda"))

    def __enter__(self):
        """Start timing."""
        self.start = self.time()
        return self

    def __exit__(self, type, value, traceback):  # noqa
        """Stop timing."""
        self.dt = self.time() - self.start  # delta-time
        self.t += self.dt  # accumulate dt

    def __str__(self):
        """Returns a human-readable string representing the accumulated elapsed time in the profiler."""
        return f"Elapsed time is {self.t} s"

    def time(self):
        """Get current time."""
        if self.cuda:
            torch.cuda.synchronize(self.device)
        return time.time()

def segment2box(segment, width=640, height=640):
    """
    Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy).

    Args:
        segment (torch.Tensor): the segment label
        width (int): the width of the image. Defaults to 640
        height (int): The height of the image. Defaults to 640

    Returns:
        (np.ndarray): the minimum and maximum x and y values of the segment.
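
    Example:
        A minimal sketch with hypothetical coordinates (not from the original file):
        ```python
        import numpy as np

        segment = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 5.0]])
        segment2box(segment)  # -> array([10., 5., 50., 40.])
        ```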
  58. """
  59. x, y = segment.T # segment xy
  60. # any 3 out of 4 sides are outside the image, clip coordinates first, https://github.com/ultralytics/ultralytics/pull/18294
  61. if np.array([x.min() < 0, y.min() < 0, x.max() > width, y.max() > height]).sum() >= 3:
  62. x = x.clip(0, width)
  63. y = y.clip(0, height)
  64. inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
  65. x = x[inside]
  66. y = y[inside]
  67. return (
  68. np.array([x.min(), y.min(), x.max(), y.max()], dtype=segment.dtype)
  69. if any(x)
  70. else np.zeros(4, dtype=segment.dtype)
  71. ) # xyxy

def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False):
    """
    Rescale bounding boxes (in xyxy format by default) from the shape of the image they were originally
    specified in (img1_shape) to the shape of a different image (img0_shape).

    Args:
        img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
        boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
        img0_shape (tuple): the shape of the target image, in the format of (height, width).
        ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
            calculated based on the size difference between the two images.
        padding (bool): If True, assume the boxes are based on an image augmented with YOLO-style letterbox padding.
            If False, do regular rescaling.
        xywh (bool): Whether the box format is xywh. Defaults to False.

    Returns:
        boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
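
    Example:
        A hedged sketch, assuming a 640x640 letterboxed image and a 480x640 original
        (gain 1.0, 80 px of vertical padding):
        ```python
        import torch

        boxes = torch.tensor([[100.0, 180.0, 200.0, 280.0]])
        scale_boxes((640, 640), boxes, (480, 640))  # -> tensor([[100., 100., 200., 200.]])
        ```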
  87. """
  88. if ratio_pad is None: # calculate from img0_shape
  89. gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
  90. pad = (
  91. round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
  92. round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
  93. ) # wh padding
  94. else:
  95. gain = ratio_pad[0][0]
  96. pad = ratio_pad[1]
  97. if padding:
  98. boxes[..., 0] -= pad[0] # x padding
  99. boxes[..., 1] -= pad[1] # y padding
  100. if not xywh:
  101. boxes[..., 2] -= pad[0] # x padding
  102. boxes[..., 3] -= pad[1] # y padding
  103. boxes[..., :4] /= gain
  104. return clip_boxes(boxes, img0_shape)

def make_divisible(x, divisor):
    """
    Returns the smallest number greater than or equal to x that is divisible by the given divisor.

    Args:
        x (int): The number to make divisible.
        divisor (int | torch.Tensor): The divisor.

    Returns:
        (int): The nearest number divisible by the divisor.
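
    Example:
        A quick sketch:
        ```python
        make_divisible(100, 32)  # -> 128, i.e. ceil(100 / 32) * 32
        ```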
  113. """
  114. if isinstance(divisor, torch.Tensor):
  115. divisor = int(divisor.max()) # to int
  116. return math.ceil(x / divisor) * divisor

def nms_rotated(boxes, scores, threshold=0.45):
    """
    NMS for oriented bounding boxes using probiou and fast-nms.

    Args:
        boxes (torch.Tensor): Rotated bounding boxes, shape (N, 5), format xywhr.
        scores (torch.Tensor): Confidence scores, shape (N,).
        threshold (float, optional): IoU threshold. Defaults to 0.45.

    Returns:
        (torch.Tensor): Indices of boxes to keep after NMS.
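
    Example:
        A hedged sketch with hypothetical boxes; the first two overlap heavily, so the
        lower-scoring one is expected to be suppressed:
        ```python
        import torch

        boxes = torch.tensor([[50.0, 50.0, 20.0, 10.0, 0.0],
                              [51.0, 50.0, 20.0, 10.0, 0.0],
                              [200.0, 200.0, 20.0, 10.0, 0.5]])
        scores = torch.tensor([0.9, 0.8, 0.7])
        keep = nms_rotated(boxes, scores)  # expected to keep indices 0 and 2
        ```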
  126. """
  127. if len(boxes) == 0:
  128. return np.empty((0,), dtype=np.int8)
  129. sorted_idx = torch.argsort(scores, descending=True)
  130. boxes = boxes[sorted_idx]
  131. ious = batch_probiou(boxes, boxes).triu_(diagonal=1)
  132. pick = torch.nonzero(ious.max(dim=0)[0] < threshold).squeeze_(-1)
  133. return sorted_idx[pick]

def non_max_suppression(
    prediction,
    conf_thres=0.25,
    iou_thres=0.45,
    classes=None,
    agnostic=False,
    multi_label=False,
    labels=(),
    max_det=300,
    nc=0,  # number of classes (optional)
    max_time_img=0.05,
    max_nms=30000,
    max_wh=7680,
    in_place=True,
    rotated=False,
):
    """
    Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.

    Args:
        prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
            containing the predicted boxes, classes, and masks. The tensor should be in the format
            output by a model, such as YOLO.
        conf_thres (float): The confidence threshold below which boxes will be filtered out.
            Valid values are between 0.0 and 1.0.
        iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
            Valid values are between 0.0 and 1.0.
        classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
        agnostic (bool): If True, the model is agnostic to the number of classes, and all
            classes will be considered as one.
        multi_label (bool): If True, each box may have multiple labels.
        labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
            list contains the apriori labels for a given image. The list should be in the format
            output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
        max_det (int): The maximum number of boxes to keep after NMS.
        nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
        max_time_img (float): The maximum time (seconds) for processing one image.
        max_nms (int): The maximum number of boxes passed into torchvision.ops.nms().
        max_wh (int): The maximum box width and height in pixels.
        in_place (bool): If True, the input prediction tensor will be modified in place.
        rotated (bool): Whether Oriented Bounding Boxes (OBB) are being passed for NMS.

    Returns:
        (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
            shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
            (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
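
    Example:
        A hedged sketch with a hypothetical raw output: 1 image, 2 classes, 3 candidate
        boxes, of which only the first clears the confidence threshold:
        ```python
        import torch

        pred = torch.zeros(1, 6, 3)  # (batch, 4 box + 2 class channels, num_boxes)
        pred[0, :4, 0] = torch.tensor([50.0, 50.0, 20.0, 20.0])  # xywh
        pred[0, 4, 0] = 0.9  # class-0 confidence
        out = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, nc=2)
        # out[0] -> shape (1, 6): tensor([[40., 40., 60., 60., 0.9, 0.]])
        ```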
  178. """
  179. import torchvision # scope for faster 'import ultralytics'
  180. # Checks
  181. assert 0 <= conf_thres <= 1, f"Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0"
  182. assert 0 <= iou_thres <= 1, f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
  183. if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation model, output = (inference_out, loss_out)
  184. prediction = prediction[0] # select only inference output
  185. if classes is not None:
  186. classes = torch.tensor(classes, device=prediction.device)
  187. if prediction.shape[-1] == 6: # end-to-end model (BNC, i.e. 1,300,6)
  188. output = [pred[pred[:, 4] > conf_thres][:max_det] for pred in prediction]
  189. if classes is not None:
  190. output = [pred[(pred[:, 5:6] == classes).any(1)] for pred in output]
  191. return output
  192. bs = prediction.shape[0] # batch size (BCN, i.e. 1,84,6300)
  193. nc = nc or (prediction.shape[1] - 4) # number of classes
  194. nm = prediction.shape[1] - nc - 4 # number of masks
  195. mi = 4 + nc # mask start index
  196. xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
  197. # Settings
  198. # min_wh = 2 # (pixels) minimum box width and height
  199. time_limit = 2.0 + max_time_img * bs # seconds to quit after
  200. multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
  201. prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
  202. if not rotated:
  203. if in_place:
  204. prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
  205. else:
  206. prediction = torch.cat((xywh2xyxy(prediction[..., :4]), prediction[..., 4:]), dim=-1) # xywh to xyxy
  207. t = time.time()
  208. output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
  209. for xi, x in enumerate(prediction): # image index, image inference
  210. # Apply constraints
  211. # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
  212. x = x[xc[xi]] # confidence
  213. # Cat apriori labels if autolabelling
  214. if labels and len(labels[xi]) and not rotated:
  215. lb = labels[xi]
  216. v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
  217. v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
  218. v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
  219. x = torch.cat((x, v), 0)
  220. # If none remain process next image
  221. if not x.shape[0]:
  222. continue
  223. # Detections matrix nx6 (xyxy, conf, cls)
  224. box, cls, mask = x.split((4, nc, nm), 1)
  225. if multi_label:
  226. i, j = torch.where(cls > conf_thres)
  227. x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
  228. else: # best class only
  229. conf, j = cls.max(1, keepdim=True)
  230. x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
  231. # Filter by class
  232. if classes is not None:
  233. x = x[(x[:, 5:6] == classes).any(1)]
  234. # Check shape
  235. n = x.shape[0] # number of boxes
  236. if not n: # no boxes
  237. continue
  238. if n > max_nms: # excess boxes
  239. x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
  240. # Batched NMS
  241. c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
  242. scores = x[:, 4] # scores
  243. if rotated:
  244. boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1) # xywhr
  245. i = nms_rotated(boxes, scores, iou_thres)
  246. else:
  247. boxes = x[:, :4] + c # boxes (offset by class)
  248. i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
  249. i = i[:max_det] # limit detections
  250. # # Experimental
  251. # merge = False # use merge-NMS
  252. # if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
  253. # # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
  254. # from .metrics import box_iou
  255. # iou = box_iou(boxes[i], boxes) > iou_thres # IoU matrix
  256. # weights = iou * scores[None] # box weights
  257. # x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
  258. # redundant = True # require redundant detections
  259. # if redundant:
  260. # i = i[iou.sum(1) > 1] # require redundancy
  261. output[xi] = x[i]
  262. if (time.time() - t) > time_limit:
  263. LOGGER.warning(f"WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded")
  264. break # time limit exceeded
  265. return output

def clip_boxes(boxes, shape):
    """
    Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.

    Args:
        boxes (torch.Tensor): The bounding boxes to clip.
        shape (tuple): The shape of the image.

    Returns:
        (torch.Tensor | numpy.ndarray): The clipped boxes.
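
    Example:
        A quick sketch; the box extends past a 480x640 image and is clipped back:
        ```python
        import torch

        boxes = torch.tensor([[-10.0, 5.0, 700.0, 500.0]])
        clip_boxes(boxes, (480, 640))  # -> tensor([[0., 5., 640., 480.]])
        ```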
  274. """
  275. if isinstance(boxes, torch.Tensor): # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
  276. boxes[..., 0] = boxes[..., 0].clamp(0, shape[1]) # x1
  277. boxes[..., 1] = boxes[..., 1].clamp(0, shape[0]) # y1
  278. boxes[..., 2] = boxes[..., 2].clamp(0, shape[1]) # x2
  279. boxes[..., 3] = boxes[..., 3].clamp(0, shape[0]) # y2
  280. else: # np.array (faster grouped)
  281. boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
  282. boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
  283. return boxes

def clip_coords(coords, shape):
    """
    Clip line coordinates to the image boundaries.

    Args:
        coords (torch.Tensor | numpy.ndarray): A list of line coordinates.
        shape (tuple): A tuple of integers representing the size of the image in the format (height, width).

    Returns:
        (torch.Tensor | numpy.ndarray): Clipped coordinates
    """
    if isinstance(coords, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
        coords[..., 0] = coords[..., 0].clamp(0, shape[1])  # x
        coords[..., 1] = coords[..., 1].clamp(0, shape[0])  # y
    else:  # np.array (faster grouped)
        coords[..., 0] = coords[..., 0].clip(0, shape[1])  # x
        coords[..., 1] = coords[..., 1].clip(0, shape[0])  # y
    return coords

def scale_image(masks, im0_shape, ratio_pad=None):
    """
    Takes a mask, and resizes it to the original image size.

    Args:
        masks (np.ndarray): Resized and padded masks/images, [h, w, num]/[h, w, 3].
        im0_shape (tuple): The original image shape.
        ratio_pad (tuple): The ratio of the padding to the original image.

    Returns:
        masks (np.ndarray): The masks that are being returned with shape [h, w, num].
    """
    # Rescale coordinates (xyxy) from im1_shape to im0_shape
    im1_shape = masks.shape
    if im1_shape[:2] == im0_shape[:2]:
        return masks
    if ratio_pad is None:  # calculate from im0_shape
        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
        pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
    else:
        # gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    top, left = int(pad[1]), int(pad[0])  # y, x
    bottom, right = int(im1_shape[0] - pad[1]), int(im1_shape[1] - pad[0])

    if len(masks.shape) < 2:
        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
    masks = masks[top:bottom, left:right]
    masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]))
    if len(masks.shape) == 2:
        masks = masks[:, :, None]
    return masks

def xyxy2xywh(x):
    """
    Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the
    top-left corner and (x2, y2) is the bottom-right corner.

    Args:
        x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height) format.
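
    Example:
        A quick sketch:
        ```python
        import torch

        xyxy2xywh(torch.tensor([[10.0, 20.0, 30.0, 60.0]]))  # -> tensor([[20., 40., 20., 40.]])
        ```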
  337. """
  338. assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
  339. y = empty_like(x) # faster than clone/copy
  340. y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center
  341. y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center
  342. y[..., 2] = x[..., 2] - x[..., 0] # width
  343. y[..., 3] = x[..., 3] - x[..., 1] # height
  344. return y

def xywh2xyxy(x):
    """
    Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
    top-left corner and (x2, y2) is the bottom-right corner. Note: ops per 2 channels faster than per channel.

    Args:
        x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
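
    Example:
        A quick sketch, inverting the xyxy2xywh example above:
        ```python
        import torch

        xywh2xyxy(torch.tensor([[20.0, 40.0, 20.0, 40.0]]))  # -> tensor([[10., 20., 30., 60.]])
        ```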
  353. """
  354. assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
  355. y = empty_like(x) # faster than clone/copy
  356. xy = x[..., :2] # centers
  357. wh = x[..., 2:] / 2 # half width-height
  358. y[..., :2] = xy - wh # top left xy
  359. y[..., 2:] = xy + wh # bottom right xy
  360. return y

def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
    """
    Convert normalized bounding box coordinates to pixel coordinates.

    Args:
        x (np.ndarray | torch.Tensor): The bounding box coordinates.
        w (int): Width of the image. Defaults to 640
        h (int): Height of the image. Defaults to 640
        padw (int): Padding width. Defaults to 0
        padh (int): Padding height. Defaults to 0

    Returns:
        y (np.ndarray | torch.Tensor): The coordinates of the bounding box in the format [x1, y1, x2, y2] where
            x1,y1 is the top-left corner, x2,y2 is the bottom-right corner of the bounding box.
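
    Example:
        A quick sketch with a hypothetical 640x480 image:
        ```python
        import numpy as np

        xywhn2xyxy(np.array([[0.5, 0.5, 0.25, 0.5]]), w=640, h=480)  # -> array([[240., 120., 400., 360.]])
        ```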
  373. """
  374. assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
  375. y = empty_like(x) # faster than clone/copy
  376. y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw # top left x
  377. y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh # top left y
  378. y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw # bottom right x
  379. y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh # bottom right y
  380. return y

def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
    """
    Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height, normalized) format, where
    x, y, width and height are normalized to image dimensions.

    Args:
        x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.
        w (int): The width of the image. Defaults to 640
        h (int): The height of the image. Defaults to 640
        clip (bool): If True, the boxes will be clipped to the image boundaries. Defaults to False
        eps (float): The minimum value of the box's width and height. Defaults to 0.0

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height, normalized) format
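
    Example:
        A quick sketch, inverting the xywhn2xyxy example above:
        ```python
        import numpy as np

        xyxy2xywhn(np.array([[240.0, 120.0, 400.0, 360.0]]), w=640, h=480)  # -> array([[0.5, 0.5, 0.25, 0.5]])
        ```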
  393. """
  394. if clip:
  395. x = clip_boxes(x, (h - eps, w - eps))
  396. assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
  397. y = empty_like(x) # faster than clone/copy
  398. y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w # x center
  399. y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h # y center
  400. y[..., 2] = (x[..., 2] - x[..., 0]) / w # width
  401. y[..., 3] = (x[..., 3] - x[..., 1]) / h # height
  402. return y

def xywh2ltwh(x):
    """
    Convert the bounding box format from [x, y, w, h] to [x1, y1, w, h], where x1, y1 are the top-left coordinates.

    Args:
        x (np.ndarray | torch.Tensor): The input tensor with the bounding box coordinates in the xywh format

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in the xyltwh format
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top left x
    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top left y
    return y

def xyxy2ltwh(x):
    """
    Convert nx4 bounding boxes from [x1, y1, x2, y2] to [x1, y1, w, h], where xy1=top-left, xy2=bottom-right.

    Args:
        x (np.ndarray | torch.Tensor): The input tensor with the bounding boxes coordinates in the xyxy format

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in the xyltwh format.
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 2] = x[..., 2] - x[..., 0]  # width
    y[..., 3] = x[..., 3] - x[..., 1]  # height
    return y

def ltwh2xywh(x):
    """
    Convert nx4 boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center.

    Args:
        x (torch.Tensor): the input tensor

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in the xywh format.
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = x[..., 0] + x[..., 2] / 2  # center x
    y[..., 1] = x[..., 1] + x[..., 3] / 2  # center y
    return y

def xyxyxyxy2xywhr(x):
    """
    Convert batched Oriented Bounding Boxes (OBB) from [xy1, xy2, xy3, xy4] to [xywh, rotation]. Rotation values are
    returned in radians from 0 to pi/2.

    Args:
        x (numpy.ndarray | torch.Tensor): Input box corners [xy1, xy2, xy3, xy4] of shape (n, 8).

    Returns:
        (numpy.ndarray | torch.Tensor): Converted data in [cx, cy, w, h, rotation] format of shape (n, 5).
    """
    is_torch = isinstance(x, torch.Tensor)
    points = x.cpu().numpy() if is_torch else x
    points = points.reshape(len(x), -1, 2)
    rboxes = []
    for pts in points:
        # NOTE: Use cv2.minAreaRect to get accurate xywhr,
        # especially when some objects are cut off by augmentations in the dataloader.
        (cx, cy), (w, h), angle = cv2.minAreaRect(pts)
        rboxes.append([cx, cy, w, h, angle / 180 * np.pi])
    return torch.tensor(rboxes, device=x.device, dtype=x.dtype) if is_torch else np.asarray(rboxes)

def xywhr2xyxyxyxy(x):
    """
    Convert batched Oriented Bounding Boxes (OBB) from [xywh, rotation] to [xy1, xy2, xy3, xy4]. Rotation values should
    be in radians from 0 to pi/2.

    Args:
        x (numpy.ndarray | torch.Tensor): Boxes in [cx, cy, w, h, rotation] format of shape (n, 5) or (b, n, 5).

    Returns:
        (numpy.ndarray | torch.Tensor): Converted corner points of shape (n, 4, 2) or (b, n, 4, 2).
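
    Example:
        A quick sketch with a single axis-aligned box (rotation 0), so the corners are easy to verify by hand:
        ```python
        import torch

        box = torch.tensor([[50.0, 50.0, 20.0, 10.0, 0.0]])
        xywhr2xyxyxyxy(box)  # -> shape (1, 4, 2): corners (60, 55), (60, 45), (40, 45), (40, 55)
        ```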
  466. """
  467. cos, sin, cat, stack = (
  468. (torch.cos, torch.sin, torch.cat, torch.stack)
  469. if isinstance(x, torch.Tensor)
  470. else (np.cos, np.sin, np.concatenate, np.stack)
  471. )
  472. ctr = x[..., :2]
  473. w, h, angle = (x[..., i : i + 1] for i in range(2, 5))
  474. cos_value, sin_value = cos(angle), sin(angle)
  475. vec1 = [w / 2 * cos_value, w / 2 * sin_value]
  476. vec2 = [-h / 2 * sin_value, h / 2 * cos_value]
  477. vec1 = cat(vec1, -1)
  478. vec2 = cat(vec2, -1)
  479. pt1 = ctr + vec1 + vec2
  480. pt2 = ctr + vec1 - vec2
  481. pt3 = ctr - vec1 - vec2
  482. pt4 = ctr - vec1 + vec2
  483. return stack([pt1, pt2, pt3, pt4], -2)

def ltwh2xyxy(x):
    """
    It converts the bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.

    Args:
        x (np.ndarray | torch.Tensor): the input boxes in ltwh format

    Returns:
        y (np.ndarray | torch.Tensor): the xyxy coordinates of the bounding boxes.
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 2] = x[..., 2] + x[..., 0]  # x2 = x1 + w
    y[..., 3] = x[..., 3] + x[..., 1]  # y2 = y1 + h
    return y

def segments2boxes(segments):
    """
    It converts segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh).

    Args:
        segments (list): list of segments, each segment is a list of points, each point is a list of x, y coordinates

    Returns:
        (np.ndarray): the xywh coordinates of the bounding boxes.
    """
    boxes = []
    for s in segments:
        x, y = s.T  # segment xy
        boxes.append([x.min(), y.min(), x.max(), y.max()])  # xyxy
    return xyxy2xywh(np.array(boxes))  # xywh

def resample_segments(segments, n=1000):
    """
    Takes a list of segment arrays and returns a list of segment arrays resampled to n points each.

    Args:
        segments (list): a list of (m, 2) arrays, where m is the number of points in the segment.
        n (int): number of points to resample each segment to. Defaults to 1000

    Returns:
        segments (list): the resampled segments.
    """
    for i, s in enumerate(segments):
        if len(s) == n:
            continue
        s = np.concatenate((s, s[0:1, :]), axis=0)  # close the polygon
        x = np.linspace(0, len(s) - 1, n - len(s) if len(s) < n else n)
        xp = np.arange(len(s))
        x = np.insert(x, np.searchsorted(x, xp), xp) if len(s) < n else x  # keep original points when upsampling
        segments[i] = (
            np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)], dtype=np.float32).reshape(2, -1).T
        )  # segment xy
    return segments

def crop_mask(masks, boxes):
    """
    It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.

    Args:
        masks (torch.Tensor): [n, h, w] tensor of masks
        boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form

    Returns:
        (torch.Tensor): The masks are being cropped to the bounding box.
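
    Example:
        A quick sketch; everything outside the 2x2 box interior is zeroed:
        ```python
        import torch

        masks = torch.ones(1, 4, 4)
        boxes = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
        crop_mask(masks, boxes)  # ones only where 1 <= x < 3 and 1 <= y < 3
        ```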
  537. """
  538. _, h, w = masks.shape
  539. x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
  540. r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
  541. c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
  542. return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))

def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Apply masks to bounding boxes using the output of the mask head.

    Args:
        protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
        masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
        bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
        shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
        upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.

    Returns:
        (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
            are the height and width of the input image. The mask is applied to the bounding boxes.
    """
    c, mh, mw = protos.shape  # CHW
    ih, iw = shape
    masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)  # CHW
    width_ratio = mw / iw
    height_ratio = mh / ih

    downsampled_bboxes = bboxes.clone()
    downsampled_bboxes[:, 0] *= width_ratio
    downsampled_bboxes[:, 2] *= width_ratio
    downsampled_bboxes[:, 3] *= height_ratio
    downsampled_bboxes[:, 1] *= height_ratio

    masks = crop_mask(masks, downsampled_bboxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0]  # CHW
    return masks.gt_(0.0)

def process_mask_native(protos, masks_in, bboxes, shape):
    """
    It takes the output of the mask head, and crops it after upsampling to the bounding boxes.

    Args:
        protos (torch.Tensor): [mask_dim, mask_h, mask_w]
        masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms.
        bboxes (torch.Tensor): [n, 4], n is number of masks after nms.
        shape (tuple): The size of the input image (h,w).

    Returns:
        masks (torch.Tensor): The returned masks with dimensions [h, w, n].
    """
    c, mh, mw = protos.shape  # CHW
    masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)
    masks = scale_masks(masks[None], shape)[0]  # CHW
    masks = crop_mask(masks, bboxes)  # CHW
    return masks.gt_(0.0)

def scale_masks(masks, shape, padding=True):
    """
    Rescale segment masks to shape.

    Args:
        masks (torch.Tensor): (N, C, H, W).
        shape (tuple): Height and width.
        padding (bool): If True, assume the masks are based on an image augmented with YOLO-style letterbox padding.
            If False, do regular rescaling.

    Returns:
        (torch.Tensor): The rescaled masks.
    """
    mh, mw = masks.shape[2:]
    gain = min(mh / shape[0], mw / shape[1])  # gain = old / new
    pad = [mw - shape[1] * gain, mh - shape[0] * gain]  # wh padding
    if padding:
        pad[0] /= 2
        pad[1] /= 2
    top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0)  # y, x
    bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
    masks = masks[..., top:bottom, left:right]

    masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False)  # NCHW
    return masks

def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize=False, padding=True):
    """
    Rescale segment coordinates (xy) from img1_shape to img0_shape.

    Args:
        img1_shape (tuple): The shape of the image that the coords are from.
        coords (torch.Tensor): the coords to be scaled of shape n,2.
        img0_shape (tuple): the shape of the image that the segmentation is being applied to.
        ratio_pad (tuple): the ratio of the image size to the padded image size.
        normalize (bool): If True, the coordinates will be normalized to the range [0, 1]. Defaults to False.
        padding (bool): If True, assume the coords are based on an image augmented with YOLO-style letterbox padding.
            If False, do regular rescaling.

    Returns:
        coords (torch.Tensor): The scaled coordinates.
    """
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    if padding:
        coords[..., 0] -= pad[0]  # x padding
        coords[..., 1] -= pad[1]  # y padding
    coords[..., 0] /= gain
    coords[..., 1] /= gain
    coords = clip_coords(coords, img0_shape)
    if normalize:
        coords[..., 0] /= img0_shape[1]  # width
        coords[..., 1] /= img0_shape[0]  # height
    return coords

def regularize_rboxes(rboxes):
    """
    Regularize rotated boxes in range [0, pi/2].

    Args:
        rboxes (torch.Tensor): Input boxes of shape(N, 5) in xywhr format.

    Returns:
        (torch.Tensor): The regularized boxes.
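
    Example:
        A quick sketch; w < h, so the edges are swapped and the angle rotated by pi/2:
        ```python
        import torch

        rboxes = torch.tensor([[50.0, 50.0, 10.0, 20.0, -1.0]])
        regularize_rboxes(rboxes)  # -> tensor([[50.0000, 50.0000, 20.0000, 10.0000, 0.5708]])
        ```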
  643. """
  644. x, y, w, h, t = rboxes.unbind(dim=-1)
  645. # Swap edge and angle if h >= w
  646. w_ = torch.where(w > h, w, h)
  647. h_ = torch.where(w > h, h, w)
  648. t = torch.where(w > h, t, t + math.pi / 2) % math.pi
  649. return torch.stack([x, y, w_, h_, t], dim=-1) # regularized boxes

def masks2segments(masks, strategy="all"):
    """
    It takes a list of masks(n,h,w) and returns a list of segments(n,xy).

    Args:
        masks (torch.Tensor): the output of the model, which is a tensor of shape (batch_size, 160, 160)
        strategy (str): 'all' or 'largest'. Defaults to all

    Returns:
        segments (List): list of segment masks
    """
    from ultralytics.data.converter import merge_multi_segment

    segments = []
    for x in masks.int().cpu().numpy().astype("uint8"):
        c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        if c:
            if strategy == "all":  # merge and concatenate all segments
                c = (
                    np.concatenate(merge_multi_segment([x.reshape(-1, 2) for x in c]))
                    if len(c) > 1
                    else c[0].reshape(-1, 2)
                )
            elif strategy == "largest":  # select largest segment
                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
        else:
            c = np.zeros((0, 2))  # no segments found
        segments.append(c.astype("float32"))
    return segments

def convert_torch2numpy_batch(batch: torch.Tensor) -> np.ndarray:
    """
    Convert a batch of FP32 torch tensors (0.0-1.0) to a NumPy uint8 array (0-255), changing from BCHW to BHWC layout.

    Args:
        batch (torch.Tensor): Input tensor batch of shape (Batch, Channels, Height, Width) and dtype torch.float32.

    Returns:
        (np.ndarray): Output NumPy array batch of shape (Batch, Height, Width, Channels) and dtype uint8.
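
    Example:
        A quick sketch:
        ```python
        import torch

        batch = torch.rand(2, 3, 4, 4)  # BCHW float32 in [0, 1]
        out = convert_torch2numpy_batch(batch)  # out.shape == (2, 4, 4, 3), out.dtype == uint8
        ```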
  683. """
  684. return (batch.permute(0, 2, 3, 1).contiguous() * 255).clamp(0, 255).to(torch.uint8).cpu().numpy()

def clean_str(s):
    """
    Cleans a string by replacing special characters with '_' character.

    Args:
        s (str): a string needing special characters replaced

    Returns:
        (str): a string with special characters replaced by an underscore _
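
    Example:
        A quick sketch:
        ```python
        clean_str("video@#1!.mp4")  # -> 'video__1_.mp4'
        ```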
  692. """
  693. return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)

def empty_like(x):
    """Creates empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
    return (
        torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
    )