autobatch.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
  2. """Functions for estimating the best YOLO batch size to use a fraction of the available CUDA memory in PyTorch."""
  3. import os
  4. from copy import deepcopy
  5. import numpy as np
  6. import torch
  7. from ultralytics.utils import DEFAULT_CFG, LOGGER, colorstr
  8. from ultralytics.utils.torch_utils import autocast, profile
  9. def check_train_batch_size(model, imgsz=640, amp=True, batch=-1, max_num_obj=1):
  10. """
  11. Compute optimal YOLO training batch size using the autobatch() function.
  12. Args:
  13. model (torch.nn.Module): YOLO model to check batch size for.
  14. imgsz (int, optional): Image size used for training.
  15. amp (bool, optional): Use automatic mixed precision if True.
  16. batch (float, optional): Fraction of GPU memory to use. If -1, use default.
  17. max_num_obj (int, optional): The maximum number of objects from dataset.
  18. Returns:
  19. (int): Optimal batch size computed using the autobatch() function.
  20. Note:
  21. If 0.0 < batch < 1.0, it's used as the fraction of GPU memory to use.
  22. Otherwise, a default fraction of 0.6 is used.
  23. """
  24. with autocast(enabled=amp):
  25. return autobatch(
  26. deepcopy(model).train(), imgsz, fraction=batch if 0.0 < batch < 1.0 else 0.6, max_num_obj=max_num_obj
  27. )
  28. def autobatch(model, imgsz=640, fraction=0.60, batch_size=DEFAULT_CFG.batch, max_num_obj=1):
  29. """
  30. Automatically estimate the best YOLO batch size to use a fraction of the available CUDA memory.
  31. Args:
  32. model (torch.nn.module): YOLO model to compute batch size for.
  33. imgsz (int, optional): The image size used as input for the YOLO model. Defaults to 640.
  34. fraction (float, optional): The fraction of available CUDA memory to use. Defaults to 0.60.
  35. batch_size (int, optional): The default batch size to use if an error is detected. Defaults to 16.
  36. max_num_obj (int, optional): The maximum number of objects from dataset.
  37. Returns:
  38. (int): The optimal batch size.
  39. """
  40. # Check device
  41. prefix = colorstr("AutoBatch: ")
  42. LOGGER.info(f"{prefix}Computing optimal batch size for imgsz={imgsz} at {fraction * 100}% CUDA memory utilization.")
  43. device = next(model.parameters()).device # get model device
  44. if device.type in {"cpu", "mps"}:
  45. LOGGER.info(f"{prefix} ⚠️ intended for CUDA devices, using default batch-size {batch_size}")
  46. return batch_size
  47. if torch.backends.cudnn.benchmark:
  48. LOGGER.info(f"{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}")
  49. return batch_size
  50. # Inspect CUDA memory
  51. gb = 1 << 30 # bytes to GiB (1024 ** 3)
  52. d = f"CUDA:{os.getenv('CUDA_VISIBLE_DEVICES', '0').strip()[0]}" # 'CUDA:0'
  53. properties = torch.cuda.get_device_properties(device) # device properties
  54. t = properties.total_memory / gb # GiB total
  55. r = torch.cuda.memory_reserved(device) / gb # GiB reserved
  56. a = torch.cuda.memory_allocated(device) / gb # GiB allocated
  57. f = t - (r + a) # GiB free
  58. LOGGER.info(f"{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free")
  59. # Profile batch sizes
  60. batch_sizes = [1, 2, 4, 8, 16] if t < 16 else [1, 2, 4, 8, 16, 32, 64]
  61. try:
  62. img = [torch.empty(b, 3, imgsz, imgsz) for b in batch_sizes]
  63. results = profile(img, model, n=1, device=device, max_num_obj=max_num_obj)
  64. # Fit a solution
  65. xy = [
  66. [x, y[2]]
  67. for i, (x, y) in enumerate(zip(batch_sizes, results))
  68. if y # valid result
  69. and isinstance(y[2], (int, float)) # is numeric
  70. and 0 < y[2] < t # between 0 and GPU limit
  71. and (i == 0 or not results[i - 1] or y[2] > results[i - 1][2]) # first item or increasing memory
  72. ]
  73. fit_x, fit_y = zip(*xy) if xy else ([], [])
  74. p = np.polyfit(np.log(fit_x), np.log(fit_y), deg=1) # first-degree polynomial fit in log space
  75. b = int(round(np.exp((np.log(f * fraction) - p[1]) / p[0]))) # y intercept (optimal batch size)
  76. if None in results: # some sizes failed
  77. i = results.index(None) # first fail index
  78. if b >= batch_sizes[i]: # y intercept above failure point
  79. b = batch_sizes[max(i - 1, 0)] # select prior safe point
  80. if b < 1 or b > 1024: # b outside of safe range
  81. LOGGER.info(f"{prefix}WARNING ⚠️ batch={b} outside safe range, using default batch-size {batch_size}.")
  82. b = batch_size
  83. fraction = (np.exp(np.polyval(p, np.log(b))) + r + a) / t # predicted fraction
  84. LOGGER.info(f"{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅")
  85. return b
  86. except Exception as e:
  87. LOGGER.warning(f"{prefix}WARNING ⚠️ error detected: {e}, using default batch-size {batch_size}.")
  88. return batch_size
  89. finally:
  90. torch.cuda.empty_cache()