Mengqi Lei 2 months ago
parent
commit
035b97700e

+ 29 - 0
ultralytics/__init__.py

@@ -0,0 +1,29 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+__version__ = "8.3.63"
+
+import os
+
+# Set ENV variables (place before imports)
+if not os.environ.get("OMP_NUM_THREADS"):
+    os.environ["OMP_NUM_THREADS"] = "1"  # default for reduced CPU utilization during training
+
+from ultralytics.models import NAS, RTDETR, SAM, YOLO, FastSAM, YOLOWorld
+from ultralytics.utils import ASSETS, SETTINGS
+from ultralytics.utils.checks import check_yolo as checks
+from ultralytics.utils.downloads import download
+
+settings = SETTINGS
+__all__ = (
+    "__version__",
+    "ASSETS",
+    "YOLO",
+    "YOLOWorld",
+    "NAS",
+    "SAM",
+    "FastSAM",
+    "RTDETR",
+    "checks",
+    "download",
+    "settings",
+)
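
A minimal usage sketch of the public API re-exported above; the yolo11n.pt weight name is an assumption (it matches the defaults used in annotator.py below) and is expected to resolve via the usual auto-download:

    from ultralytics import YOLO, checks, settings

    checks()  # check_yolo: prints environment and dependency info
    print(settings)  # the shared SETTINGS object exposed as `settings`
    model = YOLO("yolo11n.pt")  # assumed detection weights
    results = model("ultralytics/assets/bus.jpg")  # predict on the bundled asset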

BIN
ultralytics/assets/bus.jpg


BIN
ultralytics/assets/zidane.jpg


+ 26 - 0
ultralytics/data/__init__.py

@@ -0,0 +1,26 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from .base import BaseDataset
+from .build import build_dataloader, build_grounding, build_yolo_dataset, load_inference_source
+from .dataset import (
+    ClassificationDataset,
+    GroundingDataset,
+    SemanticDataset,
+    YOLOConcatDataset,
+    YOLODataset,
+    YOLOMultiModalDataset,
+)
+
+__all__ = (
+    "BaseDataset",
+    "ClassificationDataset",
+    "SemanticDataset",
+    "YOLODataset",
+    "YOLOMultiModalDataset",
+    "YOLOConcatDataset",
+    "GroundingDataset",
+    "build_yolo_dataset",
+    "build_grounding",
+    "build_dataloader",
+    "load_inference_source",
+)

+ 72 - 0
ultralytics/data/annotator.py

@@ -0,0 +1,72 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from pathlib import Path
+
+from ultralytics import SAM, YOLO
+
+
+def auto_annotate(
+    data,
+    det_model="yolo11x.pt",
+    sam_model="sam_b.pt",
+    device="",
+    conf=0.25,
+    iou=0.45,
+    imgsz=640,
+    max_det=300,
+    classes=None,
+    output_dir=None,
+):
+    """
+    Automatically annotates images using a YOLO object detection model and a SAM segmentation model.
+
+    This function processes images in a specified directory, detects objects using a YOLO model, and then generates
+    segmentation masks using a SAM model. The resulting annotations are saved as text files.
+
+    Args:
+        data (str): Path to a folder containing images to be annotated.
+        det_model (str): Path or name of the pre-trained YOLO detection model.
+        sam_model (str): Path or name of the pre-trained SAM segmentation model.
+        device (str): Device to run the models on (e.g., 'cpu', 'cuda', '0').
+        conf (float): Confidence threshold for detection model; default is 0.25.
+        iou (float): IoU threshold for filtering overlapping boxes in detection results; default is 0.45.
+        imgsz (int): Input image resize dimension; default is 640.
+        max_det (int): Limits detections per image to control outputs in dense scenes.
+        classes (list): Filters predictions to specified class IDs, returning only relevant detections.
+        output_dir (str | None): Directory to save the annotated results. If None, a default directory is created.
+
+    Examples:
+        >>> from ultralytics.data.annotator import auto_annotate
+        >>> auto_annotate(data="ultralytics/assets", det_model="yolo11n.pt", sam_model="mobile_sam.pt")
+
+    Notes:
+        - The function creates a new directory for output if not specified.
+        - Annotation results are saved as text files with the same names as the input images.
+        - Each line in the output text file represents a detected object with its class ID and segmentation points.
+    """
+    det_model = YOLO(det_model)
+    sam_model = SAM(sam_model)
+
+    data = Path(data)
+    if not output_dir:
+        output_dir = data.parent / f"{data.stem}_auto_annotate_labels"
+    Path(output_dir).mkdir(exist_ok=True, parents=True)
+
+    det_results = det_model(
+        data, stream=True, device=device, conf=conf, iou=iou, imgsz=imgsz, max_det=max_det, classes=classes
+    )
+
+    for result in det_results:
+        class_ids = result.boxes.cls.int().tolist()  # noqa
+        if len(class_ids):
+            boxes = result.boxes.xyxy  # Boxes object for bbox outputs
+            sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
+            segments = sam_results[0].masks.xyn  # noqa
+
+            with open(f"{Path(output_dir) / Path(result.path).stem}.txt", "w") as f:
+                for i in range(len(segments)):
+                    s = segments[i]
+                    if len(s) == 0:
+                        continue
+                    segment = map(str, segments[i].reshape(-1).tolist())
+                    f.write(f"{class_ids[i]} " + " ".join(segment) + "\n")
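
A hedged sketch of how the label files written above can be read back; the path follows the default output-directory naming in auto_annotate ("<data stem>_auto_annotate_labels"), and the specific file name is hypothetical:

    from pathlib import Path

    label_file = Path("ultralytics/assets_auto_annotate_labels/bus.txt")  # assumed default output location
    for line in label_file.read_text().splitlines():
        values = line.split()
        class_id = int(values[0])  # class ID written first on each line
        coords = [float(v) for v in values[1:]]  # normalized segmentation points (xyn)
        polygon = list(zip(coords[0::2], coords[1::2]))  # (x, y) pairs in [0, 1]
        print(class_id, len(polygon), "points")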

+ 2744 - 0
ultralytics/data/augment.py

@@ -0,0 +1,2744 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import math
+import random
+from copy import deepcopy
+from typing import Tuple, Union
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from ultralytics.data.utils import polygons2masks, polygons2masks_overlap
+from ultralytics.utils import LOGGER, colorstr
+from ultralytics.utils.checks import check_version
+from ultralytics.utils.instance import Instances
+from ultralytics.utils.metrics import bbox_ioa
+from ultralytics.utils.ops import segment2box, xyxyxyxy2xywhr
+from ultralytics.utils.torch_utils import TORCHVISION_0_10, TORCHVISION_0_11, TORCHVISION_0_13
+
+DEFAULT_MEAN = (0.0, 0.0, 0.0)
+DEFAULT_STD = (1.0, 1.0, 1.0)
+DEFAULT_CROP_FRACTION = 1.0
+
+
+class BaseTransform:
+    """
+    Base class for image transformations in the Ultralytics library.
+
+    This class serves as a foundation for implementing various image processing operations, designed to be
+    compatible with both classification and semantic segmentation tasks.
+
+    Methods:
+        apply_image: Applies image transformations to labels.
+        apply_instances: Applies transformations to object instances in labels.
+        apply_semantic: Applies semantic segmentation to an image.
+        __call__: Applies all label transformations to an image, instances, and semantic masks.
+
+    Examples:
+        >>> transform = BaseTransform()
+        >>> labels = {"image": np.array(...), "instances": [...], "semantic": np.array(...)}
+        >>> transformed_labels = transform(labels)
+    """
+
+    def __init__(self) -> None:
+        """
+        Initializes the BaseTransform object.
+
+        This constructor sets up the base transformation object, which can be extended for specific image
+        processing tasks. It is designed to be compatible with both classification and semantic segmentation.
+
+        Examples:
+            >>> transform = BaseTransform()
+        """
+        pass
+
+    def apply_image(self, labels):
+        """
+        Applies image transformations to labels.
+
+        This method is intended to be overridden by subclasses to implement specific image transformation
+        logic. In its base form, it returns the input labels unchanged.
+
+        Args:
+            labels (Any): The input labels to be transformed. The exact type and structure of labels may
+                vary depending on the specific implementation.
+
+        Returns:
+            (Any): The transformed labels. In the base implementation, this is identical to the input.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> original_labels = [1, 2, 3]
+            >>> transformed_labels = transform.apply_image(original_labels)
+            >>> print(transformed_labels)
+            [1, 2, 3]
+        """
+        return labels
+
+    def apply_instances(self, labels):
+        """
+        Applies transformations to object instances in labels.
+
+        This method is responsible for applying various transformations to object instances within the given
+        labels. It is designed to be overridden by subclasses to implement specific instance transformation
+        logic.
+
+        Args:
+            labels (Dict): A dictionary containing label information, including object instances.
+
+        Returns:
+            (Dict): The modified labels dictionary with transformed object instances.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> labels = {"instances": Instances(xyxy=torch.rand(5, 4), cls=torch.randint(0, 80, (5,)))}
+            >>> transformed_labels = transform.apply_instances(labels)
+        """
+        pass
+
+    def apply_semantic(self, labels):
+        """
+        Applies semantic segmentation transformations to an image.
+
+        This method is intended to be overridden by subclasses to implement specific semantic segmentation
+        transformations. In its base form, it does not perform any operations.
+
+        Args:
+            labels (Any): The input labels or semantic segmentation mask to be transformed.
+
+        Returns:
+            (Any): The transformed semantic segmentation mask or labels.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> semantic_mask = np.zeros((100, 100), dtype=np.uint8)
+            >>> transformed_mask = transform.apply_semantic(semantic_mask)
+        """
+        pass
+
+    def __call__(self, labels):
+        """
+        Applies all label transformations to an image, instances, and semantic masks.
+
+        This method orchestrates the application of various transformations defined in the BaseTransform class
+        to the input labels. It sequentially calls the apply_image and apply_instances methods to process the
+        image and object instances, respectively.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys include 'img' for
+                the image data, and 'instances' for object instances.
+
+        Returns:
+            (Dict): The input labels dictionary with transformed image and instances.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> labels = {"img": np.random.rand(640, 640, 3), "instances": []}
+            >>> transformed_labels = transform(labels)
+        """
+        self.apply_image(labels)
+        self.apply_instances(labels)
+        self.apply_semantic(labels)
+        return labels
+
+
+class Compose:
+    """
+    A class for composing multiple image transformations.
+
+    Attributes:
+        transforms (List[Callable]): A list of transformation functions to be applied sequentially.
+
+    Methods:
+        __call__: Applies a series of transformations to input data.
+        append: Appends a new transform to the existing list of transforms.
+        insert: Inserts a new transform at a specified index in the list of transforms.
+        __getitem__: Retrieves a specific transform or a set of transforms using indexing.
+        __setitem__: Sets a specific transform or a set of transforms using indexing.
+        tolist: Converts the list of transforms to a standard Python list.
+
+    Examples:
+        >>> transforms = [RandomFlip(), RandomPerspective(30)]
+        >>> compose = Compose(transforms)
+        >>> transformed_data = compose(data)
+        >>> compose.append(CenterCrop((224, 224)))
+        >>> compose.insert(0, RandomFlip())
+    """
+
+    def __init__(self, transforms):
+        """
+        Initializes the Compose object with a list of transforms.
+
+        Args:
+            transforms (List[Callable]): A list of callable transform objects to be applied sequentially.
+
+        Examples:
+            >>> from ultralytics.data.augment import Compose, RandomHSV, RandomFlip
+            >>> transforms = [RandomHSV(), RandomFlip()]
+            >>> compose = Compose(transforms)
+        """
+        self.transforms = transforms if isinstance(transforms, list) else [transforms]
+
+    def __call__(self, data):
+        """
+        Applies a series of transformations to input data. This method sequentially applies each transformation in the
+        Compose object's list of transforms to the input data.
+
+        Args:
+            data (Any): The input data to be transformed. This can be of any type, depending on the
+                transformations in the list.
+
+        Returns:
+            (Any): The transformed data after applying all transformations in sequence.
+
+        Examples:
+            >>> transforms = [Transform1(), Transform2(), Transform3()]
+            >>> compose = Compose(transforms)
+            >>> transformed_data = compose(input_data)
+        """
+        for t in self.transforms:
+            data = t(data)
+        return data
+
+    def append(self, transform):
+        """
+        Appends a new transform to the existing list of transforms.
+
+        Args:
+            transform (BaseTransform): The transformation to be added to the composition.
+
+        Examples:
+            >>> compose = Compose([RandomFlip(), RandomPerspective()])
+            >>> compose.append(RandomHSV())
+        """
+        self.transforms.append(transform)
+
+    def insert(self, index, transform):
+        """
+        Inserts a new transform at a specified index in the existing list of transforms.
+
+        Args:
+            index (int): The index at which to insert the new transform.
+            transform (BaseTransform): The transform object to be inserted.
+
+        Examples:
+            >>> compose = Compose([Transform1(), Transform2()])
+            >>> compose.insert(1, Transform3())
+            >>> len(compose.transforms)
+            3
+        """
+        self.transforms.insert(index, transform)
+
+    def __getitem__(self, index: Union[list, int]) -> "Compose":
+        """
+        Retrieves a specific transform or a set of transforms using indexing.
+
+        Args:
+            index (int | List[int]): Index or list of indices of the transforms to retrieve.
+
+        Returns:
+            (Compose): A new Compose object containing the selected transform(s).
+
+        Raises:
+            AssertionError: If the index is not of type int or list.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(10), RandomHSV(0.5, 0.5, 0.5)]
+            >>> compose = Compose(transforms)
+            >>> single_transform = compose[1]  # Returns a Compose object with only RandomPerspective
+            >>> multiple_transforms = compose[0:2]  # Returns a Compose object with RandomFlip and RandomPerspective
+        """
+        assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
+        index = [index] if isinstance(index, int) else index
+        return Compose([self.transforms[i] for i in index])
+
+    def __setitem__(self, index: Union[list, int], value: Union[list, int]) -> None:
+        """
+        Sets one or more transforms in the composition using indexing.
+
+        Args:
+            index (int | List[int]): Index or list of indices to set transforms at.
+            value (Any | List[Any]): Transform or list of transforms to set at the specified index(es).
+
+        Raises:
+            AssertionError: If index type is invalid, value type doesn't match index type, or index is out of range.
+
+        Examples:
+            >>> compose = Compose([Transform1(), Transform2(), Transform3()])
+            >>> compose[1] = NewTransform()  # Replace second transform
+            >>> compose[0:2] = [NewTransform1(), NewTransform2()]  # Replace first two transforms
+        """
+        assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
+        if isinstance(index, list):
+            assert isinstance(value, list), (
+                f"The indices should be the same type as values, but got {type(index)} and {type(value)}"
+            )
+        if isinstance(index, int):
+            index, value = [index], [value]
+        for i, v in zip(index, value):
+            assert i < len(self.transforms), f"list index {i} out of range {len(self.transforms)}."
+            self.transforms[i] = v
+
+    def tolist(self):
+        """
+        Converts the list of transforms to a standard Python list.
+
+        Returns:
+            (List): A list containing all the transform objects in the Compose instance.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(10), CenterCrop()]
+            >>> compose = Compose(transforms)
+            >>> transform_list = compose.tolist()
+            >>> print(len(transform_list))
+            3
+        """
+        return self.transforms
+
+    def __repr__(self):
+        """
+        Returns a string representation of the Compose object.
+
+        Returns:
+            (str): A string representation of the Compose object, including the list of transforms.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(degrees=10, translate=0.1, scale=0.1)]
+            >>> compose = Compose(transforms)
+            >>> print(compose)
+            Compose([
+                RandomFlip(),
+                RandomPerspective(degrees=10, translate=0.1, scale=0.1)
+            ])
+        """
+        return f"{self.__class__.__name__}({', '.join([f'{t}' for t in self.transforms])})"
+
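A minimal sketch of the Compose behaviour documented above, using two hypothetical stand-in callables instead of real transforms:

    from ultralytics.data.augment import Compose

    double = lambda x: x * 2
    increment = lambda x: x + 1

    pipeline = Compose([double, increment])
    print(pipeline(3))  # applied in order: (3 * 2) + 1 = 7
    print(pipeline[0](3))  # indexing returns a new Compose containing only `double` -> 6
    pipeline.append(lambda x: x - 1)
    print(len(pipeline.tolist()))  # 3 transforms after append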
+
+class BaseMixTransform:
+    """
+    Base class for mix transformations like MixUp and Mosaic.
+
+    This class provides a foundation for implementing mix transformations on datasets. It handles the
+    probability-based application of transforms and manages the mixing of multiple images and labels.
+
+    Attributes:
+        dataset (Any): The dataset object containing images and labels.
+        pre_transform (Callable | None): Optional transform to apply before mixing.
+        p (float): Probability of applying the mix transformation.
+
+    Methods:
+        __call__: Applies the mix transformation to the input labels.
+        _mix_transform: Abstract method to be implemented by subclasses for specific mix operations.
+        get_indexes: Abstract method to get indexes of images to be mixed.
+        _update_label_text: Updates label text for mixed images.
+
+    Examples:
+        >>> class CustomMixTransform(BaseMixTransform):
+        ...     def _mix_transform(self, labels):
+        ...         # Implement custom mix logic here
+        ...         return labels
+        ...
+        ...     def get_indexes(self):
+        ...         return [random.randint(0, len(self.dataset) - 1) for _ in range(3)]
+        >>> dataset = YourDataset()
+        >>> transform = CustomMixTransform(dataset, p=0.5)
+        >>> mixed_labels = transform(original_labels)
+    """
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """
+        Initializes the BaseMixTransform object for mix transformations like MixUp and Mosaic.
+
+        This class serves as a base for implementing mix transformations in image processing pipelines.
+
+        Args:
+            dataset (Any): The dataset object containing images and labels for mixing.
+            pre_transform (Callable | None): Optional transform to apply before mixing.
+            p (float): Probability of applying the mix transformation. Should be in the range [0.0, 1.0].
+
+        Examples:
+            >>> dataset = YOLODataset("path/to/data")
+            >>> pre_transform = Compose([RandomFlip(), RandomPerspective()])
+            >>> mix_transform = BaseMixTransform(dataset, pre_transform, p=0.5)
+        """
+        self.dataset = dataset
+        self.pre_transform = pre_transform
+        self.p = p
+
+    def __call__(self, labels):
+        """
+        Applies pre-processing transforms and mixup/mosaic transforms to labels data.
+
+        This method determines whether to apply the mix transform based on a probability factor. If applied, it
+        selects additional images, applies pre-transforms if specified, and then performs the mix transform.
+
+        Args:
+            labels (Dict): A dictionary containing label data for an image.
+
+        Returns:
+            (Dict): The transformed labels dictionary, which may include mixed data from other images.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset, pre_transform=None, p=0.5)
+            >>> result = transform({"image": img, "bboxes": boxes, "cls": classes})
+        """
+        if random.uniform(0, 1) > self.p:
+            return labels
+
+        # Get the indexes of one or more other images
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
+
+        # Get the image and label information to be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
+
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
+
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
+
+    def _mix_transform(self, labels):
+        """
+        Applies MixUp or Mosaic augmentation to the label dictionary.
+
+        This method should be implemented by subclasses to perform specific mix transformations like MixUp or
+        Mosaic. It modifies the input label dictionary in-place with the augmented data.
+
+        Args:
+            labels (Dict): A dictionary containing image and label data. Expected to have a 'mix_labels' key
+                with a list of additional image and label data for mixing.
+
+        Returns:
+            (Dict): The modified labels dictionary with augmented data after applying the mix transform.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset)
+            >>> labels = {"image": img, "bboxes": boxes, "mix_labels": [{"image": img2, "bboxes": boxes2}]}
+            >>> augmented_labels = transform._mix_transform(labels)
+        """
+        raise NotImplementedError
+
+    def get_indexes(self):
+        """
+        Gets a list of shuffled indexes for mosaic augmentation.
+
+        Returns:
+            (List[int]): A list of shuffled indexes from the dataset.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset)
+            >>> indexes = transform.get_indexes()
+            >>> print(indexes)  # [3, 18, 7, 2]
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def _update_label_text(labels):
+        """
+        Updates label text and class IDs for mixed labels in image augmentation.
+
+        This method processes the 'texts' and 'cls' fields of the input labels dictionary and any mixed labels,
+        creating a unified set of text labels and updating class IDs accordingly.
+
+        Args:
+            labels (Dict): A dictionary containing label information, including 'texts' and 'cls' fields,
+                and optionally a 'mix_labels' field with additional label dictionaries.
+
+        Returns:
+            (Dict): The updated labels dictionary with unified text labels and updated class IDs.
+
+        Examples:
+            >>> labels = {
+            ...     "texts": [["cat"], ["dog"]],
+            ...     "cls": torch.tensor([[0], [1]]),
+            ...     "mix_labels": [{"texts": [["bird"], ["fish"]], "cls": torch.tensor([[0], [1]])}],
+            ... }
+            >>> updated_labels = self._update_label_text(labels)
+            >>> print(updated_labels["texts"])
+            [['cat'], ['dog'], ['bird'], ['fish']]
+            >>> print(updated_labels["cls"])
+            tensor([[0],
+                    [1]])
+            >>> print(updated_labels["mix_labels"][0]["cls"])
+            tensor([[2],
+                    [3]])
+        """
+        if "texts" not in labels:
+            return labels
+
+        mix_texts = sum([labels["texts"]] + [x["texts"] for x in labels["mix_labels"]], [])
+        mix_texts = list({tuple(x) for x in mix_texts})
+        text2id = {text: i for i, text in enumerate(mix_texts)}
+
+        for label in [labels] + labels["mix_labels"]:
+            for i, cls in enumerate(label["cls"].squeeze(-1).tolist()):
+                text = label["texts"][int(cls)]
+                label["cls"][i] = text2id[tuple(text)]
+            label["texts"] = mix_texts
+        return labels
+
+
+class Mosaic(BaseMixTransform):
+    """
+    Mosaic augmentation for image datasets.
+
+    This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
+    The augmentation is applied to a dataset with a given probability.
+
+    Attributes:
+        dataset: The dataset on which the mosaic augmentation is applied.
+        imgsz (int): Image size (height and width) after mosaic pipeline of a single image.
+        p (float): Probability of applying the mosaic augmentation. Must be in the range 0-1.
+        n (int): The grid size, either 4 (for 2x2) or 9 (for 3x3).
+        border (Tuple[int, int]): Border size for width and height.
+
+    Methods:
+        get_indexes: Returns a list of random indexes from the dataset.
+        _mix_transform: Applies mosaic transformation to the input image and labels.
+        _mosaic3: Creates a 1x3 image mosaic.
+        _mosaic4: Creates a 2x2 image mosaic.
+        _mosaic9: Creates a 3x3 image mosaic.
+        _update_labels: Updates labels with padding.
+        _cat_labels: Concatenates labels and clips mosaic border instances.
+
+    Examples:
+        >>> from ultralytics.data.augment import Mosaic
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> mosaic_aug = Mosaic(dataset, imgsz=640, p=0.5, n=4)
+        >>> augmented_labels = mosaic_aug(original_labels)
+    """
+
+    def __init__(self, dataset, imgsz=640, p=1.0, n=4):
+        """
+        Initializes the Mosaic augmentation object.
+
+        This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
+        The augmentation is applied to a dataset with a given probability.
+
+        Args:
+            dataset (Any): The dataset on which the mosaic augmentation is applied.
+            imgsz (int): Image size (height and width) after mosaic pipeline of a single image.
+            p (float): Probability of applying the mosaic augmentation. Must be in the range 0-1.
+            n (int): The grid size, either 4 (for 2x2) or 9 (for 3x3).
+
+        Examples:
+            >>> from ultralytics.data.augment import Mosaic
+            >>> dataset = YourDataset(...)
+            >>> mosaic_aug = Mosaic(dataset, imgsz=640, p=0.5, n=4)
+        """
+        assert 0 <= p <= 1.0, f"The probability should be in range [0, 1], but got {p}."
+        assert n in {4, 9}, "grid must be equal to 4 or 9."
+        super().__init__(dataset=dataset, p=p)
+        self.imgsz = imgsz
+        self.border = (-imgsz // 2, -imgsz // 2)  # width, height
+        self.n = n
+
+    def get_indexes(self, buffer=True):
+        """
+        Returns a list of random indexes from the dataset for mosaic augmentation.
+
+        This method selects random image indexes either from a buffer or from the entire dataset, depending on
+        the 'buffer' parameter. It is used to choose images for creating mosaic augmentations.
+
+        Args:
+            buffer (bool): If True, selects images from the dataset buffer. If False, selects from the entire
+                dataset.
+
+        Returns:
+            (List[int]): A list of random image indexes. The length of the list is n-1, where n is the number
+                of images used in the mosaic (either 3 or 8, depending on whether n is 4 or 9).
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> indexes = mosaic.get_indexes()
+            >>> print(len(indexes))  # Output: 3
+        """
+        if buffer:  # select images from buffer
+            return random.choices(list(self.dataset.buffer), k=self.n - 1)
+        else:  # select any images
+            return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
+
+    def _mix_transform(self, labels):
+        """
+        Applies mosaic augmentation to the input image and labels.
+
+        This method combines multiple images (3, 4, or 9) into a single mosaic image based on the 'n' attribute.
+        It ensures that rectangular annotations are not present and that there are other images available for
+        mosaic augmentation.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys include:
+                - 'rect_shape': Should be None as rect and mosaic are mutually exclusive.
+                - 'mix_labels': A list of dictionaries containing data for other images to be used in the mosaic.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic-augmented image and updated annotations.
+
+        Raises:
+            AssertionError: If 'rect_shape' is not None or if 'mix_labels' is empty.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> augmented_data = mosaic._mix_transform(labels)
+        """
+        assert labels.get("rect_shape", None) is None, "rect and mosaic are mutually exclusive."
+        assert len(labels.get("mix_labels", [])), "There are no other images for mosaic augment."
+        return (
+            self._mosaic3(labels) if self.n == 3 else self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
+        )  # This code is modified for mosaic3 method.
+
+    def _mosaic3(self, labels):
+        """
+        Creates a 1x3 image mosaic by combining three images.
+
+        This method arranges three images in a horizontal layout, with the main image in the center and two
+        additional images on either side. It's part of the Mosaic augmentation technique used in object detection.
+
+        Args:
+            labels (Dict): A dictionary containing image and label information for the main (center) image.
+                Must include 'img' key with the image array, and 'mix_labels' key with a list of two
+                dictionaries containing information for the side images.
+
+        Returns:
+            (Dict): A dictionary with the mosaic image and updated labels. Keys include:
+                - 'img' (np.ndarray): The mosaic image array with shape (H, W, C).
+                - Other keys from the input labels, updated to reflect the new image dimensions.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=3)
+            >>> labels = {
+            ...     "img": np.random.rand(480, 640, 3),
+            ...     "mix_labels": [{"img": np.random.rand(480, 640, 3)} for _ in range(2)],
+            ... }
+            >>> result = mosaic._mosaic3(labels)
+            >>> print(result["img"].shape)
+            (640, 640, 3)
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        for i in range(3):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img3
+            if i == 0:  # center
+                img3 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 3 tiles
+                h0, w0 = h, w
+                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
+            elif i == 1:  # right
+                c = s + w0, s, s + w0 + w, s + h
+            elif i == 2:  # left
+                c = s - w, s + h0 - h, s, s + h0
+
+            padw, padh = c[:2]
+            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coordinates
+
+            img3[y1:y2, x1:x2] = img[y1 - padh :, x1 - padw :]  # img3[ymin:ymax, xmin:xmax]
+            # hp, wp = h, w  # height, width previous for next iteration
+
+            # Labels assuming imgsz*2 mosaic size
+            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+
+        final_labels["img"] = img3[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
+        return final_labels
+
+    def _mosaic4(self, labels):
+        """
+        Creates a 2x2 image mosaic from four input images.
+
+        This method combines four images into a single mosaic image by placing them in a 2x2 grid. It also
+        updates the corresponding labels for each image in the mosaic.
+
+        Args:
+            labels (Dict): A dictionary containing image data and labels for the base image (index 0) and three
+                additional images (indices 1-3) in the 'mix_labels' key.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic image and updated labels. The 'img' key contains the mosaic
+                image as a numpy array, and other keys contain the combined and adjusted labels for all four images.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> labels = {
+            ...     "img": np.random.rand(480, 640, 3),
+            ...     "mix_labels": [{"img": np.random.rand(480, 640, 3)} for _ in range(3)],
+            ... }
+            >>> result = mosaic._mosaic4(labels)
+            >>> assert result["img"].shape == (1280, 1280, 3)
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.border)  # mosaic center x, y
+        for i in range(4):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img4
+            if i == 0:  # top left
+                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
+                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
+            elif i == 1:  # top right
+                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
+                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+            elif i == 2:  # bottom left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+            elif i == 3:  # bottom right
+                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+
+            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
+            padw = x1a - x1b
+            padh = y1a - y1b
+
+            labels_patch = self._update_labels(labels_patch, padw, padh)
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+        final_labels["img"] = img4
+        return final_labels
+
+    def _mosaic9(self, labels):
+        """
+        Creates a 3x3 image mosaic from the input image and eight additional images.
+
+        This method combines nine images into a single mosaic image. The input image is placed at the center,
+        and eight additional images from the dataset are placed around it in a 3x3 grid pattern.
+
+        Args:
+            labels (Dict): A dictionary containing the input image and its associated labels. It should have
+                the following keys:
+                - 'img' (numpy.ndarray): The input image.
+                - 'resized_shape' (Tuple[int, int]): The shape of the resized image (height, width).
+                - 'mix_labels' (List[Dict]): A list of dictionaries containing information for the additional
+                  eight images, each with the same structure as the input labels.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic image and updated labels. It includes the following keys:
+                - 'img' (numpy.ndarray): The final mosaic image.
+                - Other keys from the input labels, updated to reflect the new mosaic arrangement.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=9)
+            >>> input_labels = dataset[0]
+            >>> mosaic_result = mosaic._mosaic9(input_labels)
+            >>> mosaic_image = mosaic_result["img"]
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        hp, wp = -1, -1  # height, width previous
+        for i in range(9):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img9
+            if i == 0:  # center
+                img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 9 tiles
+                h0, w0 = h, w
+                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
+            elif i == 1:  # top
+                c = s, s - h, s + w, s
+            elif i == 2:  # top right
+                c = s + wp, s - h, s + wp + w, s
+            elif i == 3:  # right
+                c = s + w0, s, s + w0 + w, s + h
+            elif i == 4:  # bottom right
+                c = s + w0, s + hp, s + w0 + w, s + hp + h
+            elif i == 5:  # bottom
+                c = s + w0 - w, s + h0, s + w0, s + h0 + h
+            elif i == 6:  # bottom left
+                c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h
+            elif i == 7:  # left
+                c = s - w, s + h0 - h, s, s + h0
+            elif i == 8:  # top left
+                c = s - w, s + h0 - hp - h, s, s + h0 - hp
+
+            padw, padh = c[:2]
+            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coordinates
+
+            # Image
+            img9[y1:y2, x1:x2] = img[y1 - padh :, x1 - padw :]  # img9[ymin:ymax, xmin:xmax]
+            hp, wp = h, w  # height, width previous for next iteration
+
+            # Labels assuming imgsz*2 mosaic size
+            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+
+        final_labels["img"] = img9[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
+        return final_labels
+
+    @staticmethod
+    def _update_labels(labels, padw, padh):
+        """
+        Updates label coordinates with padding values.
+
+        This method adjusts the bounding box coordinates of object instances in the labels by adding padding
+        values. It also denormalizes the coordinates if they were previously normalized.
+
+        Args:
+            labels (Dict): A dictionary containing image and instance information.
+            padw (int): Padding width to be added to the x-coordinates.
+            padh (int): Padding height to be added to the y-coordinates.
+
+        Returns:
+            (Dict): Updated labels dictionary with adjusted instance coordinates.
+
+        Examples:
+            >>> labels = {"img": np.zeros((100, 100, 3)), "instances": Instances(...)}
+            >>> padw, padh = 50, 50
+            >>> updated_labels = Mosaic._update_labels(labels, padw, padh)
+        """
+        nh, nw = labels["img"].shape[:2]
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(nw, nh)
+        labels["instances"].add_padding(padw, padh)
+        return labels
+
+    def _cat_labels(self, mosaic_labels):
+        """
+        Concatenates and processes labels for mosaic augmentation.
+
+        This method combines labels from multiple images used in mosaic augmentation, clips instances to the
+        mosaic border, and removes zero-area boxes.
+
+        Args:
+            mosaic_labels (List[Dict]): A list of label dictionaries for each image in the mosaic.
+
+        Returns:
+            (Dict): A dictionary containing concatenated and processed labels for the mosaic image, including:
+                - im_file (str): File path of the first image in the mosaic.
+                - ori_shape (Tuple[int, int]): Original shape of the first image.
+                - resized_shape (Tuple[int, int]): Shape of the mosaic image (imgsz * 2, imgsz * 2).
+                - cls (np.ndarray): Concatenated class labels.
+                - instances (Instances): Concatenated instance annotations.
+                - mosaic_border (Tuple[int, int]): Mosaic border size.
+                - texts (List[str], optional): Text labels if present in the original labels.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640)
+            >>> mosaic_labels = [{"cls": np.array([0, 1]), "instances": Instances(...)} for _ in range(4)]
+            >>> result = mosaic._cat_labels(mosaic_labels)
+            >>> print(result.keys())
+            dict_keys(['im_file', 'ori_shape', 'resized_shape', 'cls', 'instances', 'mosaic_border'])
+        """
+        if len(mosaic_labels) == 0:
+            return {}
+        cls = []
+        instances = []
+        imgsz = self.imgsz * 2  # mosaic imgsz
+        for labels in mosaic_labels:
+            cls.append(labels["cls"])
+            instances.append(labels["instances"])
+        # Final labels
+        final_labels = {
+            "im_file": mosaic_labels[0]["im_file"],
+            "ori_shape": mosaic_labels[0]["ori_shape"],
+            "resized_shape": (imgsz, imgsz),
+            "cls": np.concatenate(cls, 0),
+            "instances": Instances.concatenate(instances, axis=0),
+            "mosaic_border": self.border,
+        }
+        final_labels["instances"].clip(imgsz, imgsz)
+        good = final_labels["instances"].remove_zero_area_boxes()
+        final_labels["cls"] = final_labels["cls"][good]
+        if "texts" in mosaic_labels[0]:
+            final_labels["texts"] = mosaic_labels[0]["texts"]
+        return final_labels
+
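A hedged numeric sketch of the centre sampling and top-left placement arithmetic used in _mosaic4 above; no dataset is needed, and the 480x640 resized image shape is hypothetical:

    import random

    s = 640
    border = (-s // 2, -s // 2)  # as set in Mosaic.__init__
    yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in border)  # centre sampled from [320, 960]
    h, w = 480, 640  # hypothetical resized image shape
    # Top-left tile: clip the target region on the 1280x1280 canvas, then take the matching source crop
    x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
    x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
    padw, padh = x1a - x1b, y1a - y1b  # padding later added to the instance coordinates
    print((xc, yc), (x1a, y1a, x2a, y2a), (padw, padh))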
+
+class MixUp(BaseMixTransform):
+    """
+    Applies MixUp augmentation to image datasets.
+
+    This class implements the MixUp augmentation technique as described in the paper "mixup: Beyond Empirical Risk
+    Minimization" (https://arxiv.org/abs/1710.09412). MixUp combines two images and their labels using a random weight.
+
+    Attributes:
+        dataset (Any): The dataset to which MixUp augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before MixUp.
+        p (float): Probability of applying MixUp augmentation.
+
+    Methods:
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies MixUp augmentation to the input labels.
+
+    Examples:
+        >>> from ultralytics.data.augment import MixUp
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> mixup = MixUp(dataset, p=0.5)
+        >>> augmented_labels = mixup(original_labels)
+    """
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """
+        Initializes the MixUp augmentation object.
+
+        MixUp is an image augmentation technique that combines two images by taking a weighted sum of their pixel
+        values and labels. This implementation is designed for use with the Ultralytics YOLO framework.
+
+        Args:
+            dataset (Any): The dataset to which MixUp augmentation will be applied.
+            pre_transform (Callable | None): Optional transform to apply to images before MixUp.
+            p (float): Probability of applying MixUp augmentation to an image. Must be in the range [0, 1].
+
+        Examples:
+            >>> from ultralytics.data.dataset import YOLODataset
+            >>> dataset = YOLODataset("path/to/data.yaml")
+            >>> mixup = MixUp(dataset, pre_transform=None, p=0.5)
+        """
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+
+    def get_indexes(self):
+        """
+        Get a random index from the dataset.
+
+        This method returns a single random index from the dataset, which is used to select an image for MixUp
+        augmentation.
+
+        Returns:
+            (int): A random integer index within the range of the dataset length.
+
+        Examples:
+            >>> mixup = MixUp(dataset)
+            >>> index = mixup.get_indexes()
+            >>> print(index)
+            42
+        """
+        return random.randint(0, len(self.dataset) - 1)
+
+    def _mix_transform(self, labels):
+        """
+        Applies MixUp augmentation to the input labels.
+
+        This method implements the MixUp augmentation technique as described in the paper
+        "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412).
+
+        Args:
+            labels (Dict): A dictionary containing the original image and label information.
+
+        Returns:
+            (Dict): A dictionary containing the mixed-up image and combined label information.
+
+        Examples:
+            >>> mixer = MixUp(dataset)
+            >>> mixed_labels = mixer._mix_transform(labels)
+        """
+        r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
+        labels2 = labels["mix_labels"][0]
+        labels["img"] = (labels["img"] * r + labels2["img"] * (1 - r)).astype(np.uint8)
+        labels["instances"] = Instances.concatenate([labels["instances"], labels2["instances"]], axis=0)
+        labels["cls"] = np.concatenate([labels["cls"], labels2["cls"]], 0)
+        return labels
+
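A tiny numeric sketch of the MixUp blend performed in _mix_transform above, with two constant-valued images standing in for real samples:

    import numpy as np

    rng = np.random.default_rng(0)
    img1 = np.full((4, 4, 3), 200, dtype=np.uint8)
    img2 = np.full((4, 4, 3), 50, dtype=np.uint8)
    r = rng.beta(32.0, 32.0)  # Beta(32, 32) is concentrated near 0.5
    mixed = (img1 * r + img2 * (1 - r)).astype(np.uint8)
    print(round(r, 3), mixed[0, 0])  # blended pixel is roughly the average of the two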
+
+class RandomPerspective:
+    """
+    Implements random perspective and affine transformations on images and corresponding annotations.
+
+    This class applies random rotations, translations, scaling, shearing, and perspective transformations
+    to images and their associated bounding boxes, segments, and keypoints. It can be used as part of an
+    augmentation pipeline for object detection and instance segmentation tasks.
+
+    Attributes:
+        degrees (float): Maximum absolute degree range for random rotations.
+        translate (float): Maximum translation as a fraction of the image size.
+        scale (float): Scaling factor range, e.g., scale=0.1 means 0.9-1.1.
+        shear (float): Maximum shear angle in degrees.
+        perspective (float): Perspective distortion factor.
+        border (Tuple[int, int]): Mosaic border size as (x, y).
+        pre_transform (Callable | None): Optional transform to apply before the random perspective.
+
+    Methods:
+        affine_transform: Applies affine transformations to the input image.
+        apply_bboxes: Transforms bounding boxes using the affine matrix.
+        apply_segments: Transforms segments and generates new bounding boxes.
+        apply_keypoints: Transforms keypoints using the affine matrix.
+        __call__: Applies the random perspective transformation to images and annotations.
+        box_candidates: Filters transformed bounding boxes based on size and aspect ratio.
+
+    Examples:
+        >>> transform = RandomPerspective(degrees=10, translate=0.1, scale=0.1, shear=10)
+        >>> image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+        >>> labels = {"img": image, "cls": np.array([0, 1]), "instances": Instances(...)}
+        >>> result = transform(labels)
+        >>> transformed_image = result["img"]
+        >>> transformed_instances = result["instances"]
+    """
+
+    def __init__(
+        self, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, border=(0, 0), pre_transform=None
+    ):
+        """
+        Initializes RandomPerspective object with transformation parameters.
+
+        This class implements random perspective and affine transformations on images and corresponding bounding boxes,
+        segments, and keypoints. Transformations include rotation, translation, scaling, and shearing.
+
+        Args:
+            degrees (float): Degree range for random rotations.
+            translate (float): Fraction of total width and height for random translation.
+            scale (float): Scaling factor interval, e.g., a scale factor of 0.5 allows a resize between 50%-150%.
+            shear (float): Shear intensity (angle in degrees).
+            perspective (float): Perspective distortion factor.
+            border (Tuple[int, int]): Tuple specifying mosaic border (top/bottom, left/right).
+            pre_transform (Callable | None): Function/transform to apply to the image before starting the random
+                transformation.
+
+        Examples:
+            >>> transform = RandomPerspective(degrees=10.0, translate=0.1, scale=0.5, shear=5.0)
+            >>> result = transform(labels)  # Apply random perspective to labels
+        """
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.perspective = perspective
+        self.border = border  # mosaic border
+        self.pre_transform = pre_transform
+
+    def affine_transform(self, img, border):
+        """
+        Applies a sequence of affine transformations centered around the image center.
+
+        This function performs a series of geometric transformations on the input image, including
+        translation, perspective change, rotation, scaling, and shearing. The transformations are
+        applied in a specific order to maintain consistency.
+
+        Args:
+            img (np.ndarray): Input image to be transformed.
+            border (Tuple[int, int]): Border dimensions for the transformed image.
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray, float]): A tuple containing:
+                - np.ndarray: Transformed image.
+                - np.ndarray: 3x3 transformation matrix.
+                - float: Scale factor applied during the transformation.
+
+        Examples:
+            >>> import numpy as np
+            >>> img = np.random.rand(100, 100, 3)
+            >>> border = (10, 10)
+            >>> transformed_img, matrix, scale = affine_transform(img, border)
+        """
+        # Center
+        C = np.eye(3, dtype=np.float32)
+
+        C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
+        C[1, 2] = -img.shape[0] / 2  # y translation (pixels)
+
+        # Perspective
+        P = np.eye(3, dtype=np.float32)
+        P[2, 0] = random.uniform(-self.perspective, self.perspective)  # x perspective (about y)
+        P[2, 1] = random.uniform(-self.perspective, self.perspective)  # y perspective (about x)
+
+        # Rotation and Scale
+        R = np.eye(3, dtype=np.float32)
+        a = random.uniform(-self.degrees, self.degrees)
+        # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
+        s = random.uniform(1 - self.scale, 1 + self.scale)
+        # s = 2 ** random.uniform(-scale, scale)
+        R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+        # Shear
+        S = np.eye(3, dtype=np.float32)
+        S[0, 1] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # x shear (deg)
+        S[1, 0] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # y shear (deg)
+
+        # Translation
+        T = np.eye(3, dtype=np.float32)
+        T[0, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[0]  # x translation (pixels)
+        T[1, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[1]  # y translation (pixels)
+
+        # Combined rotation matrix
+        M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
+        # Affine image
+        if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+            if self.perspective:
+                img = cv2.warpPerspective(img, M, dsize=self.size, borderValue=(114, 114, 114))
+            else:  # affine
+                img = cv2.warpAffine(img, M[:2], dsize=self.size, borderValue=(114, 114, 114))
+        return img, M, s
+
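A hedged sketch of the matrix composition above with rotation, shear, and perspective switched off, so M reduces to a scale about the image centre followed by a translation (all numbers are hypothetical):

    import numpy as np

    C = np.eye(3); C[0, 2], C[1, 2] = -320, -240  # move origin to the image centre
    R = np.eye(3); R[:2, :2] *= 1.2  # scale only, no rotation
    T = np.eye(3); T[0, 2], T[1, 2] = 320, 240  # translate back towards the output centre
    M = T @ R @ C  # P and S are identity here, so they are omitted
    corner = M @ np.array([0.0, 0.0, 1.0])  # where the original top-left pixel lands
    print(corner[:2])  # [-64. -48.]: scaling about the centre pushes the corner outward
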
+    def apply_bboxes(self, bboxes, M):
+        """
+        Apply affine transformation to bounding boxes.
+
+        This function applies an affine transformation to a set of bounding boxes using the provided
+        transformation matrix.
+
+        Args:
+            bboxes (np.ndarray): Bounding boxes in xyxy format with shape (N, 4), where N is the number
+                of bounding boxes.
+            M (np.ndarray): Affine transformation matrix with shape (3, 3).
+
+        Returns:
+            (np.ndarray): Transformed bounding boxes in xyxy format with shape (N, 4).
+
+        Examples:
+            >>> transform = RandomPerspective()
+            >>> bboxes = np.array([[10.0, 10.0, 20.0, 20.0], [30.0, 30.0, 40.0, 40.0]])
+            >>> M = np.eye(3)
+            >>> transformed_bboxes = transform.apply_bboxes(bboxes, M)
+        """
+        n = len(bboxes)
+        if n == 0:
+            return bboxes
+
+        xy = np.ones((n * 4, 3), dtype=bboxes.dtype)
+        xy[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = xy @ M.T  # transform
+        xy = (xy[:, :2] / xy[:, 2:3] if self.perspective else xy[:, :2]).reshape(n, 8)  # perspective rescale or affine
+
+        # Create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
+
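A short sketch of the corner-based box warp in apply_bboxes above: each xyxy box is expanded to its four corners, transformed by M, and re-boxed via per-box min/max (a pure translation is shown; the numbers are hypothetical):

    import numpy as np

    boxes = np.array([[10.0, 10.0, 20.0, 20.0]])
    M = np.eye(3, dtype=np.float32)
    M[0, 2], M[1, 2] = 5, -3  # translate +5 px in x, -3 px in y
    corners = np.ones((4, 3), dtype=boxes.dtype)
    corners[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2)  # x1y1, x2y2, x1y2, x2y1
    warped = corners @ M.T
    x, y = warped[:, 0], warped[:, 1]
    print(x.min(), y.min(), x.max(), y.max())  # 15.0 7.0 25.0 17.0
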
+    def apply_segments(self, segments, M):
+        """
+        Apply affine transformations to segments and generate new bounding boxes.
+
+        This function applies affine transformations to input segments and generates new bounding boxes based on
+        the transformed segments. It clips the transformed segments to fit within the new bounding boxes.
+
+        Args:
+            segments (np.ndarray): Input segments with shape (N, M, 2), where N is the number of segments and M is the
+                number of points in each segment.
+            M (np.ndarray): Affine transformation matrix with shape (3, 3).
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray]): A tuple containing:
+                - New bounding boxes with shape (N, 4) in xyxy format.
+                - Transformed and clipped segments with shape (N, M, 2).
+
+        Examples:
+            >>> segments = np.random.rand(10, 500, 2)  # 10 segments with 500 points each
+            >>> M = np.eye(3)  # Identity transformation matrix
+            >>> new_bboxes, new_segments = apply_segments(segments, M)
+        """
+        n, num = segments.shape[:2]
+        if n == 0:
+            return [], segments
+
+        xy = np.ones((n * num, 3), dtype=segments.dtype)
+        segments = segments.reshape(-1, 2)
+        xy[:, :2] = segments
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]
+        segments = xy.reshape(n, -1, 2)
+        bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0)
+        segments[..., 0] = segments[..., 0].clip(bboxes[:, 0:1], bboxes[:, 2:3])
+        segments[..., 1] = segments[..., 1].clip(bboxes[:, 1:2], bboxes[:, 3:4])
+        return bboxes, segments
+
+    def apply_keypoints(self, keypoints, M):
+        """
+        Applies affine transformation to keypoints.
+
+        This method transforms the input keypoints using the provided affine transformation matrix. It handles
+        perspective rescaling if necessary and updates the visibility of keypoints that fall outside the image
+        boundaries after transformation.
+
+        Args:
+            keypoints (np.ndarray): Array of keypoints with shape (N, 17, 3), where N is the number of instances,
+                17 is the number of keypoints per instance, and 3 represents (x, y, visibility).
+            M (np.ndarray): 3x3 affine transformation matrix.
+
+        Returns:
+            (np.ndarray): Transformed keypoints array with the same shape as input (N, 17, 3).
+
+        Examples:
+            >>> random_perspective = RandomPerspective()
+            >>> keypoints = np.random.rand(5, 17, 3)  # 5 instances, 17 keypoints each
+            >>> M = np.eye(3)  # Identity transformation
+            >>> transformed_keypoints = random_perspective.apply_keypoints(keypoints, M)
+        """
+        n, nkpt = keypoints.shape[:2]
+        if n == 0:
+            return keypoints
+        xy = np.ones((n * nkpt, 3), dtype=keypoints.dtype)
+        visible = keypoints[..., 2].reshape(n * nkpt, 1)
+        xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2)
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]  # perspective rescale or affine
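+        # Mark keypoints that land outside the output canvas as not visible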
+        out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1])
+        visible[out_mask] = 0
+        return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
+
+    def __call__(self, labels):
+        """
+        Applies random perspective and affine transformations to an image and its associated labels.
+
+        This method performs a series of transformations including rotation, translation, scaling, shearing,
+        and perspective distortion on the input image and adjusts the corresponding bounding boxes, segments,
+        and keypoints accordingly.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations.
+                Must include:
+                    'img' (ndarray): The input image.
+                    'cls' (ndarray): Class labels.
+                    'instances' (Instances): Object instances with bounding boxes, segments, and keypoints.
+                May include:
+                    'mosaic_border' (Tuple[int, int]): Border size for mosaic augmentation.
+
+        Returns:
+            (Dict): Transformed labels dictionary containing:
+                - 'img' (np.ndarray): The transformed image.
+                - 'cls' (np.ndarray): Updated class labels.
+                - 'instances' (Instances): Updated object instances.
+                - 'resized_shape' (Tuple[int, int]): New image shape after transformation.
+
+        Examples:
+            >>> transform = RandomPerspective()
+            >>> image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+            >>> labels = {
+            ...     "img": image,
+            ...     "cls": np.array([0, 1, 2]),
+            ...     "instances": Instances(bboxes=np.array([[10, 10, 50, 50], [100, 100, 150, 150]])),
+            ... }
+            >>> result = transform(labels)
+            >>> assert result["img"].shape[:2] == result["resized_shape"]
+        """
+        if self.pre_transform and "mosaic_border" not in labels:
+            labels = self.pre_transform(labels)
+        labels.pop("ratio_pad", None)  # do not need ratio pad
+
+        img = labels["img"]
+        cls = labels["cls"]
+        instances = labels.pop("instances")
+        # Make sure the coord formats are right
+        instances.convert_bbox(format="xyxy")
+        instances.denormalize(*img.shape[:2][::-1])
+
+        border = labels.pop("mosaic_border", self.border)
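+        # mosaic_border values are negative, shrinking the output canvas from mosaic size back to the target size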
+        self.size = img.shape[1] + border[1] * 2, img.shape[0] + border[0] * 2  # w, h
+        # M is affine matrix
+        # Scale for func:`box_candidates`
+        img, M, scale = self.affine_transform(img, border)
+
+        bboxes = self.apply_bboxes(instances.bboxes, M)
+
+        segments = instances.segments
+        keypoints = instances.keypoints
+        # Update bboxes if there are segments.
+        if len(segments):
+            bboxes, segments = self.apply_segments(segments, M)
+
+        if keypoints is not None:
+            keypoints = self.apply_keypoints(keypoints, M)
+        new_instances = Instances(bboxes, segments, keypoints, bbox_format="xyxy", normalized=False)
+        # Clip
+        new_instances.clip(*self.size)
+
+        # Filter instances
+        instances.scale(scale_w=scale, scale_h=scale, bbox_only=True)
+        # Make the bboxes have the same scale as new_bboxes
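+        # Use a looser area threshold (0.01) when segments exist; polygon-derived boxes can shrink more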
+        i = self.box_candidates(
+            box1=instances.bboxes.T, box2=new_instances.bboxes.T, area_thr=0.01 if len(segments) else 0.10
+        )
+        labels["instances"] = new_instances[i]
+        labels["cls"] = cls[i]
+        labels["img"] = img
+        labels["resized_shape"] = img.shape[:2]
+        return labels
+
+    @staticmethod
+    def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):
+        """
+        Compute candidate boxes for further processing based on size and aspect ratio criteria.
+
+        This method compares boxes before and after augmentation to determine if they meet specified
+        thresholds for width, height, aspect ratio, and area. It's used to filter out boxes that have
+        been overly distorted or reduced by the augmentation process.
+
+        Args:
+            box1 (numpy.ndarray): Original boxes before augmentation, shape (4, N) where N is the
+                number of boxes. Format is [x1, y1, x2, y2] in absolute coordinates.
+            box2 (numpy.ndarray): Augmented boxes after transformation, shape (4, N). Format is
+                [x1, y1, x2, y2] in absolute coordinates.
+            wh_thr (float): Width and height threshold in pixels. Boxes smaller than this in either
+                dimension are rejected.
+            ar_thr (float): Aspect ratio threshold. Boxes with an aspect ratio greater than this
+                value are rejected.
+            area_thr (float): Area ratio threshold. Boxes with an area ratio (new/old) less than
+                this value are rejected.
+            eps (float): Small epsilon value to prevent division by zero.
+
+        Returns:
+            (numpy.ndarray): Boolean array of shape (N,) indicating which boxes are candidates.
+                True values correspond to boxes that meet all criteria.
+
+        Examples:
+            >>> random_perspective = RandomPerspective()
+            >>> box1 = np.array([[0, 0, 100, 100], [0, 0, 50, 50]]).T
+            >>> box2 = np.array([[10, 10, 90, 90], [5, 5, 45, 45]]).T
+            >>> candidates = random_perspective.box_candidates(box1, box2)
+            >>> print(candidates)
+            [ True  True]
+        """
+        w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+        w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+        ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
+        return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr)  # candidates
+
+
+class RandomHSV:
+    """
+    Randomly adjusts the Hue, Saturation, and Value (HSV) channels of an image.
+
+    This class applies random HSV augmentation to images within predefined limits set by hgain, sgain, and vgain.
+
+    Attributes:
+        hgain (float): Maximum variation for hue. Range is typically [0, 1].
+        sgain (float): Maximum variation for saturation. Range is typically [0, 1].
+        vgain (float): Maximum variation for value. Range is typically [0, 1].
+
+    Methods:
+        __call__: Applies random HSV augmentation to an image.
+
+    Examples:
+        >>> import numpy as np
+        >>> from ultralytics.data.augment import RandomHSV
+        >>> augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+        >>> image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+        >>> labels = {"img": image}
+        >>> augmented_labels = augmenter(labels)
+        >>> augmented_image = augmented_labels["img"]
+    """
+
+    def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
+        """
+        Initializes the RandomHSV object for random HSV (Hue, Saturation, Value) augmentation.
+
+        This class applies random adjustments to the HSV channels of an image within specified limits.
+
+        Args:
+            hgain (float): Maximum variation for hue. Should be in the range [0, 1].
+            sgain (float): Maximum variation for saturation. Should be in the range [0, 1].
+            vgain (float): Maximum variation for value. Should be in the range [0, 1].
+
+        Examples:
+            >>> hsv_aug = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+            >>> augmented_labels = hsv_aug({"img": image})
+        """
+        self.hgain = hgain
+        self.sgain = sgain
+        self.vgain = vgain
+
+    def __call__(self, labels):
+        """
+        Applies random HSV augmentation to an image within predefined limits.
+
+        This method modifies the input image by randomly adjusting its Hue, Saturation, and Value (HSV) channels.
+        The adjustments are made within the limits set by hgain, sgain, and vgain during initialization.
+
+        Args:
+            labels (Dict): A dictionary containing image data and metadata. Must include an 'img' key with
+                the image as a numpy array.
+
+        Returns:
+            (Dict): The input 'labels' dictionary with the 'img' key updated in-place to the HSV-augmented image.
+
+        Examples:
+            >>> hsv_augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+            >>> labels = {"img": np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)}
+            >>> hsv_augmenter(labels)
+            >>> augmented_img = labels["img"]
+        """
+        img = labels["img"]
+        if self.hgain or self.sgain or self.vgain:
+            r = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1  # random gains
+            hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+            dtype = img.dtype  # uint8
+
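+            # Build per-channel lookup tables; hue wraps modulo 180 since OpenCV uint8 hue lies in [0, 180)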
+            x = np.arange(0, 256, dtype=r.dtype)
+            lut_hue = ((x * r[0]) % 180).astype(dtype)
+            lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+            lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+            im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
+            cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed
+        return labels
+
+
+class RandomFlip:
+    """
+    Applies a random horizontal or vertical flip to an image with a given probability.
+
+    This class performs random image flipping and updates corresponding instance annotations such as
+    bounding boxes and keypoints.
+
+    Attributes:
+        p (float): Probability of applying the flip. Must be between 0 and 1.
+        direction (str): Direction of flip, either 'horizontal' or 'vertical'.
+        flip_idx (array-like): Index mapping for flipping keypoints, if applicable.
+
+    Methods:
+        __call__: Applies the random flip transformation to an image and its annotations.
+
+    Examples:
+        >>> transform = RandomFlip(p=0.5, direction="horizontal")
+        >>> result = transform({"img": image, "instances": instances})
+        >>> flipped_image = result["img"]
+        >>> flipped_instances = result["instances"]
+    """
+
+    def __init__(self, p=0.5, direction="horizontal", flip_idx=None) -> None:
+        """
+        Initializes the RandomFlip class with probability and direction.
+
+        This class applies a random horizontal or vertical flip to an image with a given probability.
+        It also updates any instances (bounding boxes, keypoints, etc.) accordingly.
+
+        Args:
+            p (float): The probability of applying the flip. Must be between 0 and 1.
+            direction (str): The direction to apply the flip. Must be 'horizontal' or 'vertical'.
+            flip_idx (List[int] | None): Index mapping for flipping keypoints, if any.
+
+        Raises:
+            AssertionError: If direction is not 'horizontal' or 'vertical', or if p is not between 0 and 1.
+
+        Examples:
+            >>> flip = RandomFlip(p=0.5, direction="horizontal")
+            >>> flip_with_idx = RandomFlip(p=0.7, direction="vertical", flip_idx=[1, 0, 3, 2, 5, 4])
+        """
+        assert direction in {"horizontal", "vertical"}, f"direction must be 'horizontal' or 'vertical', got {direction}"
+        assert 0 <= p <= 1.0, f"The probability should be in range [0, 1], but got {p}."
+
+        self.p = p
+        self.direction = direction
+        self.flip_idx = flip_idx
+
+    def __call__(self, labels):
+        """
+        Applies random flip to an image and updates any instances like bounding boxes or keypoints accordingly.
+
+        This method randomly flips the input image either horizontally or vertically based on the initialized
+        probability and direction. It also updates the corresponding instances (bounding boxes, keypoints) to
+        match the flipped image.
+
+        Args:
+            labels (Dict): A dictionary containing the following keys:
+                'img' (numpy.ndarray): The image to be flipped.
+                'instances' (ultralytics.utils.instance.Instances): An object containing bounding boxes and
+                    optionally keypoints.
+
+        Returns:
+            (Dict): The same dictionary with the flipped image and updated instances:
+                'img' (numpy.ndarray): The flipped image.
+                'instances' (ultralytics.utils.instance.Instances): Updated instances matching the flipped image.
+
+        Examples:
+            >>> labels = {"img": np.random.rand(640, 640, 3), "instances": Instances(...)}
+            >>> random_flip = RandomFlip(p=0.5, direction="horizontal")
+            >>> flipped_labels = random_flip(labels)
+        """
+        img = labels["img"]
+        instances = labels.pop("instances")
+        instances.convert_bbox(format="xywh")
+        h, w = img.shape[:2]
+        h = 1 if instances.normalized else h
+        w = 1 if instances.normalized else w
+
+        # Flip up-down
+        if self.direction == "vertical" and random.random() < self.p:
+            img = np.flipud(img)
+            instances.flipud(h)
+        if self.direction == "horizontal" and random.random() < self.p:
+            img = np.fliplr(img)
+            instances.fliplr(w)
+            # For keypoints
+            if self.flip_idx is not None and instances.keypoints is not None:
+                instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :])
+        labels["img"] = np.ascontiguousarray(img)
+        labels["instances"] = instances
+        return labels
+
+
+class LetterBox:
+    """
+    Resize image and pad for detection, instance segmentation, and pose estimation.
+
+    This class resizes and pads images to a specified shape while preserving aspect ratio. It also updates
+    corresponding labels and bounding boxes.
+
+    Attributes:
+        new_shape (tuple): Target shape (height, width) for resizing.
+        auto (bool): Whether to use minimum rectangle.
+        scaleFill (bool): Whether to stretch the image to new_shape.
+        scaleup (bool): Whether to allow scaling up. If False, only scale down.
+        stride (int): Stride for rounding padding.
+        center (bool): Whether to center the image or align to top-left.
+
+    Methods:
+        __call__: Resize and pad image, update labels and bounding boxes.
+
+    Examples:
+        >>> transform = LetterBox(new_shape=(640, 640))
+        >>> result = transform(labels)
+        >>> resized_img = result["img"]
+        >>> updated_instances = result["instances"]
+    """
+
+    def __init__(self, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, center=True, stride=32):
+        """
+        Initialize LetterBox object for resizing and padding images.
+
+        This class is designed to resize and pad images for object detection, instance segmentation, and pose estimation
+        tasks. It supports various resizing modes including auto-sizing, scale-fill, and letterboxing.
+
+        Args:
+            new_shape (Tuple[int, int]): Target size (height, width) for the resized image.
+            auto (bool): If True, use minimum rectangle to resize. If False, use new_shape directly.
+            scaleFill (bool): If True, stretch the image to new_shape without padding.
+            scaleup (bool): If True, allow scaling up. If False, only scale down.
+            center (bool): If True, center the placed image. If False, place image in top-left corner.
+            stride (int): Stride of the model (e.g., 32 for YOLOv5).
+
+        Attributes:
+            new_shape (Tuple[int, int]): Target size for the resized image.
+            auto (bool): Flag for using minimum rectangle resizing.
+            scaleFill (bool): Flag for stretching image without padding.
+            scaleup (bool): Flag for allowing upscaling.
+            stride (int): Stride value for ensuring image size is divisible by stride.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, stride=32)
+            >>> resized_img = letterbox(image=original_img)
+        """
+        self.new_shape = new_shape
+        self.auto = auto
+        self.scaleFill = scaleFill
+        self.scaleup = scaleup
+        self.stride = stride
+        self.center = center  # Put the image in the middle or top-left
+
+    def __call__(self, labels=None, image=None):
+        """
+        Resizes and pads an image for object detection, instance segmentation, or pose estimation tasks.
+
+        This method applies letterboxing to the input image, which involves resizing the image while maintaining its
+        aspect ratio and adding padding to fit the new shape. It also updates any associated labels accordingly.
+
+        Args:
+            labels (Dict | None): A dictionary containing image data and associated labels, or empty dict if None.
+            image (np.ndarray | None): The input image as a numpy array. If None, the image is taken from 'labels'.
+
+        Returns:
+            (Dict | np.ndarray): If 'labels' is provided, returns an updated dictionary with the resized and padded
+                image, updated labels, and additional metadata. If 'labels' is empty or None, returns only the resized
+                and padded image as a numpy array.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640))
+            >>> result = letterbox(labels={"img": np.zeros((480, 640, 3)), "instances": Instances(...)})
+            >>> resized_img = result["img"]
+            >>> updated_instances = result["instances"]
+        """
+        if labels is None:
+            labels = {}
+        img = labels.get("img") if image is None else image
+        shape = img.shape[:2]  # current shape [height, width]
+        new_shape = labels.pop("rect_shape", self.new_shape)
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+
+        # Scale ratio (new / old)
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+        if not self.scaleup:  # only scale down, do not scale up (for better val mAP)
+            r = min(r, 1.0)
+
+        # Compute padding
+        ratio = r, r  # width, height ratios
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+        if self.auto:  # minimum rectangle
+            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)  # wh padding
+        elif self.scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (new_shape[1], new_shape[0])
+            ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
+
+        if self.center:
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+
+        if shape[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
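+        # The -0.1/+0.1 offsets split half-pixel padding so the two sides sum to the full dw/dh padding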
+        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+        )  # add border
+        if labels.get("ratio_pad"):
+            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation
+
+        if len(labels):
+            labels = self._update_labels(labels, ratio, left, top)
+            labels["img"] = img
+            labels["resized_shape"] = new_shape
+            return labels
+        else:
+            return img
+
+    @staticmethod
+    def _update_labels(labels, ratio, padw, padh):
+        """
+        Updates labels after applying letterboxing to an image.
+
+        This method modifies the bounding box coordinates of instances in the labels
+        to account for resizing and padding applied during letterboxing.
+
+        Args:
+            labels (Dict): A dictionary containing image labels and instances.
+            ratio (Tuple[float, float]): Scaling ratios (width, height) applied to the image.
+            padw (float): Padding width added to the image.
+            padh (float): Padding height added to the image.
+
+        Returns:
+            (Dict): Updated labels dictionary with modified instance coordinates.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640))
+            >>> labels = {"instances": Instances(...)}
+            >>> ratio = (0.5, 0.5)
+            >>> padw, padh = 10, 20
+            >>> updated_labels = letterbox._update_labels(labels, ratio, padw, padh)
+        """
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])
+        labels["instances"].scale(*ratio)
+        labels["instances"].add_padding(padw, padh)
+        return labels
+
+
+class CopyPaste(BaseMixTransform):
+    """
+    CopyPaste class for applying Copy-Paste augmentation to image datasets.
+
+    This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
+    Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
+    different images to create new training samples.
+
+    Attributes:
+        dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
+        p (float): Probability of applying Copy-Paste augmentation.
+
+    Methods:
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies Copy-Paste augmentation to the input labels.
+        __call__: Applies the Copy-Paste transformation to images and annotations.
+
+    Examples:
+        >>> from ultralytics.data.augment import CopyPaste
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> copypaste = CopyPaste(dataset, p=0.5)
+        >>> augmented_labels = copypaste(original_labels)
+    """
+
+    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+        """Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+        assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
+        self.mode = mode
+
+    def get_indexes(self):
+        """Returns a list of random indexes from the dataset for CopyPaste augmentation."""
+        return random.randint(0, len(self.dataset) - 1)
+
+    def _mix_transform(self, labels):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        labels2 = labels["mix_labels"][0]
+        return self._transform(labels, labels2)
+
+    def __call__(self, labels):
+        """Applies Copy-Paste augmentation to an image and its labels."""
+        if len(labels["instances"].segments) == 0 or self.p == 0:
+            return labels
+        if self.mode == "flip":
+            return self._transform(labels)
+
+        # Get index of one other image to mix objects from
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
+
+        # Get images information will be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
+
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
+
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
+
+    def _transform(self, labels1, labels2={}):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        im = labels1["img"]
+        cls = labels1["cls"]
+        h, w = im.shape[:2]
+        instances = labels1.pop("instances")
+        instances.convert_bbox(format="xyxy")
+        instances.denormalize(w, h)
+
+        im_new = np.zeros(im.shape, np.uint8)
+        instances2 = labels2.pop("instances", None)
+        if instances2 is None:
+            instances2 = deepcopy(instances)
+            instances2.fliplr(w)
+        ioa = bbox_ioa(instances2.bboxes, instances.bboxes)  # intersection over area, (N, M)
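+        # Keep only source objects whose overlap with every existing object is below 30% of their own area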
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        sorted_idx = np.argsort(ioa.max(1)[indexes])
+        indexes = indexes[sorted_idx]
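+        # Paste up to p * n least-overlapping objects: append their cls/instances and draw their masks into im_new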
+        for j in indexes[: round(self.p * n)]:
+            cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
+            instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
+            cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+        result = labels2.get("img", cv2.flip(im, 1))  # source image to paste from (flipped im in 'flip' mode)
+        i = im_new.astype(bool)
+        im[i] = result[i]
+
+        labels1["img"] = im
+        labels1["cls"] = cls
+        labels1["instances"] = instances
+        return labels1
+
+
+class Albumentations:
+    """
+    Albumentations transformations for image augmentation.
+
+    This class applies various image transformations using the Albumentations library. It includes operations such as
+    Blur, Median Blur, conversion to grayscale, Contrast Limited Adaptive Histogram Equalization (CLAHE), random changes
+    in brightness and contrast, RandomGamma, and image quality reduction through compression.
+
+    Attributes:
+        p (float): Probability of applying the transformations.
+        transform (albumentations.Compose): Composed Albumentations transforms.
+        contains_spatial (bool): Indicates if the transforms include spatial operations.
+
+    Methods:
+        __call__: Applies the Albumentations transformations to the input labels.
+
+    Examples:
+        >>> transform = Albumentations(p=0.5)
+        >>> augmented_labels = transform(labels)
+
+    Notes:
+        - The Albumentations package must be installed to use this class.
+        - If the package is not installed or an error occurs during initialization, the transform will be set to None.
+        - Spatial transforms are handled differently and require special processing for bounding boxes.
+    """
+
+    def __init__(self, p=1.0):
+        """
+        Initialize the Albumentations transform object for YOLO bbox formatted parameters.
+
+        This class applies various image augmentations using the Albumentations library, including Blur, Median Blur,
+        conversion to grayscale, Contrast Limited Adaptive Histogram Equalization, random changes of brightness and
+        contrast, RandomGamma, and image quality reduction through compression.
+
+        Args:
+            p (float): Probability of applying the augmentations. Must be between 0 and 1.
+
+        Attributes:
+            p (float): Probability of applying the augmentations.
+            transform (albumentations.Compose): Composed Albumentations transforms.
+            contains_spatial (bool): Indicates if the transforms include spatial transformations.
+
+        Raises:
+            ImportError: If the Albumentations package is not installed.
+            Exception: For any other errors during initialization.
+
+        Examples:
+            >>> transform = Albumentations(p=0.5)
+            >>> augmented = transform(labels)  # labels dict with 'img', 'cls' and 'instances' keys
+            >>> augmented_image = augmented["img"]
+            >>> augmented_bboxes = augmented["instances"].bboxes
+
+        Notes:
+            - Requires Albumentations version 1.0.3 or higher.
+            - Spatial transforms are handled differently to ensure bbox compatibility.
+            - Some transforms are applied with very low probability (0.01) by default.
+        """
+        self.p = p
+        self.transform = None
+        prefix = colorstr("albumentations: ")
+
+        try:
+            import albumentations as A
+
+            check_version(A.__version__, "1.0.3", hard=True)  # version requirement
+
+            # List of possible spatial transforms
+            spatial_transforms = {
+                "Affine",
+                "BBoxSafeRandomCrop",
+                "CenterCrop",
+                "CoarseDropout",
+                "Crop",
+                "CropAndPad",
+                "CropNonEmptyMaskIfExists",
+                "D4",
+                "ElasticTransform",
+                "Flip",
+                "GridDistortion",
+                "GridDropout",
+                "HorizontalFlip",
+                "Lambda",
+                "LongestMaxSize",
+                "MaskDropout",
+                "MixUp",
+                "Morphological",
+                "NoOp",
+                "OpticalDistortion",
+                "PadIfNeeded",
+                "Perspective",
+                "PiecewiseAffine",
+                "PixelDropout",
+                "RandomCrop",
+                "RandomCropFromBorders",
+                "RandomGridShuffle",
+                "RandomResizedCrop",
+                "RandomRotate90",
+                "RandomScale",
+                "RandomSizedBBoxSafeCrop",
+                "RandomSizedCrop",
+                "Resize",
+                "Rotate",
+                "SafeRotate",
+                "ShiftScaleRotate",
+                "SmallestMaxSize",
+                "Transpose",
+                "VerticalFlip",
+                "XYMasking",
+            }  # from https://albumentations.ai/docs/getting_started/transforms_and_targets/#spatial-level-transforms
+
+            # Transforms
+            T = [
+                A.Blur(p=0.01),
+                A.MedianBlur(p=0.01),
+                A.ToGray(p=0.01),
+                A.CLAHE(p=0.01),
+                A.RandomBrightnessContrast(p=0.0),
+                A.RandomGamma(p=0.0),
+                A.ImageCompression(quality_range=(75, 100), p=0.5),
+            ]
+
+            # Compose transforms
+            self.contains_spatial = any(transform.__class__.__name__ in spatial_transforms for transform in T)
+            self.transform = (
+                A.Compose(T, bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"]))
+                if self.contains_spatial
+                else A.Compose(T)
+            )
+            if hasattr(self.transform, "set_random_seed"):
+                # Required for deterministic transforms in albumentations>=1.4.21
+                self.transform.set_random_seed(torch.initial_seed())
+            LOGGER.info(prefix + ", ".join(f"{x}".replace("always_apply=False, ", "") for x in T if x.p))
+        except ImportError:  # package not installed, skip
+            pass
+        except Exception as e:
+            LOGGER.info(f"{prefix}{e}")
+
+    def __call__(self, labels):
+        """
+        Applies Albumentations transformations to input labels.
+
+        This method applies a series of image augmentations using the Albumentations library. It can perform both
+        spatial and non-spatial transformations on the input image and its corresponding labels.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys are:
+                - 'img': numpy.ndarray representing the image
+                - 'cls': numpy.ndarray of class labels
+                - 'instances': object containing bounding boxes and other instance information
+
+        Returns:
+            (Dict): The input dictionary with augmented image and updated annotations.
+
+        Examples:
+            >>> transform = Albumentations(p=0.5)
+            >>> labels = {
+            ...     "img": np.random.rand(640, 640, 3),
+            ...     "cls": np.array([0, 1]),
+            ...     "instances": Instances(bboxes=np.array([[0, 0, 1, 1], [0.5, 0.5, 0.8, 0.8]])),
+            ... }
+            >>> augmented = transform(labels)
+            >>> assert augmented["img"].shape == (640, 640, 3)
+
+        Notes:
+            - The method applies transformations with probability self.p.
+            - Spatial transforms update bounding boxes, while non-spatial transforms only modify the image.
+            - Requires the Albumentations library to be installed.
+        """
+        if self.transform is None or random.random() > self.p:
+            return labels
+
+        if self.contains_spatial:
+            cls = labels["cls"]
+            if len(cls):
+                im = labels["img"]
+                labels["instances"].convert_bbox("xywh")
+                labels["instances"].normalize(*im.shape[:2][::-1])
+                bboxes = labels["instances"].bboxes
+                # TODO: add supports of segments and keypoints
+                new = self.transform(image=im, bboxes=bboxes, class_labels=cls)  # transformed
+                if len(new["class_labels"]) > 0:  # skip update if no bbox in new im
+                    labels["img"] = new["image"]
+                    labels["cls"] = np.array(new["class_labels"])
+                    bboxes = np.array(new["bboxes"], dtype=np.float32)
+                labels["instances"].update(bboxes=bboxes)
+        else:
+            labels["img"] = self.transform(image=labels["img"])["image"]  # transformed
+
+        return labels
+
+
+class Format:
+    """
+    A class for formatting image annotations for object detection, instance segmentation, and pose estimation tasks.
+
+    This class standardizes image and instance annotations to be used by the `collate_fn` in PyTorch DataLoader.
+
+    Attributes:
+        bbox_format (str): Format for bounding boxes. Options are 'xywh' or 'xyxy'.
+        normalize (bool): Whether to normalize bounding boxes.
+        return_mask (bool): Whether to return instance masks for segmentation.
+        return_keypoint (bool): Whether to return keypoints for pose estimation.
+        return_obb (bool): Whether to return oriented bounding boxes.
+        mask_ratio (int): Downsample ratio for masks.
+        mask_overlap (bool): Whether to overlap masks.
+        batch_idx (bool): Whether to keep batch indexes.
+        bgr (float): The probability to return BGR images.
+
+    Methods:
+        __call__: Formats labels dictionary with image, classes, bounding boxes, and optionally masks and keypoints.
+        _format_img: Converts image from Numpy array to PyTorch tensor.
+        _format_segments: Converts polygon points to bitmap masks.
+
+    Examples:
+        >>> formatter = Format(bbox_format="xywh", normalize=True, return_mask=True)
+        >>> formatted_labels = formatter(labels)
+        >>> img = formatted_labels["img"]
+        >>> bboxes = formatted_labels["bboxes"]
+        >>> masks = formatted_labels["masks"]
+    """
+
+    def __init__(
+        self,
+        bbox_format="xywh",
+        normalize=True,
+        return_mask=False,
+        return_keypoint=False,
+        return_obb=False,
+        mask_ratio=4,
+        mask_overlap=True,
+        batch_idx=True,
+        bgr=0.0,
+    ):
+        """
+        Initializes the Format class with given parameters for image and instance annotation formatting.
+
+        This class standardizes image and instance annotations for object detection, instance segmentation, and pose
+        estimation tasks, preparing them for use in PyTorch DataLoader's `collate_fn`.
+
+        Args:
+            bbox_format (str): Format for bounding boxes. Options are 'xywh', 'xyxy', etc.
+            normalize (bool): Whether to normalize bounding boxes to [0,1].
+            return_mask (bool): If True, returns instance masks for segmentation tasks.
+            return_keypoint (bool): If True, returns keypoints for pose estimation tasks.
+            return_obb (bool): If True, returns oriented bounding boxes.
+            mask_ratio (int): Downsample ratio for masks.
+            mask_overlap (bool): If True, allows mask overlap.
+            batch_idx (bool): If True, keeps batch indexes.
+            bgr (float): Probability of returning BGR images instead of RGB.
+
+        Attributes:
+            bbox_format (str): Format for bounding boxes.
+            normalize (bool): Whether bounding boxes are normalized.
+            return_mask (bool): Whether to return instance masks.
+            return_keypoint (bool): Whether to return keypoints.
+            return_obb (bool): Whether to return oriented bounding boxes.
+            mask_ratio (int): Downsample ratio for masks.
+            mask_overlap (bool): Whether masks can overlap.
+            batch_idx (bool): Whether to keep batch indexes.
+            bgr (float): The probability to return BGR images.
+
+        Examples:
+            >>> format = Format(bbox_format="xyxy", return_mask=True, return_keypoint=False)
+            >>> print(format.bbox_format)
+            xyxy
+        """
+        self.bbox_format = bbox_format
+        self.normalize = normalize
+        self.return_mask = return_mask  # set False when training detection only
+        self.return_keypoint = return_keypoint
+        self.return_obb = return_obb
+        self.mask_ratio = mask_ratio
+        self.mask_overlap = mask_overlap
+        self.batch_idx = batch_idx  # keep the batch indexes
+        self.bgr = bgr
+
+    def __call__(self, labels):
+        """
+        Formats image annotations for object detection, instance segmentation, and pose estimation tasks.
+
+        This method standardizes the image and instance annotations to be used by the `collate_fn` in PyTorch
+        DataLoader. It processes the input labels dictionary, converting annotations to the specified format and
+        applying normalization if required.
+
+        Args:
+            labels (Dict): A dictionary containing image and annotation data with the following keys:
+                - 'img': The input image as a numpy array.
+                - 'cls': Class labels for instances.
+                - 'instances': An Instances object containing bounding boxes, segments, and keypoints.
+
+        Returns:
+            (Dict): A dictionary with formatted data, including:
+                - 'img': Formatted image tensor.
+                - 'cls': Class label's tensor.
+                - 'bboxes': Bounding boxes tensor in the specified format.
+                - 'masks': Instance masks tensor (if return_mask is True).
+                - 'keypoints': Keypoints tensor (if return_keypoint is True).
+                - 'batch_idx': Batch index tensor (if batch_idx is True).
+
+        Examples:
+            >>> formatter = Format(bbox_format="xywh", normalize=True, return_mask=True)
+            >>> labels = {"img": np.random.rand(640, 640, 3), "cls": np.array([0, 1]), "instances": Instances(...)}
+            >>> formatted_labels = formatter(labels)
+            >>> print(formatted_labels.keys())
+        """
+        img = labels.pop("img")
+        h, w = img.shape[:2]
+        cls = labels.pop("cls")
+        instances = labels.pop("instances")
+        instances.convert_bbox(format=self.bbox_format)
+        instances.denormalize(w, h)
+        nl = len(instances)
+
+        if self.return_mask:
+            if nl:
+                masks, instances, cls = self._format_segments(instances, cls, w, h)
+                masks = torch.from_numpy(masks)
+            else:
+                masks = torch.zeros(
+                    1 if self.mask_overlap else nl, img.shape[0] // self.mask_ratio, img.shape[1] // self.mask_ratio
+                )
+            labels["masks"] = masks
+        labels["img"] = self._format_img(img)
+        labels["cls"] = torch.from_numpy(cls) if nl else torch.zeros(nl)
+        labels["bboxes"] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
+        if self.return_keypoint:
+            labels["keypoints"] = torch.from_numpy(instances.keypoints)
+            if self.normalize:
+                labels["keypoints"][..., 0] /= w
+                labels["keypoints"][..., 1] /= h
+        if self.return_obb:
+            labels["bboxes"] = (
+                xyxyxyxy2xywhr(torch.from_numpy(instances.segments)) if len(instances.segments) else torch.zeros((0, 5))
+            )
+        # NOTE: need to normalize obb in xywhr format for width-height consistency
+        if self.normalize:
+            labels["bboxes"][:, [0, 2]] /= w
+            labels["bboxes"][:, [1, 3]] /= h
+        # Then we can use collate_fn
+        if self.batch_idx:
+            labels["batch_idx"] = torch.zeros(nl)
+        return labels
+
+    def _format_img(self, img):
+        """
+        Formats an image for YOLO from a Numpy array to a PyTorch tensor.
+
+        This function performs the following operations:
+        1. Ensures the image has 3 dimensions (adds a channel dimension if needed).
+        2. Transposes the image from HWC to CHW format.
+        3. Optionally flips the color channels from RGB to BGR.
+        4. Converts the image to a contiguous array.
+        5. Converts the Numpy array to a PyTorch tensor.
+
+        Args:
+            img (np.ndarray): Input image as a Numpy array with shape (H, W, C) or (H, W).
+
+        Returns:
+            (torch.Tensor): Formatted image as a PyTorch tensor with shape (C, H, W).
+
+        Examples:
+            >>> import numpy as np
+            >>> img = np.random.rand(100, 100, 3)
+            >>> formatted_img = self._format_img(img)
+            >>> print(formatted_img.shape)
+            torch.Size([3, 100, 100])
+        """
+        if len(img.shape) < 3:
+            img = np.expand_dims(img, -1)
+        img = img.transpose(2, 0, 1)
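+        # Reverse channel order (e.g. BGR -> RGB) unless the random draw falls below self.bgr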
+        img = np.ascontiguousarray(img[::-1] if random.uniform(0, 1) > self.bgr else img)
+        img = torch.from_numpy(img)
+        return img
+
+    def _format_segments(self, instances, cls, w, h):
+        """
+        Converts polygon segments to bitmap masks.
+
+        Args:
+            instances (Instances): Object containing segment information.
+            cls (numpy.ndarray): Class labels for each instance.
+            w (int): Width of the image.
+            h (int): Height of the image.
+
+        Returns:
+            masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
+            instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
+            cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
+
+        Notes:
+            - If self.mask_overlap is True, masks are overlapped and sorted by area.
+            - If self.mask_overlap is False, each mask is represented separately.
+            - Masks are downsampled according to self.mask_ratio.
+        """
+        segments = instances.segments
+        if self.mask_overlap:
+            masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio)
+            masks = masks[None]  # (640, 640) -> (1, 640, 640)
+            instances = instances[sorted_idx]
+            cls = cls[sorted_idx]
+        else:
+            masks = polygons2masks((h, w), segments, color=1, downsample_ratio=self.mask_ratio)
+
+        return masks, instances, cls
+
+
+class RandomLoadText:
+    """
+    Randomly samples positive and negative texts and updates class indices accordingly.
+
+    This class is responsible for sampling texts from a given set of class texts, including both positive
+    (present in the image) and negative (not present in the image) samples. It updates the class indices
+    to reflect the sampled texts and can optionally pad the text list to a fixed length.
+
+    Attributes:
+        prompt_format (str): Format string for text prompts.
+        neg_samples (Tuple[int, int]): Range for randomly sampling negative texts.
+        max_samples (int): Maximum number of different text samples in one image.
+        padding (bool): Whether to pad texts to max_samples.
+        padding_value (str): The text used for padding when padding is True.
+
+    Methods:
+        __call__: Processes the input labels and returns updated classes and texts.
+
+    Examples:
+        >>> loader = RandomLoadText(prompt_format="Object: {}", neg_samples=(5, 10), max_samples=20)
+        >>> labels = {"cls": [0, 1, 2], "texts": [["cat"], ["dog"], ["bird"]], "instances": [...]}
+        >>> updated_labels = loader(labels)
+        >>> print(updated_labels["texts"])
+        ['Object: cat', 'Object: dog', 'Object: bird', 'Object: elephant', 'Object: car']
+    """
+
+    def __init__(
+        self,
+        prompt_format: str = "{}",
+        neg_samples: Tuple[int, int] = (80, 80),
+        max_samples: int = 80,
+        padding: bool = False,
+        padding_value: str = "",
+    ) -> None:
+        """
+        Initializes the RandomLoadText class for randomly sampling positive and negative texts.
+
+        This class is designed to randomly sample positive texts and negative texts, and update the class
+        indices accordingly to the number of samples. It can be used for text-based object detection tasks.
+
+        Args:
+            prompt_format (str): Format string for the prompt. Default is '{}'. The format string should
+                contain a single pair of curly braces {} where the text will be inserted.
+            neg_samples (Tuple[int, int]): A range to randomly sample negative texts. The first integer
+                specifies the minimum number of negative samples, and the second integer specifies the
+                maximum. Default is (80, 80).
+            max_samples (int): The maximum number of different text samples in one image. Default is 80.
+            padding (bool): Whether to pad texts to max_samples. If True, the number of texts will always
+                be equal to max_samples. Default is False.
+            padding_value (str): The padding text to use when padding is True. Default is an empty string.
+
+        Attributes:
+            prompt_format (str): The format string for the prompt.
+            neg_samples (Tuple[int, int]): The range for sampling negative texts.
+            max_samples (int): The maximum number of text samples.
+            padding (bool): Whether padding is enabled.
+            padding_value (str): The value used for padding.
+
+        Examples:
+            >>> random_load_text = RandomLoadText(prompt_format="Object: {}", neg_samples=(50, 100), max_samples=120)
+            >>> random_load_text.prompt_format
+            'Object: {}'
+            >>> random_load_text.neg_samples
+            (50, 100)
+            >>> random_load_text.max_samples
+            120
+        """
+        self.prompt_format = prompt_format
+        self.neg_samples = neg_samples
+        self.max_samples = max_samples
+        self.padding = padding
+        self.padding_value = padding_value
+
+    def __call__(self, labels: dict) -> dict:
+        """
+        Randomly samples positive and negative texts and updates class indices accordingly.
+
+        This method samples positive texts based on the existing class labels in the image, and randomly
+        selects negative texts from the remaining classes. It then updates the class indices to match the
+        new sampled text order.
+
+        Args:
+            labels (Dict): A dictionary containing image labels and metadata. Must include 'texts' and 'cls' keys.
+
+        Returns:
+            (Dict): Updated labels dictionary with new 'cls' and 'texts' entries.
+
+        Examples:
+            >>> loader = RandomLoadText(prompt_format="A photo of {}", neg_samples=(5, 10), max_samples=20)
+            >>> labels = {"cls": np.array([[0], [1], [2]]), "texts": [["dog"], ["cat"], ["bird"]]}
+            >>> updated_labels = loader(labels)
+        """
+        assert "texts" in labels, "No texts found in labels."
+        class_texts = labels["texts"]
+        num_classes = len(class_texts)
+        cls = np.asarray(labels.pop("cls"), dtype=int)
+        pos_labels = np.unique(cls).tolist()
+
+        if len(pos_labels) > self.max_samples:
+            pos_labels = random.sample(pos_labels, k=self.max_samples)
+
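+        # Negative texts fill remaining capacity up to max_samples, bounded by the sampled neg_samples range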
+        neg_samples = min(min(num_classes, self.max_samples) - len(pos_labels), random.randint(*self.neg_samples))
+        neg_labels = [i for i in range(num_classes) if i not in pos_labels]
+        neg_labels = random.sample(neg_labels, k=neg_samples)
+
+        sampled_labels = pos_labels + neg_labels
+        random.shuffle(sampled_labels)
+
+        label2ids = {label: i for i, label in enumerate(sampled_labels)}
+        valid_idx = np.zeros(len(labels["instances"]), dtype=bool)
+        new_cls = []
+        for i, label in enumerate(cls.squeeze(-1).tolist()):
+            if label not in label2ids:
+                continue
+            valid_idx[i] = True
+            new_cls.append([label2ids[label]])
+        labels["instances"] = labels["instances"][valid_idx]
+        labels["cls"] = np.array(new_cls)
+
+        # Randomly select one prompt when there is more than one prompt per class
+        texts = []
+        for label in sampled_labels:
+            prompts = class_texts[label]
+            assert len(prompts) > 0
+            prompt = self.prompt_format.format(prompts[random.randrange(len(prompts))])
+            texts.append(prompt)
+
+        if self.padding:
+            valid_labels = len(pos_labels) + len(neg_labels)
+            num_padding = self.max_samples - valid_labels
+            if num_padding > 0:
+                texts += [self.padding_value] * num_padding
+
+        labels["texts"] = texts
+        return labels
+
+
+def v8_transforms(dataset, imgsz, hyp, stretch=False):
+    """
+    Applies a series of image transformations for training.
+
+    This function creates a composition of image augmentation techniques to prepare images for YOLO training.
+    It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
+
+    Args:
+        dataset (Dataset): The dataset object containing image data and annotations.
+        imgsz (int): The target image size for resizing.
+        hyp (Namespace): A dictionary of hyperparameters controlling various aspects of the transformations.
+        stretch (bool): If True, applies stretching to the image. If False, uses LetterBox resizing.
+
+    Returns:
+        (Compose): A composition of image transformations to be applied to the dataset.
+
+    Examples:
+        >>> from ultralytics.data.dataset import YOLODataset
+        >>> from ultralytics.utils import IterableSimpleNamespace
+        >>> dataset = YOLODataset(img_path="path/to/images", imgsz=640)
+        >>> hyp = IterableSimpleNamespace(mosaic=1.0, copy_paste=0.5, degrees=10.0, translate=0.2, scale=0.9)
+        >>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
+        >>> augmented_data = transforms(dataset[0])
+    """
+    mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
+    affine = RandomPerspective(
+        degrees=hyp.degrees,
+        translate=hyp.translate,
+        scale=hyp.scale,
+        shear=hyp.shear,
+        perspective=hyp.perspective,
+        pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
+    )
+
+    pre_transform = Compose([mosaic, affine])
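+    # 'flip' mode mirrors objects within the same image; 'mixup' mode pastes objects from another sampled image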
+    if hyp.copy_paste_mode == "flip":
+        pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
+    else:
+        pre_transform.append(
+            CopyPaste(
+                dataset,
+                pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
+                p=hyp.copy_paste,
+                mode=hyp.copy_paste_mode,
+            )
+        )
+    flip_idx = dataset.data.get("flip_idx", [])  # for keypoints augmentation
+    if dataset.use_keypoints:
+        kpt_shape = dataset.data.get("kpt_shape", None)
+        if len(flip_idx) == 0 and hyp.fliplr > 0.0:
+            hyp.fliplr = 0.0
+            LOGGER.warning("WARNING ⚠️ No 'flip_idx' array defined in data.yaml, setting augmentation 'fliplr=0.0'")
+        elif flip_idx and (len(flip_idx) != kpt_shape[0]):
+            raise ValueError(f"data.yaml flip_idx={flip_idx} length must be equal to kpt_shape[0]={kpt_shape[0]}")
+
+    return Compose(
+        [
+            pre_transform,
+            MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup),
+            Albumentations(p=1.0),
+            RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v),
+            RandomFlip(direction="vertical", p=hyp.flipud),
+            RandomFlip(direction="horizontal", p=hyp.fliplr, flip_idx=flip_idx),
+        ]
+    )  # transforms
+
+
+# Classification augmentations -----------------------------------------------------------------------------------------
+def classify_transforms(
+    size=224,
+    mean=DEFAULT_MEAN,
+    std=DEFAULT_STD,
+    interpolation="BILINEAR",
+    crop_fraction: float = DEFAULT_CROP_FRACTION,
+):
+    """
+    Creates a composition of image transforms for classification tasks.
+
+    This function generates a sequence of torchvision transforms suitable for preprocessing images
+    for classification models during evaluation or inference. The transforms include resizing,
+    center cropping, conversion to tensor, and normalization.
+
+    Args:
+        size (int | tuple): The target size for the transformed image. If an int, it defines the shortest edge. If a
+            tuple, it defines (height, width).
+        mean (tuple): Mean values for each RGB channel used in normalization.
+        std (tuple): Standard deviation values for each RGB channel used in normalization.
+        interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
+        crop_fraction (float): Fraction of the image to be cropped.
+
+    Returns:
+        (torchvision.transforms.Compose): A composition of torchvision transforms.
+
+    Examples:
+        >>> transforms = classify_transforms(size=224)
+        >>> img = Image.open("path/to/image.jpg")
+        >>> transformed_img = transforms(img)
+    """
+    import torchvision.transforms as T  # scope for faster 'import ultralytics'
+
+    if isinstance(size, (tuple, list)):
+        assert len(size) == 2, f"'size' tuples must be length 2, not length {len(size)}"
+        scale_size = tuple(math.floor(x / crop_fraction) for x in size)
+    else:
+        scale_size = math.floor(size / crop_fraction)
+        scale_size = (scale_size, scale_size)
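+    # Resize target is size / crop_fraction so the subsequent CenterCrop keeps crop_fraction of the resized image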
+
+    # Aspect ratio is preserved; a center crop is taken and image content outside the crop is lost, no borders added
+    if scale_size[0] == scale_size[1]:
+        # Simple case, use torchvision built-in Resize with the shortest edge mode (scalar size arg)
+        tfl = [T.Resize(scale_size[0], interpolation=getattr(T.InterpolationMode, interpolation))]
+    else:
+        # Resize the shortest edge to matching target dim for non-square target
+        tfl = [T.Resize(scale_size)]
+    tfl.extend(
+        [
+            T.CenterCrop(size),
+            T.ToTensor(),
+            T.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)),
+        ]
+    )
+    return T.Compose(tfl)
+
+
+# Classification training augmentations --------------------------------------------------------------------------------
+def classify_augmentations(
+    size=224,
+    mean=DEFAULT_MEAN,
+    std=DEFAULT_STD,
+    scale=None,
+    ratio=None,
+    hflip=0.5,
+    vflip=0.0,
+    auto_augment=None,
+    hsv_h=0.015,  # image HSV-Hue augmentation (fraction)
+    hsv_s=0.4,  # image HSV-Saturation augmentation (fraction)
+    hsv_v=0.4,  # image HSV-Value augmentation (fraction)
+    force_color_jitter=False,
+    erasing=0.0,
+    interpolation="BILINEAR",
+):
+    """
+    Creates a composition of image augmentation transforms for classification tasks.
+
+    This function generates a set of image transformations suitable for training classification models. It includes
+    options for resizing, flipping, color jittering, auto augmentation, and random erasing.
+
+    Args:
+        size (int): Target size for the image after transformations.
+        mean (tuple): Mean values for normalization, one per channel.
+        std (tuple): Standard deviation values for normalization, one per channel.
+        scale (tuple | None): Range of size of the origin size cropped.
+        ratio (tuple | None): Range of aspect ratio of the origin aspect ratio cropped.
+        hflip (float): Probability of horizontal flip.
+        vflip (float): Probability of vertical flip.
+        auto_augment (str | None): Auto augmentation policy. Can be 'randaugment', 'augmix', 'autoaugment' or None.
+        hsv_h (float): Image HSV-Hue augmentation factor.
+        hsv_s (float): Image HSV-Saturation augmentation factor.
+        hsv_v (float): Image HSV-Value augmentation factor.
+        force_color_jitter (bool): Whether to apply color jitter even if auto augment is enabled.
+        erasing (float): Probability of random erasing.
+        interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
+
+    Returns:
+        (torchvision.transforms.Compose): A composition of image augmentation transforms.
+
+    Examples:
+        >>> transforms = classify_augmentations(size=224, auto_augment="randaugment")
+        >>> augmented_image = transforms(original_image)
+    """
+    # Torchvision-based transforms (applied here regardless of whether Albumentations is installed)
+    import torchvision.transforms as T  # scope for faster 'import ultralytics'
+
+    if not isinstance(size, int):
+        raise TypeError(f"classify_augmentations() size {size} must be an integer, not a list or tuple")
+    scale = tuple(scale or (0.08, 1.0))  # default imagenet scale range
+    ratio = tuple(ratio or (3.0 / 4.0, 4.0 / 3.0))  # default imagenet ratio range
+    interpolation = getattr(T.InterpolationMode, interpolation)
+    primary_tfl = [T.RandomResizedCrop(size, scale=scale, ratio=ratio, interpolation=interpolation)]
+    if hflip > 0.0:
+        primary_tfl.append(T.RandomHorizontalFlip(p=hflip))
+    if vflip > 0.0:
+        primary_tfl.append(T.RandomVerticalFlip(p=vflip))
+
+    secondary_tfl = []
+    disable_color_jitter = False
+    if auto_augment:
+        assert isinstance(auto_augment, str), f"Provided argument should be string, but got type {type(auto_augment)}"
+        # Color jitter is typically disabled when AutoAugment/RandAugment is enabled;
+        # this allows overriding that behavior without breaking old hyperparameter configs
+        disable_color_jitter = not force_color_jitter
+
+        if auto_augment == "randaugment":
+            if TORCHVISION_0_11:
+                secondary_tfl.append(T.RandAugment(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=randaugment" requires torchvision >= 0.11.0. Disabling it.')
+
+        elif auto_augment == "augmix":
+            if TORCHVISION_0_13:
+                secondary_tfl.append(T.AugMix(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=augmix" requires torchvision >= 0.13.0. Disabling it.')
+
+        elif auto_augment == "autoaugment":
+            if TORCHVISION_0_10:
+                secondary_tfl.append(T.AutoAugment(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=autoaugment" requires torchvision >= 0.10.0. Disabling it.')
+
+        else:
+            raise ValueError(
+                f'Invalid auto_augment policy: {auto_augment}. Should be one of "randaugment", '
+                f'"augmix", "autoaugment" or None'
+            )
+
+    if not disable_color_jitter:
+        secondary_tfl.append(T.ColorJitter(brightness=hsv_v, contrast=hsv_v, saturation=hsv_s, hue=hsv_h))
+
+    final_tfl = [
+        T.ToTensor(),
+        T.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)),
+        T.RandomErasing(p=erasing, inplace=True),
+    ]
+
+    return T.Compose(primary_tfl + secondary_tfl + final_tfl)
+
+
+# NOTE: keep this class for backward compatibility
+class ClassifyLetterBox:
+    """
+    A class for resizing and padding images for classification tasks.
+
+    This class is designed to be part of a transformation pipeline, e.g., T.Compose([LetterBox(size), ToTensor()]).
+    It resizes and pads images to a specified size while maintaining the original aspect ratio.
+
+    Attributes:
+        h (int): Target height of the image.
+        w (int): Target width of the image.
+        auto (bool): If True, automatically calculates the short side using stride.
+        stride (int): The stride value, used when 'auto' is True.
+
+    Methods:
+        __call__: Applies the letterbox transformation to an input image.
+
+    Examples:
+        >>> transform = ClassifyLetterBox(size=(640, 640), auto=False, stride=32)
+        >>> img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
+        >>> result = transform(img)
+        >>> print(result.shape)
+        (640, 640, 3)
+    """
+
+    def __init__(self, size=(640, 640), auto=False, stride=32):
+        """
+        Initializes the ClassifyLetterBox object for image preprocessing.
+
+        This class is designed to be part of a transformation pipeline for image classification tasks. It resizes and
+        pads images to a specified size while maintaining the original aspect ratio.
+
+        Args:
+            size (int | Tuple[int, int]): Target size for the letterboxed image. If an int, a square image of
+                (size, size) is created. If a tuple, it should be (height, width).
+            auto (bool): If True, automatically calculates the short side based on stride. Default is False.
+            stride (int): The stride value, used when 'auto' is True. Default is 32.
+
+        Attributes:
+            h (int): Target height of the letterboxed image.
+            w (int): Target width of the letterboxed image.
+            auto (bool): Flag indicating whether to automatically calculate short side.
+            stride (int): Stride value for automatic short side calculation.
+
+        Examples:
+            >>> transform = ClassifyLetterBox(size=224)
+            >>> img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
+            >>> result = transform(img)
+            >>> print(result.shape)
+            (224, 224, 3)
+        """
+        super().__init__()
+        self.h, self.w = (size, size) if isinstance(size, int) else size
+        self.auto = auto  # pass max size integer, automatically solve for short side using stride
+        self.stride = stride  # used with auto
+
+    def __call__(self, im):
+        """
+        Resizes and pads an image using the letterbox method.
+
+        This method resizes the input image to fit within the specified dimensions while maintaining its aspect ratio,
+        then pads the resized image to match the target size.
+
+        Args:
+            im (numpy.ndarray): Input image as a numpy array with shape (H, W, C).
+
+        Returns:
+            (numpy.ndarray): Resized and padded image as a numpy array with shape (hs, ws, 3), where hs and ws are
+                the target height and width respectively.
+
+        Examples:
+            >>> letterbox = ClassifyLetterBox(size=(640, 640))
+            >>> image = np.random.randint(0, 255, (720, 1280, 3), dtype=np.uint8)
+            >>> resized_image = letterbox(image)
+            >>> print(resized_image.shape)
+            (640, 640, 3)
+        """
+        imh, imw = im.shape[:2]
+        r = min(self.h / imh, self.w / imw)  # ratio of new/old dimensions
+        h, w = round(imh * r), round(imw * r)  # resized image dimensions
+
+        # Calculate padding dimensions
+        hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else (self.h, self.w)
+        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
+
+        # Create padded image
+        im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)
+        im_out[top : top + h, left : left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
+        return im_out
+
+
+# NOTE: keep this class for backward compatibility
+class CenterCrop:
+    """
+    Applies center cropping to images for classification tasks.
+
+    This class performs center cropping on input images, resizing them to a specified size while maintaining the aspect
+    ratio. It is designed to be part of a transformation pipeline, e.g., T.Compose([CenterCrop(size), ToTensor()]).
+
+    Attributes:
+        h (int): Target height of the cropped image.
+        w (int): Target width of the cropped image.
+
+    Methods:
+        __call__: Applies the center crop transformation to an input image.
+
+    Examples:
+        >>> transform = CenterCrop(640)
+        >>> image = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
+        >>> cropped_image = transform(image)
+        >>> print(cropped_image.shape)
+        (640, 640, 3)
+    """
+
+    def __init__(self, size=640):
+        """
+        Initializes the CenterCrop object for image preprocessing.
+
+        This class is designed to be part of a transformation pipeline, e.g., T.Compose([CenterCrop(size), ToTensor()]).
+        It performs a center crop on input images to a specified size.
+
+        Args:
+            size (int | Tuple[int, int]): The desired output size of the crop. If size is an int, a square crop
+                (size, size) is made. If size is a sequence like (h, w), it is used as the output size.
+
+        Returns:
+            (None): This method initializes the object and does not return anything.
+
+        Examples:
+            >>> transform = CenterCrop(224)
+            >>> img = np.random.rand(300, 300, 3)
+            >>> cropped_img = transform(img)
+            >>> print(cropped_img.shape)
+            (224, 224, 3)
+        """
+        super().__init__()
+        self.h, self.w = (size, size) if isinstance(size, int) else size
+
+    def __call__(self, im):
+        """
+        Applies center cropping to an input image.
+
+        This method resizes and crops the center of the image using a letterbox method. It maintains the aspect
+        ratio of the original image while fitting it into the specified dimensions.
+
+        Args:
+            im (numpy.ndarray | PIL.Image.Image): The input image as a numpy array of shape (H, W, C) or a
+                PIL Image object.
+
+        Returns:
+            (numpy.ndarray): The center-cropped and resized image as a numpy array of shape (self.h, self.w, C).
+
+        Examples:
+            >>> transform = CenterCrop(size=224)
+            >>> image = np.random.randint(0, 255, (640, 480, 3), dtype=np.uint8)
+            >>> cropped_image = transform(image)
+            >>> assert cropped_image.shape == (224, 224, 3)
+        """
+        if isinstance(im, Image.Image):  # convert from PIL to numpy array if required
+            im = np.asarray(im)
+        imh, imw = im.shape[:2]
+        m = min(imh, imw)  # min dimension
+        top, left = (imh - m) // 2, (imw - m) // 2
+        return cv2.resize(im[top : top + m, left : left + m], (self.w, self.h), interpolation=cv2.INTER_LINEAR)
+
+
+# NOTE: keep this class for backward compatibility
+class ToTensor:
+    """
+    Converts an image from a numpy array to a PyTorch tensor.
+
+    This class is designed to be part of a transformation pipeline, e.g., T.Compose([LetterBox(size), ToTensor()]).
+
+    Attributes:
+        half (bool): If True, converts the image to half precision (float16).
+
+    Methods:
+        __call__: Applies the tensor conversion to an input image.
+
+    Examples:
+        >>> transform = ToTensor(half=True)
+        >>> img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+        >>> tensor_img = transform(img)
+        >>> print(tensor_img.shape, tensor_img.dtype)
+        torch.Size([3, 640, 640]) torch.float16
+
+    Notes:
+        The input image is expected to be in BGR format with shape (H, W, C).
+        The output tensor will be in RGB format with shape (C, H, W), normalized to [0, 1].
+    """
+
+    def __init__(self, half=False):
+        """
+        Initializes the ToTensor object for converting images to PyTorch tensors.
+
+        This class is designed to be used as part of a transformation pipeline for image preprocessing in the
+        Ultralytics YOLO framework. It converts numpy arrays or PIL Images to PyTorch tensors, with an option
+        for half-precision (float16) conversion.
+
+        Args:
+            half (bool): If True, converts the tensor to half precision (float16). Default is False.
+
+        Examples:
+            >>> transform = ToTensor(half=True)
+            >>> img = np.random.rand(640, 640, 3)
+            >>> tensor_img = transform(img)
+            >>> print(tensor_img.dtype)
+            torch.float16
+        """
+        super().__init__()
+        self.half = half
+
+    def __call__(self, im):
+        """
+        Transforms an image from a numpy array to a PyTorch tensor.
+
+        This method converts the input image from a numpy array to a PyTorch tensor, applying optional
+        half-precision conversion and normalization. The image is transposed from HWC to CHW format and
+        the color channels are reversed from BGR to RGB.
+
+        Args:
+            im (numpy.ndarray): Input image as a numpy array with shape (H, W, C) in BGR order.
+
+        Returns:
+            (torch.Tensor): The transformed image as a PyTorch tensor in float32 or float16, normalized
+                to [0, 1] with shape (C, H, W) in RGB order.
+
+        Examples:
+            >>> transform = ToTensor(half=True)
+            >>> img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+            >>> tensor_img = transform(img)
+            >>> print(tensor_img.shape, tensor_img.dtype)
+            torch.Size([3, 640, 640]) torch.float16
+        """
+        im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1])  # HWC to CHW -> BGR to RGB -> contiguous
+        im = torch.from_numpy(im)  # to torch
+        im = im.half() if self.half else im.float()  # uint8 to fp16/32
+        im /= 255.0  # 0-255 to 0.0-1.0
+        return im

+ 346 - 0
ultralytics/data/base.py

@@ -0,0 +1,346 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import glob
+import math
+import os
+import random
+from copy import deepcopy
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import psutil
+from torch.utils.data import Dataset
+
+from ultralytics.data.utils import FORMATS_HELP_MSG, HELP_URL, IMG_FORMATS
+from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
+
+
+class BaseDataset(Dataset):
+    """
+    Base dataset class for loading and processing image data.
+
+    Args:
+        img_path (str): Path to the folder containing images.
+        imgsz (int, optional): Image size. Defaults to 640.
+        cache (bool | str, optional): Cache images to RAM (True or "ram") or disk ("disk") during training. Defaults to False.
+        augment (bool, optional): If True, data augmentation is applied. Defaults to True.
+        hyp (dict, optional): Hyperparameters to apply data augmentation. Defaults to None.
+        prefix (str, optional): Prefix to print in log messages. Defaults to ''.
+        rect (bool, optional): If True, rectangular training is used. Defaults to False.
+        batch_size (int, optional): Size of batches. Defaults to 16.
+        stride (int, optional): Stride. Defaults to 32.
+        pad (float, optional): Padding used for rectangular training. Defaults to 0.5.
+        single_cls (bool, optional): If True, single class training is used. Defaults to False.
+        classes (list): List of included classes. Default is None.
+        fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data).
+
+    Attributes:
+        im_files (list): List of image file paths.
+        labels (list): List of label data dictionaries.
+        ni (int): Number of images in the dataset.
+        ims (list): List of loaded images.
+        npy_files (list): List of numpy file paths.
+        transforms (callable): Image transformation function.
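+
+    Example:
+        A minimal subclassing sketch (illustrative; method bodies are placeholders):
+        ```python
+        class MyDataset(BaseDataset):
+            def get_labels(self):
+                ...  # return a list of label dicts; see get_labels() below for the expected keys
+
+            def build_transforms(self, hyp=None):
+                ...  # return a Compose of transforms; see build_transforms() below
+        ```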
+    """
+
+    def __init__(
+        self,
+        img_path,
+        imgsz=640,
+        cache=False,
+        augment=True,
+        hyp=DEFAULT_CFG,
+        prefix="",
+        rect=False,
+        batch_size=16,
+        stride=32,
+        pad=0.5,
+        single_cls=False,
+        classes=None,
+        fraction=1.0,
+    ):
+        """Initialize BaseDataset with given configuration and options."""
+        super().__init__()
+        self.img_path = img_path
+        self.imgsz = imgsz
+        self.augment = augment
+        self.single_cls = single_cls
+        self.prefix = prefix
+        self.fraction = fraction
+        self.im_files = self.get_img_files(self.img_path)
+        self.labels = self.get_labels()
+        self.update_labels(include_class=classes)  # single_cls and include_class
+        self.ni = len(self.labels)  # number of images
+        self.rect = rect
+        self.batch_size = batch_size
+        self.stride = stride
+        self.pad = pad
+        if self.rect:
+            assert self.batch_size is not None
+            self.set_rectangle()
+
+        # Buffer thread for mosaic images
+        self.buffer = []  # buffer size = batch size
+        self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
+
+        # Cache images (options are cache = True, False, None, "ram", "disk")
+        self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
+        self.npy_files = [Path(f).with_suffix(".npy") for f in self.im_files]
+        self.cache = cache.lower() if isinstance(cache, str) else "ram" if cache is True else None
+        if self.cache == "ram" and self.check_cache_ram():
+            if hyp.deterministic:
+                LOGGER.warning(
+                    "WARNING ⚠️ cache='ram' may produce non-deterministic training results. "
+                    "Consider cache='disk' as a deterministic alternative if your disk space allows."
+                )
+            self.cache_images()
+        elif self.cache == "disk" and self.check_cache_disk():
+            self.cache_images()
+
+        # Transforms
+        self.transforms = self.build_transforms(hyp=hyp)
+
+    def get_img_files(self, img_path):
+        """Read image files."""
+        try:
+            f = []  # image files
+            for p in img_path if isinstance(img_path, list) else [img_path]:
+                p = Path(p)  # os-agnostic
+                if p.is_dir():  # dir
+                    f += glob.glob(str(p / "**" / "*.*"), recursive=True)
+                    # F = list(p.rglob('*.*'))  # pathlib
+                elif p.is_file():  # file
+                    with open(p) as t:
+                        t = t.read().strip().splitlines()
+                        parent = str(p.parent) + os.sep
+                        f += [x.replace("./", parent) if x.startswith("./") else x for x in t]  # local to global path
+                        # F += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
+                else:
+                    raise FileNotFoundError(f"{self.prefix}{p} does not exist")
+            im_files = sorted(x.replace("/", os.sep) for x in f if x.split(".")[-1].lower() in IMG_FORMATS)
+            # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
+            assert im_files, f"{self.prefix}No images found in {img_path}. {FORMATS_HELP_MSG}"
+        except Exception as e:
+            raise FileNotFoundError(f"{self.prefix}Error loading data from {img_path}\n{HELP_URL}") from e
+        if self.fraction < 1:
+            im_files = im_files[: round(len(im_files) * self.fraction)]  # retain a fraction of the dataset
+        return im_files
+
+    def update_labels(self, include_class: Optional[list]):
+        """Update labels to include only these classes (optional)."""
+        include_class_array = np.array(include_class).reshape(1, -1)
+        for i in range(len(self.labels)):
+            if include_class is not None:
+                cls = self.labels[i]["cls"]
+                bboxes = self.labels[i]["bboxes"]
+                segments = self.labels[i]["segments"]
+                keypoints = self.labels[i]["keypoints"]
+                j = (cls == include_class_array).any(1)
+                self.labels[i]["cls"] = cls[j]
+                self.labels[i]["bboxes"] = bboxes[j]
+                if segments:
+                    self.labels[i]["segments"] = [segments[si] for si, idx in enumerate(j) if idx]
+                if keypoints is not None:
+                    self.labels[i]["keypoints"] = keypoints[j]
+            if self.single_cls:
+                self.labels[i]["cls"][:, 0] = 0
+
+    def load_image(self, i, rect_mode=True):
+        """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
+        if im is None:  # not cached in RAM
+            if fn.exists():  # load npy
+                try:
+                    im = np.load(fn)
+                except Exception as e:
+                    LOGGER.warning(f"{self.prefix}WARNING ⚠️ Removing corrupt *.npy image file {fn} due to: {e}")
+                    Path(fn).unlink(missing_ok=True)
+                    im = cv2.imread(f)  # BGR
+            else:  # read image
+                im = cv2.imread(f)  # BGR
+            if im is None:
+                raise FileNotFoundError(f"Image Not Found {f}")
+
+            h0, w0 = im.shape[:2]  # orig hw
+            if rect_mode:  # resize long side to imgsz while maintaining aspect ratio
+                r = self.imgsz / max(h0, w0)  # ratio
+                if r != 1:  # if sizes are not equal
+                    w, h = (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz))
+                    im = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
+            elif not (h0 == w0 == self.imgsz):  # resize by stretching image to square imgsz
+                im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR)
+
+            # Add to buffer if training with augmentations
+            if self.augment:
+                self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
+                self.buffer.append(i)
+                if 1 < len(self.buffer) >= self.max_buffer_length:  # pop the oldest entry once the buffer is full, but never empty it
+                    j = self.buffer.pop(0)
+                    if self.cache != "ram":
+                        self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
+
+            return im, (h0, w0), im.shape[:2]
+
+        return self.ims[i], self.im_hw0[i], self.im_hw[i]
+
+    def cache_images(self):
+        """Cache images to memory or disk."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        fcn, storage = (self.cache_images_to_disk, "Disk") if self.cache == "disk" else (self.load_image, "RAM")
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(fcn, range(self.ni))
+            pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
+            for i, x in pbar:
+                if self.cache == "disk":
+                    b += self.npy_files[i].stat().st_size
+                else:  # 'ram'
+                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
+                    b += self.ims[i].nbytes
+                pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {storage})"
+            pbar.close()
+
+    def cache_images_to_disk(self, i):
+        """Saves an image as an *.npy file for faster loading."""
+        f = self.npy_files[i]
+        if not f.exists():
+            np.save(f.as_posix(), cv2.imread(self.im_files[i]), allow_pickle=False)
+
+    def check_cache_disk(self, safety_margin=0.5):
+        """Check image caching requirements vs available disk space."""
+        import shutil
+
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        n = min(self.ni, 30)  # extrapolate from 30 random images
+        for _ in range(n):
+            im_file = random.choice(self.im_files)
+            im = cv2.imread(im_file)
+            if im is None:
+                continue
+            b += im.nbytes
+            if not os.access(Path(im_file).parent, os.W_OK):
+                self.cache = None
+                LOGGER.info(f"{self.prefix}Skipping caching images to disk, directory not writeable ⚠️")
+                return False
+        disk_required = b * self.ni / n * (1 + safety_margin)  # bytes required to cache dataset to disk
+        total, used, free = shutil.disk_usage(Path(self.im_files[0]).parent)
+        if disk_required > free:
+            self.cache = None
+            LOGGER.info(
+                f"{self.prefix}{disk_required / gb:.1f}GB disk space required, "
+                f"with {int(safety_margin * 100)}% safety margin but only "
+                f"{free / gb:.1f}/{total / gb:.1f}GB free, not caching images to disk ⚠️"
+            )
+            return False
+        return True
+
+    def check_cache_ram(self, safety_margin=0.5):
+        """Check image caching requirements vs available memory."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        n = min(self.ni, 30)  # extrapolate from 30 random images
+        for _ in range(n):
+            im = cv2.imread(random.choice(self.im_files))  # sample image
+            if im is None:
+                continue
+            ratio = self.imgsz / max(im.shape[0], im.shape[1])  # ratio of target imgsz to max(h, w)
+            b += im.nbytes * ratio**2
+        mem_required = b * self.ni / n * (1 + safety_margin)  # bytes required to cache dataset into RAM
+        mem = psutil.virtual_memory()
+        if mem_required > mem.available:
+            self.cache = None
+            LOGGER.info(
+                f"{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images "
+                f"with {int(safety_margin * 100)}% safety margin but only "
+                f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, not caching images ⚠️"
+            )
+            return False
+        return True
+
+    def set_rectangle(self):
+        """Sets the shape of bounding boxes for YOLO detections as rectangles."""
+        bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int)  # batch index
+        nb = bi[-1] + 1  # number of batches
+
+        s = np.array([x.pop("shape") for x in self.labels])  # hw
+        ar = s[:, 0] / s[:, 1]  # aspect ratio
+        irect = ar.argsort()
+        self.im_files = [self.im_files[i] for i in irect]
+        self.labels = [self.labels[i] for i in irect]
+        ar = ar[irect]
+
+        # Set training image shapes
+        shapes = [[1, 1]] * nb
+        for i in range(nb):
+            ari = ar[bi == i]
+            mini, maxi = ari.min(), ari.max()
+            if maxi < 1:
+                shapes[i] = [maxi, 1]
+            elif mini > 1:
+                shapes[i] = [1, 1 / mini]
+
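+        # Illustrative arithmetic (example values): with imgsz=640, stride=32, pad=0.5, a batch of wide images
+        # whose largest h/w aspect ratio is 0.75 gets batch_shapes = ceil([0.75, 1] * 640 / 32 + 0.5) * 32 = [512, 672]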
+        self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
+        self.batch = bi  # batch index of image
+
+    def __getitem__(self, index):
+        """Returns transformed label information for given index."""
+        return self.transforms(self.get_image_and_label(index))
+
+    def get_image_and_label(self, index):
+        """Get and return label information from the dataset."""
+        label = deepcopy(self.labels[index])  # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
+        label.pop("shape", None)  # shape is for rect, remove it
+        label["img"], label["ori_shape"], label["resized_shape"] = self.load_image(index)
+        label["ratio_pad"] = (
+            label["resized_shape"][0] / label["ori_shape"][0],
+            label["resized_shape"][1] / label["ori_shape"][1],
+        )  # for evaluation
+        if self.rect:
+            label["rect_shape"] = self.batch_shapes[self.batch[index]]
+        return self.update_labels_info(label)
+
+    def __len__(self):
+        """Returns the length of the labels list for the dataset."""
+        return len(self.labels)
+
+    def update_labels_info(self, label):
+        """Custom your label format here."""
+        return label
+
+    def build_transforms(self, hyp=None):
+        """
+        Users can customize augmentations here.
+
+        Example:
+            ```python
+            if self.augment:
+                # Training transforms
+                return Compose([])
+            else:
+                # Val transforms
+                return Compose([])
+            ```
+        """
+        raise NotImplementedError
+
+    def get_labels(self):
+        """
+        Users can customize their own format here.
+
+        Note:
+            Ensure output is a dictionary with the following keys:
+            ```python
+            dict(
+                im_file=im_file,
+                shape=shape,  # format: (height, width)
+                cls=cls,
+                bboxes=bboxes,  # xywh
+                segments=segments,  # xy
+                keypoints=keypoints,  # xy
+                normalized=True,  # or False
+                bbox_format="xyxy",  # or xywh, ltwh
+            )
+            ```
+        """
+        raise NotImplementedError

+ 215 - 0
ultralytics/data/build.py

@@ -0,0 +1,215 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import dataloader, distributed
+
+from ultralytics.data.dataset import GroundingDataset, YOLODataset, YOLOMultiModalDataset
+from ultralytics.data.loaders import (
+    LOADERS,
+    LoadImagesAndVideos,
+    LoadPilAndNumpy,
+    LoadScreenshots,
+    LoadStreams,
+    LoadTensor,
+    SourceTypes,
+    autocast_list,
+)
+from ultralytics.data.utils import IMG_FORMATS, PIN_MEMORY, VID_FORMATS
+from ultralytics.utils import RANK, colorstr
+from ultralytics.utils.checks import check_file
+
+
+class InfiniteDataLoader(dataloader.DataLoader):
+    """
+    Dataloader that reuses workers.
+
+    Uses same syntax as vanilla DataLoader.
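+
+    Examples:
+        >>> # Illustrative; `train_dataset` stands in for any map-style torch Dataset
+        >>> loader = InfiniteDataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
+        >>> batch = next(iter(loader))  # worker processes persist across epochs instead of being respawned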
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Dataloader that infinitely recycles workers, inherits from DataLoader."""
+        super().__init__(*args, **kwargs)
+        object.__setattr__(self, "batch_sampler", _RepeatSampler(self.batch_sampler))
+        self.iterator = super().__iter__()
+
+    def __len__(self):
+        """Returns the length of the batch sampler's sampler."""
+        return len(self.batch_sampler.sampler)
+
+    def __iter__(self):
+        """Creates a sampler that repeats indefinitely."""
+        for _ in range(len(self)):
+            yield next(self.iterator)
+
+    def __del__(self):
+        """Ensure that workers are terminated."""
+        if hasattr(self.iterator, "_workers"):
+            for w in self.iterator._workers:  # force terminate
+                if w.is_alive():
+                    w.terminate()
+            self.iterator._shutdown_workers()  # cleanup
+
+    def reset(self):
+        """
+        Reset iterator.
+
+        This is useful when we want to modify settings of dataset while training.
+        """
+        self.iterator = self._get_iterator()
+
+
+class _RepeatSampler:
+    """
+    Sampler that repeats forever.
+
+    Args:
+        sampler (Dataset.sampler): The sampler to repeat.
+    """
+
+    def __init__(self, sampler):
+        """Initializes an object that repeats a given sampler indefinitely."""
+        self.sampler = sampler
+
+    def __iter__(self):
+        """Iterates over the 'sampler' and yields its contents."""
+        while True:
+            yield from iter(self.sampler)
+
+
+def seed_worker(worker_id):  # noqa
+    """Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
+    worker_seed = torch.initial_seed() % 2**32
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+
+def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
+    """Build YOLO Dataset."""
+    dataset = YOLOMultiModalDataset if multi_modal else YOLODataset
+    return dataset(
+        img_path=img_path,
+        imgsz=cfg.imgsz,
+        batch_size=batch,
+        augment=mode == "train",  # augmentation
+        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
+        rect=cfg.rect or rect,  # rectangular batches
+        cache=cfg.cache or None,
+        single_cls=cfg.single_cls or False,
+        stride=int(stride),
+        pad=0.0 if mode == "train" else 0.5,
+        prefix=colorstr(f"{mode}: "),
+        task=cfg.task,
+        classes=cfg.classes,
+        data=data,
+        fraction=cfg.fraction if mode == "train" else 1.0,
+    )
+
+
+def build_grounding(cfg, img_path, json_file, batch, mode="train", rect=False, stride=32):
+    """Build YOLO Dataset."""
+    return GroundingDataset(
+        img_path=img_path,
+        json_file=json_file,
+        imgsz=cfg.imgsz,
+        batch_size=batch,
+        augment=mode == "train",  # augmentation
+        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
+        rect=cfg.rect or rect,  # rectangular batches
+        cache=cfg.cache or None,
+        single_cls=cfg.single_cls or False,
+        stride=int(stride),
+        pad=0.0 if mode == "train" else 0.5,
+        prefix=colorstr(f"{mode}: "),
+        task=cfg.task,
+        classes=cfg.classes,
+        fraction=cfg.fraction if mode == "train" else 1.0,
+    )
+
+
+def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
+    """Return an InfiniteDataLoader or DataLoader for training or validation set."""
+    batch = min(batch, len(dataset))
+    nd = torch.cuda.device_count()  # number of CUDA devices
+    nw = min(os.cpu_count() // max(nd, 1), workers)  # number of workers
+    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
+    generator = torch.Generator()
+    generator.manual_seed(6148914691236517205 + RANK)
+    return InfiniteDataLoader(
+        dataset=dataset,
+        batch_size=batch,
+        shuffle=shuffle and sampler is None,
+        num_workers=nw,
+        sampler=sampler,
+        pin_memory=PIN_MEMORY,
+        collate_fn=getattr(dataset, "collate_fn", None),
+        worker_init_fn=seed_worker,
+        generator=generator,
+    )
+
+
+def check_source(source):
+    """Check source type and return corresponding flag values."""
+    webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
+    if isinstance(source, (str, int, Path)):  # int for local usb camera
+        source = str(source)
+        is_file = Path(source).suffix[1:] in (IMG_FORMATS | VID_FORMATS)
+        is_url = source.lower().startswith(("https://", "http://", "rtsp://", "rtmp://", "tcp://"))
+        webcam = source.isnumeric() or source.endswith(".streams") or (is_url and not is_file)
+        screenshot = source.lower() == "screen"
+        if is_url and is_file:
+            source = check_file(source)  # download
+    elif isinstance(source, LOADERS):
+        in_memory = True
+    elif isinstance(source, (list, tuple)):
+        source = autocast_list(source)  # convert all list elements to PIL or np arrays
+        from_img = True
+    elif isinstance(source, (Image.Image, np.ndarray)):
+        from_img = True
+    elif isinstance(source, torch.Tensor):
+        tensor = True
+    else:
+        raise TypeError("Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict")
+
+    return source, webcam, screenshot, from_img, in_memory, tensor
+
+
+def load_inference_source(source=None, batch=1, vid_stride=1, buffer=False):
+    """
+    Loads an inference source for object detection and applies necessary transformations.
+
+    Args:
+        source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
+        batch (int, optional): Batch size for dataloaders. Default is 1.
+        vid_stride (int, optional): The frame interval for video sources. Default is 1.
+        buffer (bool, optional): Determines whether stream frames will be buffered. Default is False.
+
+    Returns:
+        dataset (Dataset): A dataset object for the specified input source.
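+
+    Examples:
+        >>> dataset = load_inference_source("path/to/image.jpg")  # illustrative path
+        >>> dataset = load_inference_source("rtsp://example.com/stream", vid_stride=2, buffer=True)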
+    """
+    source, stream, screenshot, from_img, in_memory, tensor = check_source(source)
+    source_type = source.source_type if in_memory else SourceTypes(stream, screenshot, from_img, tensor)
+
+    # Dataloader
+    if tensor:
+        dataset = LoadTensor(source)
+    elif in_memory:
+        dataset = source
+    elif stream:
+        dataset = LoadStreams(source, vid_stride=vid_stride, buffer=buffer)
+    elif screenshot:
+        dataset = LoadScreenshots(source)
+    elif from_img:
+        dataset = LoadPilAndNumpy(source)
+    else:
+        dataset = LoadImagesAndVideos(source, batch=batch, vid_stride=vid_stride)
+
+    # Attach source types to the dataset
+    setattr(dataset, "source_type", source_type)
+
+    return dataset

+ 702 - 0
ultralytics/data/converter.py

@@ -0,0 +1,702 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import json
+import random
+import shutil
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from ultralytics.utils import DATASETS_DIR, LOGGER, NUM_THREADS, TQDM
+from ultralytics.utils.downloads import download
+from ultralytics.utils.files import increment_path
+
+
+def coco91_to_coco80_class():
+    """
+    Converts 91-index COCO class IDs to 80-index COCO class IDs.
+
+    Returns:
+        (list): A list of 91 entries indexed by (91-index class ID - 1), where each value is the corresponding
+            80-index class ID, or None if the 91-index class has no 80-index equivalent.
+    """
+    return [
+        0,
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        None,
+        11,
+        12,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        None,
+        24,
+        25,
+        None,
+        None,
+        26,
+        27,
+        28,
+        29,
+        30,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        None,
+        40,
+        41,
+        42,
+        43,
+        44,
+        45,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        None,
+        60,
+        None,
+        None,
+        61,
+        None,
+        62,
+        63,
+        64,
+        65,
+        66,
+        67,
+        68,
+        69,
+        70,
+        71,
+        72,
+        None,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        None,
+    ]
+
+
+def coco80_to_coco91_class():
+    r"""
+    Converts 80-index (val2014) to 91-index (paper).
+    For details see https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/.
+
+    Example:
+        ```python
+        import numpy as np
+
+        a = np.loadtxt("data/coco.names", dtype="str", delimiter="\n")
+        b = np.loadtxt("data/coco_paper.names", dtype="str", delimiter="\n")
+        x1 = [list(a[i] == b).index(True) + 1 for i in range(80)]  # darknet to coco
+        x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)]  # coco to darknet
+        ```
+    """
+    return [
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        11,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        24,
+        25,
+        27,
+        28,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        40,
+        41,
+        42,
+        43,
+        44,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        60,
+        61,
+        62,
+        63,
+        64,
+        65,
+        67,
+        70,
+        72,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        80,
+        81,
+        82,
+        84,
+        85,
+        86,
+        87,
+        88,
+        89,
+        90,
+    ]
+
+
+def convert_coco(
+    labels_dir="../coco/annotations/",
+    save_dir="coco_converted/",
+    use_segments=False,
+    use_keypoints=False,
+    cls91to80=True,
+    lvis=False,
+):
+    """
+    Converts COCO dataset annotations to a YOLO annotation format suitable for training YOLO models.
+
+    Args:
+        labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
+        save_dir (str, optional): Path to directory to save results to.
+        use_segments (bool, optional): Whether to include segmentation masks in the output.
+        use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
+        cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
+        lvis (bool, optional): Whether to convert data in lvis dataset way.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_coco
+
+        convert_coco("../datasets/coco/annotations/", use_segments=True, use_keypoints=False, cls91to80=False)
+        convert_coco(
+            "../datasets/lvis/annotations/", use_segments=True, use_keypoints=False, cls91to80=False, lvis=True
+        )
+        ```
+
+    Output:
+        Generates output files in the specified output directory.
+    """
+    # Create dataset directory
+    save_dir = increment_path(save_dir)  # increment if save directory already exists
+    for p in save_dir / "labels", save_dir / "images":
+        p.mkdir(parents=True, exist_ok=True)  # make dir
+
+    # Convert classes
+    coco80 = coco91_to_coco80_class()
+
+    # Import json
+    for json_file in sorted(Path(labels_dir).resolve().glob("*.json")):
+        lname = "" if lvis else json_file.stem.replace("instances_", "")
+        fn = Path(save_dir) / "labels" / lname  # folder name
+        fn.mkdir(parents=True, exist_ok=True)
+        if lvis:
+            # NOTE: create folders for both train and val in advance,
+            # since LVIS val set contains images from COCO 2017 train in addition to the COCO 2017 val split.
+            (fn / "train2017").mkdir(parents=True, exist_ok=True)
+            (fn / "val2017").mkdir(parents=True, exist_ok=True)
+        with open(json_file, encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Create image dict
+        images = {f"{x['id']:d}": x for x in data["images"]}
+        # Create image-annotations dict
+        imgToAnns = defaultdict(list)
+        for ann in data["annotations"]:
+            imgToAnns[ann["image_id"]].append(ann)
+
+        image_txt = []
+        # Write labels file
+        for img_id, anns in TQDM(imgToAnns.items(), desc=f"Annotations {json_file}"):
+            img = images[f"{img_id:d}"]
+            h, w = img["height"], img["width"]
+            f = str(Path(img["coco_url"]).relative_to("http://images.cocodataset.org")) if lvis else img["file_name"]
+            if lvis:
+                image_txt.append(str(Path("./images") / f))
+
+            bboxes = []
+            segments = []
+            keypoints = []
+            for ann in anns:
+                if ann.get("iscrowd", False):
+                    continue
+                # The COCO box format is [top left x, top left y, width, height]
+                box = np.array(ann["bbox"], dtype=np.float64)
+                box[:2] += box[2:] / 2  # xy top-left corner to center
+                box[[0, 2]] /= w  # normalize x
+                box[[1, 3]] /= h  # normalize y
+                if box[2] <= 0 or box[3] <= 0:  # skip boxes with non-positive width or height
+                    continue
+
+                cls = coco80[ann["category_id"] - 1] if cls91to80 else ann["category_id"] - 1  # class
+                box = [cls] + box.tolist()
+                if box not in bboxes:
+                    bboxes.append(box)
+                    if use_segments and ann.get("segmentation") is not None:
+                        if len(ann["segmentation"]) == 0:
+                            segments.append([])
+                            continue
+                        elif len(ann["segmentation"]) > 1:
+                            s = merge_multi_segment(ann["segmentation"])
+                            s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
+                        else:
+                            s = [j for i in ann["segmentation"] for j in i]  # all segments concatenated
+                            s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
+                        s = [cls] + s
+                        segments.append(s)
+                    if use_keypoints and ann.get("keypoints") is not None:
+                        keypoints.append(
+                            box + (np.array(ann["keypoints"]).reshape(-1, 3) / np.array([w, h, 1])).reshape(-1).tolist()
+                        )
+
+            # Write
+            with open((fn / f).with_suffix(".txt"), "a") as file:
+                for i in range(len(bboxes)):
+                    if use_keypoints:
+                        line = (*(keypoints[i]),)  # cls, box, keypoints
+                    else:
+                        line = (
+                            *(segments[i] if use_segments and len(segments[i]) > 0 else bboxes[i]),
+                        )  # cls, box or segments
+                    file.write(("%g " * len(line)).rstrip() % line + "\n")
+
+        if lvis:
+            with open((Path(save_dir) / json_file.name.replace("lvis_v1_", "").replace(".json", ".txt")), "a") as f:
+                f.writelines(f"{line}\n" for line in image_txt)
+
+    LOGGER.info(f"{'LVIS' if lvis else 'COCO'} data converted successfully.\nResults saved to {save_dir.resolve()}")
+
+
+def convert_segment_masks_to_yolo_seg(masks_dir, output_dir, classes):
+    """
+    Converts a dataset of segmentation mask images to the YOLO segmentation format.
+
+    This function takes the directory containing the binary format mask images and converts them into YOLO segmentation format.
+    The converted masks are saved in the specified output directory.
+
+    Args:
+        masks_dir (str): The path to the directory where all mask images (png, jpg) are stored.
+        output_dir (str): The path to the directory where the converted YOLO segmentation masks will be stored.
+        classes (int): Total number of classes in the dataset, e.g. classes=80 for COCO.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_segment_masks_to_yolo_seg
+
+        # The classes here is the total classes in the dataset, for COCO dataset we have 80 classes
+        convert_segment_masks_to_yolo_seg("path/to/masks_directory", "path/to/output/directory", classes=80)
+        ```
+
+    Notes:
+        The expected directory structure for the masks is:
+
+            - masks
+                ├─ mask_image_01.png or mask_image_01.jpg
+                ├─ mask_image_02.png or mask_image_02.jpg
+                ├─ mask_image_03.png or mask_image_03.jpg
+                └─ mask_image_04.png or mask_image_04.jpg
+
+        After execution, the labels will be organized in the following structure:
+
+            - output_dir
+                ├─ mask_image_01.txt
+                ├─ mask_image_02.txt
+                ├─ mask_image_03.txt
+                └─ mask_image_04.txt
+    """
+    pixel_to_class_mapping = {i + 1: i for i in range(classes)}
+    for mask_path in Path(masks_dir).iterdir():
+        if mask_path.suffix in {".png", ".jpg"}:
+            mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)  # Read the mask image in grayscale
+            img_height, img_width = mask.shape  # Get image dimensions
+            LOGGER.info(f"Processing {mask_path} imgsz = {img_height} x {img_width}")
+
+            unique_values = np.unique(mask)  # Get unique pixel values representing different classes
+            yolo_format_data = []
+
+            for value in unique_values:
+                if value == 0:
+                    continue  # Skip background
+                class_index = pixel_to_class_mapping.get(value, -1)
+                if class_index == -1:
+                    LOGGER.warning(f"Unknown class for pixel value {value} in file {mask_path}, skipping.")
+                    continue
+
+                # Create a binary mask for the current class and find contours
+                contours, _ = cv2.findContours(
+                    (mask == value).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+                )  # Find contours
+
+                for contour in contours:
+                    if len(contour) >= 3:  # YOLO requires at least 3 points for a valid segmentation
+                        contour = contour.squeeze()  # Remove single-dimensional entries
+                        yolo_format = [class_index]
+                        for point in contour:
+                            # Normalize the coordinates
+                            yolo_format.append(round(point[0] / img_width, 6))  # Rounding to 6 decimal places
+                            yolo_format.append(round(point[1] / img_height, 6))
+                        yolo_format_data.append(yolo_format)
+            # Save Ultralytics YOLO format data to file
+            output_path = Path(output_dir) / f"{mask_path.stem}.txt"
+            with open(output_path, "w") as file:
+                for item in yolo_format_data:
+                    line = " ".join(map(str, item))
+                    file.write(line + "\n")
+            LOGGER.info(f"Processed and stored at {output_path} imgsz = {img_height} x {img_width}")
+
+
+def convert_dota_to_yolo_obb(dota_root_path: str):
+    """
+    Converts DOTA dataset annotations to YOLO OBB (Oriented Bounding Box) format.
+
+    The function processes images in the 'train' and 'val' folders of the DOTA dataset. For each image, it reads the
+    associated label from the original labels directory and writes new labels in YOLO OBB format to a new directory.
+
+    Args:
+        dota_root_path (str): The root directory path of the DOTA dataset.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_dota_to_yolo_obb
+
+        convert_dota_to_yolo_obb("path/to/DOTA")
+        ```
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+
+            - DOTA
+                ├─ images
+                │   ├─ train
+                │   └─ val
+                └─ labels
+                    ├─ train_original
+                    └─ val_original
+
+        After execution, the function will organize the labels into:
+
+            - DOTA
+                └─ labels
+                    ├─ train
+                    └─ val
+    """
+    dota_root_path = Path(dota_root_path)
+
+    # Class names to indices mapping
+    class_mapping = {
+        "plane": 0,
+        "ship": 1,
+        "storage-tank": 2,
+        "baseball-diamond": 3,
+        "tennis-court": 4,
+        "basketball-court": 5,
+        "ground-track-field": 6,
+        "harbor": 7,
+        "bridge": 8,
+        "large-vehicle": 9,
+        "small-vehicle": 10,
+        "helicopter": 11,
+        "roundabout": 12,
+        "soccer-ball-field": 13,
+        "swimming-pool": 14,
+        "container-crane": 15,
+        "airport": 16,
+        "helipad": 17,
+    }
+
+    def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
+        """Converts a single image's DOTA annotation to YOLO OBB format and saves it to a specified directory."""
+        orig_label_path = orig_label_dir / f"{image_name}.txt"
+        save_path = save_dir / f"{image_name}.txt"
+
+        with orig_label_path.open("r") as f, save_path.open("w") as g:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.strip().split()
+                if len(parts) < 9:
+                    continue
+                class_name = parts[8]
+                class_idx = class_mapping[class_name]
+                coords = [float(p) for p in parts[:8]]
+                normalized_coords = [
+                    coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)
+                ]
+                formatted_coords = [f"{coord:.6g}" for coord in normalized_coords]
+                g.write(f"{class_idx} {' '.join(formatted_coords)}\n")
+
+    for phase in ["train", "val"]:
+        image_dir = dota_root_path / "images" / phase
+        orig_label_dir = dota_root_path / "labels" / f"{phase}_original"
+        save_dir = dota_root_path / "labels" / phase
+
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        image_paths = list(image_dir.iterdir())
+        for image_path in TQDM(image_paths, desc=f"Processing {phase} images"):
+            if image_path.suffix != ".png":
+                continue
+            image_name_without_ext = image_path.stem
+            img = cv2.imread(str(image_path))
+            h, w = img.shape[:2]
+            convert_label(image_name_without_ext, w, h, orig_label_dir, save_dir)
+
+
+def min_index(arr1, arr2):
+    """
+    Find a pair of indexes with the shortest distance between two arrays of 2D points.
+
+    Args:
+        arr1 (np.ndarray): A NumPy array of shape (N, 2) representing N 2D points.
+        arr2 (np.ndarray): A NumPy array of shape (M, 2) representing M 2D points.
+
+    Returns:
+        (tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
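+
+    Example:
+        ```python
+        import numpy as np
+
+        arr1 = np.array([[0, 0], [10, 10]])  # illustrative points
+        arr2 = np.array([[9, 9], [100, 100]])
+        i, j = min_index(arr1, arr2)  # -> (1, 0): arr1[1] and arr2[0] are the closest pair
+        ```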
+    """
+    dis = ((arr1[:, None, :] - arr2[None, :, :]) ** 2).sum(-1)
+    return np.unravel_index(np.argmin(dis, axis=None), dis.shape)
+
+
+def merge_multi_segment(segments):
+    """
+    Merge multiple segments into one list by connecting the coordinates with the minimum distance between each segment.
+    This function connects these coordinates with a thin line to merge all segments into one.
+
+    Args:
+        segments (List[List]): Original segmentations in COCO's JSON file.
+                               Each element is a list of coordinates, like [segmentation1, segmentation2,...].
+
+    Returns:
+        s (List[np.ndarray]): A list of connected segments represented as NumPy arrays.
+    """
+    s = []
+    segments = [np.array(i).reshape(-1, 2) for i in segments]
+    idx_list = [[] for _ in range(len(segments))]
+
+    # Record the indexes with min distance between each segment
+    for i in range(1, len(segments)):
+        idx1, idx2 = min_index(segments[i - 1], segments[i])
+        idx_list[i - 1].append(idx1)
+        idx_list[i].append(idx2)
+
+    # Use two rounds (a forward and a backward pass) to connect all the segments
+    for k in range(2):
+        # Forward connection
+        if k == 0:
+            for i, idx in enumerate(idx_list):
+                # Middle segments have two indexes, reverse the index of middle segments
+                if len(idx) == 2 and idx[0] > idx[1]:
+                    idx = idx[::-1]
+                    segments[i] = segments[i][::-1, :]
+
+                segments[i] = np.roll(segments[i], -idx[0], axis=0)
+                segments[i] = np.concatenate([segments[i], segments[i][:1]])
+                # Deal with the first segment and the last one
+                if i in {0, len(idx_list) - 1}:
+                    s.append(segments[i])
+                else:
+                    idx = [0, idx[1] - idx[0]]
+                    s.append(segments[i][idx[0] : idx[1] + 1])
+
+        else:
+            for i in range(len(idx_list) - 1, -1, -1):
+                if i not in {0, len(idx_list) - 1}:
+                    idx = idx_list[i]
+                    nidx = abs(idx[1] - idx[0])
+                    s.append(segments[i][nidx:])
+    return s
+
+
+def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt", device=None):
+    """
+    Converts an existing object detection dataset (bounding boxes) to a segmentation or oriented bounding box (OBB)
+    dataset in YOLO format. Generates segmentation data using the SAM auto-annotator as needed.
+
+    Args:
+        im_dir (str | Path): Path to image directory to convert.
+        save_dir (str | Path): Path to save the generated labels. If None, labels are saved to a `labels-segment`
+            directory at the same level as `im_dir`. Default: None.
+        sam_model (str): Segmentation model to use for intermediate segmentation data; optional.
+        device (int | str): The specific device to run SAM models. Default: None.
+
+    Notes:
+        The input directory structure assumed for dataset:
+
+            - im_dir
+                ├─ 001.jpg
+                ├─ ...
+                └─ NNN.jpg
+            - labels
+                ├─ 001.txt
+                ├─ ...
+                └─ NNN.txt
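+
+    Examples:
+        A hedged usage sketch; "path/to/images" below is a placeholder directory, not a bundled dataset:
+
+        >>> from ultralytics.data.converter import yolo_bbox2segment
+        >>> yolo_bbox2segment("path/to/images")  # labels are written to path/to/labels-segment by default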
+    """
+    from ultralytics import SAM
+    from ultralytics.data import YOLODataset
+    from ultralytics.utils import LOGGER
+    from ultralytics.utils.ops import xywh2xyxy
+
+    # NOTE: add placeholder to pass class index check
+    dataset = YOLODataset(im_dir, data=dict(names=list(range(1000))))
+    if len(dataset.labels[0]["segments"]) > 0:  # if it's segment data
+        LOGGER.info("Segmentation labels detected, no need to generate new ones!")
+        return
+
+    LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
+    sam_model = SAM(sam_model)
+    for label in TQDM(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
+        h, w = label["shape"]
+        boxes = label["bboxes"]
+        if len(boxes) == 0:  # skip empty labels
+            continue
+        boxes[:, [0, 2]] *= w
+        boxes[:, [1, 3]] *= h
+        im = cv2.imread(label["im_file"])
+        sam_results = sam_model(im, bboxes=xywh2xyxy(boxes), verbose=False, save=False, device=device)
+        label["segments"] = sam_results[0].masks.xyn
+
+    save_dir = Path(save_dir) if save_dir else Path(im_dir).parent / "labels-segment"
+    save_dir.mkdir(parents=True, exist_ok=True)
+    for label in dataset.labels:
+        texts = []
+        lb_name = Path(label["im_file"]).with_suffix(".txt").name
+        txt_file = save_dir / lb_name
+        cls = label["cls"]
+        for i, s in enumerate(label["segments"]):
+            if len(s) == 0:
+                continue
+            line = (int(cls[i]), *s.reshape(-1))
+            texts.append(("%g " * len(line)).rstrip() % line)
+        with open(txt_file, "a") as f:
+            f.writelines(text + "\n" for text in texts)
+    LOGGER.info(f"Generated segment labels saved in {save_dir}")
+
+
+def create_synthetic_coco_dataset():
+    """
+    Creates a synthetic COCO dataset with random images based on filenames from label lists.
+
+    This function downloads COCO labels, reads image filenames from label list files,
+    creates synthetic images for train2017 and val2017 subsets, and organizes
+    them in the COCO dataset structure. It uses multithreading to generate images efficiently.
+
+    Examples:
+        >>> from ultralytics.data.converter import create_synthetic_coco_dataset
+        >>> create_synthetic_coco_dataset()
+
+    Notes:
+        - Requires internet connection to download label files.
+        - Generates random RGB images of varying sizes (480x480 to 640x640 pixels).
+        - Existing test2017 directory is removed as it's not needed.
+        - Reads image filenames from train2017.txt and val2017.txt files.
+    """
+
+    def create_synthetic_image(image_file):
+        """Generates synthetic images with random sizes and colors for dataset augmentation or testing purposes."""
+        if not image_file.exists():
+            size = (random.randint(480, 640), random.randint(480, 640))
+            Image.new(
+                "RGB",
+                size=size,
+                color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
+            ).save(image_file)
+
+    # Download labels
+    dir = DATASETS_DIR / "coco"
+    url = "https://github.com/ultralytics/assets/releases/download/v0.0.0/"
+    label_zip = "coco2017labels-segments.zip"
+    download([url + label_zip], dir=dir.parent)
+
+    # Create synthetic images
+    shutil.rmtree(dir / "labels" / "test2017", ignore_errors=True)  # Remove test2017 directory as not needed
+    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
+        for subset in ["train2017", "val2017"]:
+            subset_dir = dir / "images" / subset
+            subset_dir.mkdir(parents=True, exist_ok=True)
+
+            # Read image filenames from label list file
+            label_list_file = dir / f"{subset}.txt"
+            if label_list_file.exists():
+                with open(label_list_file) as f:
+                    image_files = [dir / line.strip() for line in f]
+
+                # Submit all tasks
+                futures = [executor.submit(create_synthetic_image, image_file) for image_file in image_files]
+                for _ in TQDM(as_completed(futures), total=len(futures), desc=f"Generating images for {subset}"):
+                    pass  # The actual work is done in the background
+            else:
+                print(f"Warning: Labels file {label_list_file} does not exist. Skipping image creation for {subset}.")
+
+    print("Synthetic COCO dataset created successfully.")

+ 521 - 0
ultralytics/data/dataset.py

@@ -0,0 +1,521 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import json
+from collections import defaultdict
+from itertools import repeat
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import ConcatDataset
+
+from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr
+from ultralytics.utils.ops import resample_segments
+from ultralytics.utils.torch_utils import TORCHVISION_0_18
+
+from .augment import (
+    Compose,
+    Format,
+    Instances,
+    LetterBox,
+    RandomLoadText,
+    classify_augmentations,
+    classify_transforms,
+    v8_transforms,
+)
+from .base import BaseDataset
+from .utils import (
+    HELP_URL,
+    LOGGER,
+    get_hash,
+    img2label_paths,
+    load_dataset_cache_file,
+    save_dataset_cache_file,
+    verify_image,
+    verify_image_label,
+)
+
+# Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
+DATASET_CACHE_VERSION = "1.0.3"
+
+
+class YOLODataset(BaseDataset):
+    """
+    Dataset class for loading object detection and/or segmentation labels in YOLO format.
+
+    Args:
+        data (dict, optional): A dataset YAML dictionary. Defaults to None.
+        task (str): An explicit arg to specify the current task ('detect', 'segment', 'pose' or 'obb'). Defaults to 'detect'.
+
+    Returns:
+        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
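+
+    Examples:
+        A hedged sketch; the image path and class names below are placeholders:
+
+        >>> dataset = YOLODataset("path/to/images", data={"names": {0: "person"}}, task="detect")
+        >>> print(len(dataset))  # number of images found under the placeholder path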
+    """
+
+    def __init__(self, *args, data=None, task="detect", **kwargs):
+        """Initializes the YOLODataset with optional configurations for segments and keypoints."""
+        self.use_segments = task == "segment"
+        self.use_keypoints = task == "pose"
+        self.use_obb = task == "obb"
+        self.data = data
+        assert not (self.use_segments and self.use_keypoints), "Can not use both segments and keypoints."
+        super().__init__(*args, **kwargs)
+
+    def cache_labels(self, path=Path("./labels.cache")):
+        """
+        Cache dataset labels, check images and read shapes.
+
+        Args:
+            path (Path): Path where to save the cache file. Default is Path("./labels.cache").
+
+        Returns:
+            (dict): labels.
+        """
+        x = {"labels": []}
+        nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
+        desc = f"{self.prefix}Scanning {path.parent / path.stem}..."
+        total = len(self.im_files)
+        nkpt, ndim = self.data.get("kpt_shape", (0, 0))
+        if self.use_keypoints and (nkpt <= 0 or ndim not in {2, 3}):
+            raise ValueError(
+                "'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
+                "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'"
+            )
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(
+                func=verify_image_label,
+                iterable=zip(
+                    self.im_files,
+                    self.label_files,
+                    repeat(self.prefix),
+                    repeat(self.use_keypoints),
+                    repeat(len(self.data["names"])),
+                    repeat(nkpt),
+                    repeat(ndim),
+                ),
+            )
+            pbar = TQDM(results, desc=desc, total=total)
+            for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
+                nm += nm_f
+                nf += nf_f
+                ne += ne_f
+                nc += nc_f
+                if im_file:
+                    x["labels"].append(
+                        {
+                            "im_file": im_file,
+                            "shape": shape,
+                            "cls": lb[:, 0:1],  # n, 1
+                            "bboxes": lb[:, 1:],  # n, 4
+                            "segments": segments,
+                            "keypoints": keypoint,
+                            "normalized": True,
+                            "bbox_format": "xywh",
+                        }
+                    )
+                if msg:
+                    msgs.append(msg)
+                pbar.desc = f"{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt"
+            pbar.close()
+
+        if msgs:
+            LOGGER.info("\n".join(msgs))
+        if nf == 0:
+            LOGGER.warning(f"{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}")
+        x["hash"] = get_hash(self.label_files + self.im_files)
+        x["results"] = nf, nm, ne, nc, len(self.im_files)
+        x["msgs"] = msgs  # warnings
+        save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
+        return x
+
+    def get_labels(self):
+        """Returns dictionary of labels for YOLO training."""
+        self.label_files = img2label_paths(self.im_files)
+        cache_path = Path(self.label_files[0]).parent.with_suffix(".cache")
+        try:
+            cache, exists = load_dataset_cache_file(cache_path), True  # attempt to load a *.cache file
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash(self.label_files + self.im_files)  # identical hash
+        except (FileNotFoundError, AssertionError, AttributeError):
+            cache, exists = self.cache_labels(cache_path), False  # run cache ops
+
+        # Display cache
+        nf, nm, ne, nc, n = cache.pop("results")  # found, missing, empty, corrupt, total
+        if exists and LOCAL_RANK in {-1, 0}:
+            d = f"Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt"
+            TQDM(None, desc=self.prefix + d, total=n, initial=n)  # display results
+            if cache["msgs"]:
+                LOGGER.info("\n".join(cache["msgs"]))  # display warnings
+
+        # Read cache
+        [cache.pop(k) for k in ("hash", "version", "msgs")]  # remove items
+        labels = cache["labels"]
+        if not labels:
+            LOGGER.warning(f"WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}")
+        self.im_files = [lb["im_file"] for lb in labels]  # update im_files
+
+        # Check if the dataset is all boxes or all segments
+        lengths = ((len(lb["cls"]), len(lb["bboxes"]), len(lb["segments"])) for lb in labels)
+        len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
+        if len_segments and len_boxes != len_segments:
+            LOGGER.warning(
+                f"WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, "
+                f"len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. "
+                "To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset."
+            )
+            for lb in labels:
+                lb["segments"] = []
+        if len_cls == 0:
+            LOGGER.warning(f"WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}")
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Builds and appends transforms to the list."""
+        if self.augment:
+            hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
+            hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
+            transforms = v8_transforms(self, self.imgsz, hyp)
+        else:
+            transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
+        transforms.append(
+            Format(
+                bbox_format="xywh",
+                normalize=True,
+                return_mask=self.use_segments,
+                return_keypoint=self.use_keypoints,
+                return_obb=self.use_obb,
+                batch_idx=True,
+                mask_ratio=hyp.mask_ratio,
+                mask_overlap=hyp.overlap_mask,
+                bgr=hyp.bgr if self.augment else 0.0,  # only affect training.
+            )
+        )
+        return transforms
+
+    def close_mosaic(self, hyp):
+        """Sets mosaic, copy_paste and mixup options to 0.0 and builds transformations."""
+        hyp.mosaic = 0.0  # set mosaic ratio=0.0
+        hyp.copy_paste = 0.0  # keep the same behavior as previous v8 close-mosaic
+        hyp.mixup = 0.0  # keep the same behavior as previous v8 close-mosaic
+        self.transforms = self.build_transforms(hyp)
+
+    def update_labels_info(self, label):
+        """
+        Customize your label format here.
+
+        Note:
+            cls is no longer stored with bboxes; classification and semantic segmentation need an independent cls label.
+            Classification and semantic segmentation can also be supported by adding or removing dict keys here.
+        """
+        bboxes = label.pop("bboxes")
+        segments = label.pop("segments", [])
+        keypoints = label.pop("keypoints", None)
+        bbox_format = label.pop("bbox_format")
+        normalized = label.pop("normalized")
+
+        # NOTE: do NOT resample oriented boxes
+        segment_resamples = 100 if self.use_obb else 1000
+        if len(segments) > 0:
+            # make sure segments interpolate correctly if original length is greater than segment_resamples
+            max_len = max(len(s) for s in segments)
+            segment_resamples = (max_len + 1) if segment_resamples < max_len else segment_resamples
+            # list[np.array(segment_resamples, 2)] * num_samples
+            segments = np.stack(resample_segments(segments, n=segment_resamples), axis=0)
+        else:
+            segments = np.zeros((0, segment_resamples, 2), dtype=np.float32)
+        label["instances"] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
+        return label
+
+    @staticmethod
+    def collate_fn(batch):
+        """Collates data samples into batches."""
+        new_batch = {}
+        keys = batch[0].keys()
+        values = list(zip(*[list(b.values()) for b in batch]))
+        for i, k in enumerate(keys):
+            value = values[i]
+            if k == "img":
+                value = torch.stack(value, 0)
+            if k in {"masks", "keypoints", "bboxes", "cls", "segments", "obb"}:
+                value = torch.cat(value, 0)
+            new_batch[k] = value
+        new_batch["batch_idx"] = list(new_batch["batch_idx"])
+        for i in range(len(new_batch["batch_idx"])):
+            new_batch["batch_idx"][i] += i  # add target image index for build_targets()
+        new_batch["batch_idx"] = torch.cat(new_batch["batch_idx"], 0)
+        return new_batch
+
+
+class YOLOMultiModalDataset(YOLODataset):
+    """
+    Dataset class for multi-modal training, loading YOLO-format labels together with class text descriptions.
+
+    Args:
+        data (dict, optional): A dataset YAML dictionary. Defaults to None.
+        task (str): An explicit arg to specify the current task ('detect', 'segment', 'pose' or 'obb'). Defaults to 'detect'.
+
+    Returns:
+        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
+    """
+
+    def __init__(self, *args, data=None, task="detect", **kwargs):
+        """Initializes a dataset object for object detection tasks with optional specifications."""
+        super().__init__(*args, data=data, task=task, **kwargs)
+
+    def update_labels_info(self, label):
+        """Add texts information for multi-modal model training."""
+        labels = super().update_labels_info(label)
+        # NOTE: some categories are concatenated with its synonyms by `/`.
+        labels["texts"] = [v.split("/") for _, v in self.data["names"].items()]
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Enhances data transformations with optional text augmentation for multi-modal training."""
+        transforms = super().build_transforms(hyp)
+        if self.augment:
+            # NOTE: hard-coded the args for now.
+            transforms.insert(-1, RandomLoadText(max_samples=min(self.data["nc"], 80), padding=True))
+        return transforms
+
+
+class GroundingDataset(YOLODataset):
+    """Handles object detection tasks by loading annotations from a specified JSON file, supporting YOLO format."""
+
+    def __init__(self, *args, task="detect", json_file, **kwargs):
+        """Initializes a GroundingDataset for object detection, loading annotations from a specified JSON file."""
+        assert task == "detect", "`GroundingDataset` only supports `detect` task for now!"
+        self.json_file = json_file
+        super().__init__(*args, task=task, data={}, **kwargs)
+
+    def get_img_files(self, img_path):
+        """The image files would be read in `get_labels` function, return empty list here."""
+        return []
+
+    def get_labels(self):
+        """Loads annotations from a JSON file, filters, and normalizes bounding boxes for each image."""
+        labels = []
+        LOGGER.info("Loading annotation file...")
+        with open(self.json_file) as f:
+            annotations = json.load(f)
+        images = {f"{x['id']:d}": x for x in annotations["images"]}
+        img_to_anns = defaultdict(list)
+        for ann in annotations["annotations"]:
+            img_to_anns[ann["image_id"]].append(ann)
+        for img_id, anns in TQDM(img_to_anns.items(), desc=f"Reading annotations {self.json_file}"):
+            img = images[f"{img_id:d}"]
+            h, w, f = img["height"], img["width"], img["file_name"]
+            im_file = Path(self.img_path) / f
+            if not im_file.exists():
+                continue
+            self.im_files.append(str(im_file))
+            bboxes = []
+            cat2id = {}
+            texts = []
+            for ann in anns:
+                if ann["iscrowd"]:
+                    continue
+                box = np.array(ann["bbox"], dtype=np.float32)
+                box[:2] += box[2:] / 2
+                box[[0, 2]] /= float(w)
+                box[[1, 3]] /= float(h)
+                if box[2] <= 0 or box[3] <= 0:
+                    continue
+
+                caption = img["caption"]
+                cat_name = " ".join([caption[t[0] : t[1]] for t in ann["tokens_positive"]])
+                if cat_name not in cat2id:
+                    cat2id[cat_name] = len(cat2id)
+                    texts.append([cat_name])
+                cls = cat2id[cat_name]  # class
+                box = [cls] + box.tolist()
+                if box not in bboxes:
+                    bboxes.append(box)
+            lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32)
+            labels.append(
+                {
+                    "im_file": im_file,
+                    "shape": (h, w),
+                    "cls": lb[:, 0:1],  # n, 1
+                    "bboxes": lb[:, 1:],  # n, 4
+                    "normalized": True,
+                    "bbox_format": "xywh",
+                    "texts": texts,
+                }
+            )
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Configures augmentations for training with optional text loading; `hyp` adjusts augmentation intensity."""
+        transforms = super().build_transforms(hyp)
+        if self.augment:
+            # NOTE: hard-coded the args for now.
+            transforms.insert(-1, RandomLoadText(max_samples=80, padding=True))
+        return transforms
+
+
+class YOLOConcatDataset(ConcatDataset):
+    """
+    Dataset as a concatenation of multiple datasets.
+
+    This class is useful to assemble different existing datasets.
+    """
+
+    @staticmethod
+    def collate_fn(batch):
+        """Collates data samples into batches."""
+        return YOLODataset.collate_fn(batch)
+
+
+# TODO: support semantic segmentation
+class SemanticDataset(BaseDataset):
+    """
+    Semantic Segmentation Dataset.
+
+    This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
+    from the BaseDataset class.
+
+    Note:
+        This class is currently a placeholder and needs to be populated with methods and attributes for supporting
+        semantic segmentation tasks.
+    """
+
+    def __init__(self):
+        """Initialize a SemanticDataset object."""
+        super().__init__()
+
+
+class ClassificationDataset:
+    """
+    Wraps torchvision ImageFolder to support YOLO classification tasks, offering functionalities like image
+    augmentation, caching, and verification. It's designed to efficiently handle large datasets for training deep
+    learning models, with optional image transformations and caching mechanisms to speed up training.
+
+    This class allows for augmentations using both torchvision and Albumentations libraries, and supports caching images
+    in RAM or on disk to reduce IO overhead during training. Additionally, it implements a robust verification process
+    to ensure data integrity and consistency.
+
+    Attributes:
+        cache_ram (bool): Indicates if caching in RAM is enabled.
+        cache_disk (bool): Indicates if caching on disk is enabled.
+        samples (list): A list of tuples, each containing the path to an image, its class index, path to its .npy cache
+                        file (if caching on disk), and optionally the loaded image array (if caching in RAM).
+        torch_transforms (callable): PyTorch transforms to be applied to the images.
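+
+    Examples:
+        A hedged sketch using a minimal stand-in for the args namespace; a real run passes the full training args:
+
+        >>> from types import SimpleNamespace
+        >>> args = SimpleNamespace(cache=False, scale=0.5, imgsz=224, crop_fraction=1.0)  # assumed minimal fields
+        >>> dataset = ClassificationDataset(root="path/to/train", args=args, augment=False)
+        >>> sample = dataset[0]  # {"img": transformed image tensor, "cls": class index}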
+    """
+
+    def __init__(self, root, args, augment=False, prefix=""):
+        """
+        Initialize the ClassificationDataset with root directory, arguments, augmentations, and cache settings.
+
+        Args:
+            root (str): Path to the dataset directory where images are stored in a class-specific folder structure.
+            args (Namespace): Configuration containing dataset-related settings such as image size, augmentation
+                parameters, and cache settings. It includes attributes like `imgsz` (image size), `fraction` (fraction
+                of data to use), `scale`, `fliplr`, `flipud`, `cache` (disk or RAM caching for faster training),
+                `auto_augment`, `hsv_h`, `hsv_s`, `hsv_v`, and `crop_fraction`.
+            augment (bool, optional): Whether to apply augmentations to the dataset. Default is False.
+            prefix (str, optional): Prefix for logging and cache filenames, aiding in dataset identification and
+                debugging. Default is an empty string.
+        """
+        import torchvision  # scope for faster 'import ultralytics'
+
+        # Base class assigned as attribute rather than used as base class to allow for scoping slow torchvision import
+        if TORCHVISION_0_18:  # 'allow_empty' argument first introduced in torchvision 0.18
+            self.base = torchvision.datasets.ImageFolder(root=root, allow_empty=True)
+        else:
+            self.base = torchvision.datasets.ImageFolder(root=root)
+        self.samples = self.base.samples
+        self.root = self.base.root
+
+        # Initialize attributes
+        if augment and args.fraction < 1.0:  # reduce training fraction
+            self.samples = self.samples[: round(len(self.samples) * args.fraction)]
+        self.prefix = colorstr(f"{prefix}: ") if prefix else ""
+        self.cache_ram = args.cache is True or str(args.cache).lower() == "ram"  # cache images into RAM
+        if self.cache_ram:
+            LOGGER.warning(
+                "WARNING ⚠️ Classification `cache_ram` training has known memory leak in "
+                "https://github.com/ultralytics/ultralytics/issues/9824, setting `cache_ram=False`."
+            )
+            self.cache_ram = False
+        self.cache_disk = str(args.cache).lower() == "disk"  # cache images on hard drive as uncompressed *.npy files
+        self.samples = self.verify_images()  # filter out bad images
+        self.samples = [list(x) + [Path(x[0]).with_suffix(".npy"), None] for x in self.samples]  # file, index, npy, im
+        scale = (1.0 - args.scale, 1.0)  # (0.08, 1.0)
+        self.torch_transforms = (
+            classify_augmentations(
+                size=args.imgsz,
+                scale=scale,
+                hflip=args.fliplr,
+                vflip=args.flipud,
+                erasing=args.erasing,
+                auto_augment=args.auto_augment,
+                hsv_h=args.hsv_h,
+                hsv_s=args.hsv_s,
+                hsv_v=args.hsv_v,
+            )
+            if augment
+            else classify_transforms(size=args.imgsz, crop_fraction=args.crop_fraction)
+        )
+
+    def __getitem__(self, i):
+        """Returns subset of data and targets corresponding to given indices."""
+        f, j, fn, im = self.samples[i]  # filename, index, filename.with_suffix('.npy'), image
+        if self.cache_ram:
+            if im is None:  # Warning: two separate if statements required here, do not combine this with previous line
+                im = self.samples[i][3] = cv2.imread(f)
+        elif self.cache_disk:
+            if not fn.exists():  # load npy
+                np.save(fn.as_posix(), cv2.imread(f), allow_pickle=False)
+            im = np.load(fn)
+        else:  # read image
+            im = cv2.imread(f)  # BGR
+        # Convert NumPy array to PIL image
+        im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
+        sample = self.torch_transforms(im)
+        return {"img": sample, "cls": j}
+
+    def __len__(self) -> int:
+        """Return the total number of samples in the dataset."""
+        return len(self.samples)
+
+    def verify_images(self):
+        """Verify all images in dataset."""
+        desc = f"{self.prefix}Scanning {self.root}..."
+        path = Path(self.root).with_suffix(".cache")  # *.cache file path
+
+        try:
+            cache = load_dataset_cache_file(path)  # attempt to load a *.cache file
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash([x[0] for x in self.samples])  # identical hash
+            nf, nc, n, samples = cache.pop("results")  # found, corrupt, total (n), samples
+            if LOCAL_RANK in {-1, 0}:
+                d = f"{desc} {nf} images, {nc} corrupt"
+                TQDM(None, desc=d, total=n, initial=n)
+                if cache["msgs"]:
+                    LOGGER.info("\n".join(cache["msgs"]))  # display warnings
+            return samples
+
+        except (FileNotFoundError, AssertionError, AttributeError):
+            # Run scan if *.cache retrieval failed
+            nf, nc, msgs, samples, x = 0, 0, [], [], {}
+            with ThreadPool(NUM_THREADS) as pool:
+                results = pool.imap(func=verify_image, iterable=zip(self.samples, repeat(self.prefix)))
+                pbar = TQDM(results, desc=desc, total=len(self.samples))
+                for sample, nf_f, nc_f, msg in pbar:
+                    if nf_f:
+                        samples.append(sample)
+                    if msg:
+                        msgs.append(msg)
+                    nf += nf_f
+                    nc += nc_f
+                    pbar.desc = f"{desc} {nf} images, {nc} corrupt"
+                pbar.close()
+            if msgs:
+                LOGGER.info("\n".join(msgs))
+            x["hash"] = get_hash([x[0] for x in self.samples])
+            x["results"] = nf, nc, len(samples), samples
+            x["msgs"] = msgs  # warnings
+            save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
+            return samples

+ 658 - 0
ultralytics/data/loaders.py

@@ -0,0 +1,658 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import glob
+import math
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Thread
+from urllib.parse import urlparse
+
+import cv2
+import numpy as np
+import requests
+import torch
+from PIL import Image
+
+from ultralytics.data.utils import FORMATS_HELP_MSG, IMG_FORMATS, VID_FORMATS
+from ultralytics.utils import IS_COLAB, IS_KAGGLE, LOGGER, ops
+from ultralytics.utils.checks import check_requirements
+from ultralytics.utils.patches import imread
+
+
+@dataclass
+class SourceTypes:
+    """
+    Class to represent various types of input sources for predictions.
+
+    This class uses dataclass to define boolean flags for different types of input sources that can be used for
+    making predictions with YOLO models.
+
+    Attributes:
+        stream (bool): Flag indicating if the input source is a video stream.
+        screenshot (bool): Flag indicating if the input source is a screenshot.
+        from_img (bool): Flag indicating if the input source is an image file.
+        tensor (bool): Flag indicating if the input source is a torch.Tensor.
+
+    Examples:
+        >>> source_types = SourceTypes(stream=True, screenshot=False, from_img=False)
+        >>> print(source_types.stream)
+        True
+        >>> print(source_types.from_img)
+        False
+    """
+
+    stream: bool = False
+    screenshot: bool = False
+    from_img: bool = False
+    tensor: bool = False
+
+
+class LoadStreams:
+    """
+    Stream Loader for various types of video streams.
+
+    Supports RTSP, RTMP, HTTP, and TCP streams. This class handles the loading and processing of multiple video
+    streams simultaneously, making it suitable for real-time video analysis tasks.
+
+    Attributes:
+        sources (List[str]): The source input paths or URLs for the video streams.
+        vid_stride (int): Video frame-rate stride.
+        buffer (bool): Whether to buffer input streams.
+        running (bool): Flag to indicate if the streaming thread is running.
+        mode (str): Set to 'stream' indicating real-time capture.
+        imgs (List[List[np.ndarray]]): List of image frames for each stream.
+        fps (List[float]): List of FPS for each stream.
+        frames (List[int]): List of total frames for each stream.
+        threads (List[Thread]): List of threads for each stream.
+        shape (List[Tuple[int, int, int]]): List of shapes for each stream.
+        caps (List[cv2.VideoCapture]): List of cv2.VideoCapture objects for each stream.
+        bs (int): Batch size for processing.
+
+    Methods:
+        update: Read stream frames in daemon thread.
+        close: Close stream loader and release resources.
+        __iter__: Returns an iterator object for the class.
+        __next__: Returns the next batch of source paths, images, and metadata strings for processing.
+        __len__: Return the length of the sources object.
+
+    Examples:
+        >>> stream_loader = LoadStreams("rtsp://example.com/stream1.mp4")
+        >>> for sources, imgs, _ in stream_loader:
+        ...     # Process the images
+        ...     pass
+        >>> stream_loader.close()
+
+    Notes:
+        - The class uses threading to efficiently load frames from multiple streams simultaneously.
+        - It automatically handles YouTube links, converting them to the best available stream URL.
+        - The class implements a buffer system to manage frame storage and retrieval.
+    """
+
+    def __init__(self, sources="file.streams", vid_stride=1, buffer=False):
+        """Initialize stream loader for multiple video sources, supporting various stream types."""
+        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
+        self.buffer = buffer  # buffer input streams
+        self.running = True  # running flag for Thread
+        self.mode = "stream"
+        self.vid_stride = vid_stride  # video frame-rate stride
+
+        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
+        n = len(sources)
+        self.bs = n
+        self.fps = [0] * n  # frames per second
+        self.frames = [0] * n
+        self.threads = [None] * n
+        self.caps = [None] * n  # video capture objects
+        self.imgs = [[] for _ in range(n)]  # images
+        self.shape = [[] for _ in range(n)]  # image shapes
+        self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
+        for i, s in enumerate(sources):  # index, source
+            # Start thread to read frames from video stream
+            st = f"{i + 1}/{n}: {s}... "
+            if urlparse(s).hostname in {"www.youtube.com", "youtube.com", "youtu.be"}:  # if source is YouTube video
+                # YouTube format i.e. 'https://www.youtube.com/watch?v=Jsn8D3aC840' or 'https://youtu.be/Jsn8D3aC840'
+                s = get_best_youtube_url(s)
+            s = eval(s) if s.isnumeric() else s  # i.e. s = '0' local webcam
+            if s == 0 and (IS_COLAB or IS_KAGGLE):
+                raise NotImplementedError(
+                    "'source=0' webcam not supported in Colab and Kaggle notebooks. "
+                    "Try running 'source=0' in a local environment."
+                )
+            self.caps[i] = cv2.VideoCapture(s)  # store video capture object
+            if not self.caps[i].isOpened():
+                raise ConnectionError(f"{st}Failed to open {s}")
+            w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
+            h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = self.caps[i].get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
+            self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
+                "inf"
+            )  # infinite stream fallback
+            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback
+
+            success, im = self.caps[i].read()  # guarantee first frame
+            if not success or im is None:
+                raise ConnectionError(f"{st}Failed to read images from {s}")
+            self.imgs[i].append(im)
+            self.shape[i] = im.shape
+            self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
+            LOGGER.info(f"{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)")
+            self.threads[i].start()
+        LOGGER.info("")  # newline
+
+    def update(self, i, cap, stream):
+        """Read stream frames in daemon thread and update image buffer."""
+        n, f = 0, self.frames[i]  # frame number, frame array
+        while self.running and cap.isOpened() and n < (f - 1):
+            if len(self.imgs[i]) < 30:  # keep a <=30-image buffer
+                n += 1
+                cap.grab()  # .read() = .grab() followed by .retrieve()
+                if n % self.vid_stride == 0:
+                    success, im = cap.retrieve()
+                    if not success:
+                        im = np.zeros(self.shape[i], dtype=np.uint8)
+                        LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
+                        cap.open(stream)  # re-open stream if signal was lost
+                    if self.buffer:
+                        self.imgs[i].append(im)
+                    else:
+                        self.imgs[i] = [im]
+            else:
+                time.sleep(0.01)  # wait until there is free space in the buffer
+
+    def close(self):
+        """Terminates stream loader, stops threads, and releases video capture resources."""
+        self.running = False  # stop flag for Thread
+        for thread in self.threads:
+            if thread.is_alive():
+                thread.join(timeout=5)  # Add timeout
+        for cap in self.caps:  # Iterate through the stored VideoCapture objects
+            try:
+                cap.release()  # release video capture
+            except Exception as e:
+                LOGGER.warning(f"WARNING ⚠️ Could not release VideoCapture object: {e}")
+        cv2.destroyAllWindows()
+
+    def __iter__(self):
+        """Iterates through YOLO image feed and re-opens unresponsive streams."""
+        self.count = -1
+        return self
+
+    def __next__(self):
+        """Returns the next batch of frames from multiple video streams for processing."""
+        self.count += 1
+
+        images = []
+        for i, x in enumerate(self.imgs):
+            # Wait until a frame is available in each buffer
+            while not x:
+                if not self.threads[i].is_alive() or cv2.waitKey(1) == ord("q"):  # q to quit
+                    self.close()
+                    raise StopIteration
+                time.sleep(1 / min(self.fps))
+                x = self.imgs[i]
+                if not x:
+                    LOGGER.warning(f"WARNING ⚠️ Waiting for stream {i}")
+
+            # Get and remove the first frame from imgs buffer
+            if self.buffer:
+                images.append(x.pop(0))
+
+            # Get the last frame, and clear the rest from the imgs buffer
+            else:
+                images.append(x.pop(-1) if x else np.zeros(self.shape[i], dtype=np.uint8))
+                x.clear()
+
+        return self.sources, images, [""] * self.bs
+
+    def __len__(self):
+        """Return the number of video streams in the LoadStreams object."""
+        return self.bs  # 1E12 frames = 32 streams at 30 FPS for 30 years
+
+
+class LoadScreenshots:
+    """
+    Ultralytics screenshot dataloader for capturing and processing screen images.
+
+    This class manages the loading of screenshot images for processing with YOLO. It is suitable for use with
+    `yolo predict source=screen`.
+
+    Attributes:
+        source (str): The source input indicating which screen to capture.
+        screen (int): The screen number to capture.
+        left (int): The left coordinate for screen capture area.
+        top (int): The top coordinate for screen capture area.
+        width (int): The width of the screen capture area.
+        height (int): The height of the screen capture area.
+        mode (str): Set to 'stream' indicating real-time capture.
+        frame (int): Counter for captured frames.
+        sct (mss.mss): Screen capture object from `mss` library.
+        bs (int): Batch size, set to 1.
+        fps (int): Frames per second, set to 30.
+        monitor (Dict[str, int]): Monitor configuration details.
+
+    Methods:
+        __iter__: Returns an iterator object.
+        __next__: Captures the next screenshot and returns it.
+
+    Examples:
+        >>> loader = LoadScreenshots("0 100 100 640 480")  # screen 0, top-left (100,100), 640x480
+        >>> for source, im, s in loader:
+        ...     print(f"Captured frame: {im[0].shape}")
+    """
+
+    def __init__(self, source):
+        """Initialize screenshot capture with specified screen and region parameters."""
+        check_requirements("mss")
+        import mss  # noqa
+
+        source, *params = source.split()
+        self.screen, left, top, width, height = 0, None, None, None, None  # default to full screen 0
+        if len(params) == 1:
+            self.screen = int(params[0])
+        elif len(params) == 4:
+            left, top, width, height = (int(x) for x in params)
+        elif len(params) == 5:
+            self.screen, left, top, width, height = (int(x) for x in params)
+        self.mode = "stream"
+        self.frame = 0
+        self.sct = mss.mss()
+        self.bs = 1
+        self.fps = 30
+
+        # Parse monitor shape
+        monitor = self.sct.monitors[self.screen]
+        self.top = monitor["top"] if top is None else (monitor["top"] + top)
+        self.left = monitor["left"] if left is None else (monitor["left"] + left)
+        self.width = width or monitor["width"]
+        self.height = height or monitor["height"]
+        self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
+
+    def __iter__(self):
+        """Yields the next screenshot image from the specified screen or region for processing."""
+        return self
+
+    def __next__(self):
+        """Captures and returns the next screenshot as a numpy array using the mss library."""
+        im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3]  # BGRA to BGR
+        s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "
+
+        self.frame += 1
+        return [str(self.screen)], [im0], [s]  # screen, img, string
+
+
+class LoadImagesAndVideos:
+    """
+    A class for loading and processing images and videos for YOLO object detection.
+
+    This class manages the loading and pre-processing of image and video data from various sources, including
+    single image files, video files, and lists of image and video paths.
+
+    Attributes:
+        files (List[str]): List of image and video file paths.
+        nf (int): Total number of files (images and videos).
+        video_flag (List[bool]): Flags indicating whether a file is a video (True) or an image (False).
+        mode (str): Current mode, 'image' or 'video'.
+        vid_stride (int): Stride for video frame-rate.
+        bs (int): Batch size.
+        cap (cv2.VideoCapture): Video capture object for OpenCV.
+        frame (int): Frame counter for video.
+        frames (int): Total number of frames in the video.
+        count (int): Counter for iteration, initialized at 0 during __iter__().
+        ni (int): Number of images.
+
+    Methods:
+        __init__: Initialize the LoadImagesAndVideos object.
+        __iter__: Returns an iterator object for VideoStream or ImageFolder.
+        __next__: Returns the next batch of images or video frames along with their paths and metadata.
+        _new_video: Creates a new video capture object for the given path.
+        __len__: Returns the number of batches in the object.
+
+    Examples:
+        >>> loader = LoadImagesAndVideos("path/to/data", batch=32, vid_stride=1)
+        >>> for paths, imgs, info in loader:
+        ...     # Process batch of images or video frames
+        ...     pass
+
+    Notes:
+        - Supports various image formats including HEIC.
+        - Handles both local files and directories.
+        - Can read from a text file containing paths to images and videos.
+    """
+
+    def __init__(self, path, batch=1, vid_stride=1):
+        """Initialize dataloader for images and videos, supporting various input formats."""
+        parent = None
+        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
+            parent = Path(path).parent
+            path = Path(path).read_text().splitlines()  # list of sources
+        files = []
+        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
+            a = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
+            if "*" in a:
+                files.extend(sorted(glob.glob(a, recursive=True)))  # glob
+            elif os.path.isdir(a):
+                files.extend(sorted(glob.glob(os.path.join(a, "*.*"))))  # dir
+            elif os.path.isfile(a):
+                files.append(a)  # files (absolute or relative to CWD)
+            elif parent and (parent / p).is_file():
+                files.append(str((parent / p).absolute()))  # files (relative to *.txt file parent)
+            else:
+                raise FileNotFoundError(f"{p} does not exist")
+
+        # Define files as images or videos
+        images, videos = [], []
+        for f in files:
+            suffix = f.split(".")[-1].lower()  # Get file extension without the dot and lowercase
+            if suffix in IMG_FORMATS:
+                images.append(f)
+            elif suffix in VID_FORMATS:
+                videos.append(f)
+        ni, nv = len(images), len(videos)
+
+        self.files = images + videos
+        self.nf = ni + nv  # number of files
+        self.ni = ni  # number of images
+        self.video_flag = [False] * ni + [True] * nv
+        self.mode = "video" if ni == 0 else "image"  # default to video if no images
+        self.vid_stride = vid_stride  # video frame-rate stride
+        self.bs = batch
+        if any(videos):
+            self._new_video(videos[0])  # new video
+        else:
+            self.cap = None
+        if self.nf == 0:
+            raise FileNotFoundError(f"No images or videos found in {p}. {FORMATS_HELP_MSG}")
+
+    def __iter__(self):
+        """Iterates through image/video files, yielding source paths, images, and metadata."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Returns the next batch of images or video frames with their paths and metadata."""
+        paths, imgs, info = [], [], []
+        while len(imgs) < self.bs:
+            if self.count >= self.nf:  # end of file list
+                if imgs:
+                    return paths, imgs, info  # return last partial batch
+                else:
+                    raise StopIteration
+
+            path = self.files[self.count]
+            if self.video_flag[self.count]:
+                self.mode = "video"
+                if not self.cap or not self.cap.isOpened():
+                    self._new_video(path)
+
+                success = False
+                for _ in range(self.vid_stride):
+                    success = self.cap.grab()
+                    if not success:
+                        break  # end of video or failure
+
+                if success:
+                    success, im0 = self.cap.retrieve()
+                    if success:
+                        self.frame += 1
+                        paths.append(path)
+                        imgs.append(im0)
+                        info.append(f"video {self.count + 1}/{self.nf} (frame {self.frame}/{self.frames}) {path}: ")
+                        if self.frame == self.frames:  # end of video
+                            self.count += 1
+                            self.cap.release()
+                else:
+                    # Move to the next file if the current video ended or failed to open
+                    self.count += 1
+                    if self.cap:
+                        self.cap.release()
+                    if self.count < self.nf:
+                        self._new_video(self.files[self.count])
+            else:
+                # Handle image files (including HEIC)
+                self.mode = "image"
+                if path.split(".")[-1].lower() == "heic":
+                    # Load HEIC image using Pillow with pillow-heif
+                    check_requirements("pillow-heif")
+
+                    from pillow_heif import register_heif_opener
+
+                    register_heif_opener()  # Register HEIF opener with Pillow
+                    with Image.open(path) as img:
+                        im0 = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)  # convert image to BGR nparray
+                else:
+                    im0 = imread(path)  # BGR
+                if im0 is None:
+                    LOGGER.warning(f"WARNING ⚠️ Image Read Error {path}")
+                else:
+                    paths.append(path)
+                    imgs.append(im0)
+                    info.append(f"image {self.count + 1}/{self.nf} {path}: ")
+                self.count += 1  # move to the next file
+                if self.count >= self.ni:  # end of image list
+                    break
+
+        return paths, imgs, info
+
+    def _new_video(self, path):
+        """Creates a new video capture object for the given path and initializes video-related attributes."""
+        self.frame = 0
+        self.cap = cv2.VideoCapture(path)
+        self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
+        if not self.cap.isOpened():
+            raise FileNotFoundError(f"Failed to open video {path}")
+        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
+
+    def __len__(self):
+        """Returns the number of files (images and videos) in the dataset."""
+        return math.ceil(self.nf / self.bs)  # number of batches
+
+
+class LoadPilAndNumpy:
+    """
+    Load images from PIL and Numpy arrays for batch processing.
+
+    This class manages loading and pre-processing of image data from both PIL and Numpy formats. It performs basic
+    validation and format conversion to ensure that the images are in the required format for downstream processing.
+
+    Attributes:
+        paths (List[str]): List of image paths or autogenerated filenames.
+        im0 (List[np.ndarray]): List of images stored as Numpy arrays.
+        mode (str): Type of data being processed, set to 'image'.
+        bs (int): Batch size, equivalent to the length of `im0`.
+
+    Methods:
+        _single_check: Validate and format a single image to a Numpy array.
+
+    Examples:
+        >>> from PIL import Image
+        >>> import numpy as np
+        >>> pil_img = Image.new("RGB", (100, 100))
+        >>> np_img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+        >>> loader = LoadPilAndNumpy([pil_img, np_img])
+        >>> paths, images, _ = next(iter(loader))
+        >>> print(f"Loaded {len(images)} images")
+        Loaded 2 images
+    """
+
+    def __init__(self, im0):
+        """Initializes a loader for PIL and Numpy images, converting inputs to a standardized format."""
+        if not isinstance(im0, list):
+            im0 = [im0]
+        # use `image{i}.jpg` when Image.filename returns an empty path.
+        self.paths = [getattr(im, "filename", "") or f"image{i}.jpg" for i, im in enumerate(im0)]
+        self.im0 = [self._single_check(im) for im in im0]
+        self.mode = "image"
+        self.bs = len(self.im0)
+
+    @staticmethod
+    def _single_check(im):
+        """Validate and format an image to numpy array, ensuring RGB order and contiguous memory."""
+        assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
+        if isinstance(im, Image.Image):
+            if im.mode != "RGB":
+                im = im.convert("RGB")
+            im = np.asarray(im)[:, :, ::-1]
+            im = np.ascontiguousarray(im)  # contiguous
+        return im
+
+    def __len__(self):
+        """Returns the length of the 'im0' attribute, representing the number of loaded images."""
+        return len(self.im0)
+
+    def __next__(self):
+        """Returns the next batch of images, paths, and metadata for processing."""
+        if self.count == 1:  # loop only once as it's batch inference
+            raise StopIteration
+        self.count += 1
+        return self.paths, self.im0, [""] * self.bs
+
+    def __iter__(self):
+        """Iterates through PIL/numpy images, yielding paths, raw images, and metadata for processing."""
+        self.count = 0
+        return self
+
+
+class LoadTensor:
+    """
+    A class for loading and processing tensor data for object detection tasks.
+
+    This class handles the loading and pre-processing of image data from PyTorch tensors, preparing them for
+    further processing in object detection pipelines.
+
+    Attributes:
+        im0 (torch.Tensor): The input tensor containing the image(s) with shape (B, C, H, W).
+        bs (int): Batch size, inferred from the shape of `im0`.
+        mode (str): Current processing mode, set to 'image'.
+        paths (List[str]): List of image paths or auto-generated filenames.
+
+    Methods:
+        _single_check: Validates and formats an input tensor.
+
+    Examples:
+        >>> import torch
+        >>> tensor = torch.rand(1, 3, 640, 640)
+        >>> loader = LoadTensor(tensor)
+        >>> paths, images, info = next(iter(loader))
+        >>> print(f"Processed {len(images)} images")
+    """
+
+    def __init__(self, im0) -> None:
+        """Initialize LoadTensor object for processing torch.Tensor image data."""
+        self.im0 = self._single_check(im0)
+        self.bs = self.im0.shape[0]
+        self.mode = "image"
+        self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
+
+    @staticmethod
+    def _single_check(im, stride=32):
+        """Validates and formats a single image tensor, ensuring correct shape and normalization."""
+        s = (
+            f"WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) "
+            f"divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible."
+        )
+        if len(im.shape) != 4:
+            if len(im.shape) != 3:
+                raise ValueError(s)
+            LOGGER.warning(s)
+            im = im.unsqueeze(0)
+        if im.shape[2] % stride or im.shape[3] % stride:
+            raise ValueError(s)
+        if im.max() > 1.0 + torch.finfo(im.dtype).eps:  # torch.float32 eps is 1.2e-07
+            LOGGER.warning(
+                f"WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. "
+                f"Dividing input by 255."
+            )
+            im = im.float() / 255.0
+
+        return im
+
+    def __iter__(self):
+        """Yields an iterator object for iterating through tensor image data."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Yields the next batch of tensor images and metadata for processing."""
+        if self.count == 1:
+            raise StopIteration
+        self.count += 1
+        return self.paths, self.im0, [""] * self.bs
+
+    def __len__(self):
+        """Returns the batch size of the tensor input."""
+        return self.bs
+
+
+def autocast_list(source):
+    """Merges a list of sources into a list of numpy arrays or PIL images for Ultralytics prediction."""
+    files = []
+    for im in source:
+        if isinstance(im, (str, Path)):  # filename or uri
+            files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im))
+        elif isinstance(im, (Image.Image, np.ndarray)):  # PIL or np Image
+            files.append(im)
+        else:
+            raise TypeError(
+                f"type {type(im).__name__} is not a supported Ultralytics prediction source type. \n"
+                f"See https://docs.ultralytics.com/modes/predict for supported source types."
+            )
+
+    return files
+
+
+def get_best_youtube_url(url, method="pytube"):
+    """
+    Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
+
+    Args:
+        url (str): The URL of the YouTube video.
+        method (str): The method to use for extracting video info. Options are "pytube", "pafy", and "yt-dlp".
+            Defaults to "pytube".
+
+    Returns:
+        (str | None): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
+
+    Examples:
+        >>> url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+        >>> best_url = get_best_youtube_url(url)
+        >>> print(best_url)
+        https://rr4---sn-q4flrnek.googlevideo.com/videoplayback?expire=...
+
+    Notes:
+        - Requires additional libraries based on the chosen method: pytubefix, pafy, or yt-dlp.
+        - The function prioritizes streams with at least 1080p resolution when available.
+        - For the "yt-dlp" method, it looks for formats with video codec, no audio, and *.mp4 extension.
+    """
+    if method == "pytube":
+        # Switched from pytube to pytubefix to resolve https://github.com/pytube/pytube/issues/1954
+        check_requirements("pytubefix>=6.5.2")
+        from pytubefix import YouTube
+
+        streams = YouTube(url).streams.filter(file_extension="mp4", only_video=True)
+        streams = sorted(streams, key=lambda s: s.resolution or "0p", reverse=True)  # sort by resolution, None-safe
+        for stream in streams:
+            if stream.resolution and int(stream.resolution[:-1]) >= 1080:  # check if resolution is at least 1080p
+                return stream.url
+
+    elif method == "pafy":
+        check_requirements(("pafy", "youtube_dl==2020.12.2"))
+        import pafy  # noqa
+
+        return pafy.new(url).getbestvideo(preftype="mp4").url
+
+    elif method == "yt-dlp":
+        check_requirements("yt-dlp")
+        import yt_dlp
+
+        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
+            info_dict = ydl.extract_info(url, download=False)  # extract info
+        for f in reversed(info_dict.get("formats", [])):  # reversed because best is usually last
+            # Find a format with a video codec, no audio, *.mp4 extension, and at least 1920 px width or 1080 px height
+            good_size = (f.get("width") or 0) >= 1920 or (f.get("height") or 0) >= 1080
+            if good_size and f["vcodec"] != "none" and f["acodec"] == "none" and f["ext"] == "mp4":
+                return f.get("url")
+
+
+# Define constants
+LOADERS = (LoadStreams, LoadPilAndNumpy, LoadImagesAndVideos, LoadScreenshots)

+ 18 - 0
ultralytics/data/scripts/download_weights.sh

@@ -0,0 +1,18 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download latest models from https://github.com/ultralytics/assets/releases
+# Example usage: bash ultralytics/data/scripts/download_weights.sh
+# parent
+# └── weights
+#     ├── yolov8n.pt  ← downloads here
+#     ├── yolov8s.pt
+#     └── ...
+
+python - <<EOF
+from ultralytics.utils.downloads import attempt_download_asset
+
+assets = [f"yolov8{size}{suffix}.pt" for size in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose")]
+for x in assets:
+    attempt_download_asset(f"weights/{x}")
+
+EOF

+ 60 - 0
ultralytics/data/scripts/get_coco.sh

@@ -0,0 +1,60 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO 2017 dataset https://cocodataset.org
+# Example usage: bash data/scripts/get_coco.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco  ← downloads here
+
+# Arguments (optional) Usage: bash data/scripts/get_coco.sh --train --val --test --segments
+if [ "$#" -gt 0 ]; then
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    --test) test=true ;;
+    --segments) segments=true ;;
+    --sama) sama=true ;;
+    esac
+  done
+else
+  train=true
+  val=true
+  test=false
+  segments=false
+  sama=false
+fi
+
+# Download/unzip labels
+d='../datasets' # unzip directory
+url=https://github.com/ultralytics/assets/releases/download/v0.0.0/
+if [ "$segments" == "true" ]; then
+  f='coco2017labels-segments.zip' # 169 MB
+elif [ "$sama" == "true" ]; then
+  f='coco2017labels-segments-sama.zip' # 199 MB https://www.sama.com/sama-coco-dataset/
+else
+  f='coco2017labels.zip' # 46 MB
+fi
+echo 'Downloading' $url$f ' ...'
+curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+
+# Download/unzip images
+d='../datasets/coco/images' # unzip directory
+url=http://images.cocodataset.org/zips/
+if [ "$train" == "true" ]; then
+  f='train2017.zip' # 19G, 118k images
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+if [ "$val" == "true" ]; then
+  f='val2017.zip' # 1G, 5k images
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+if [ "$test" == "true" ]; then
+  f='test2017.zip' # 7G, 41k images (optional)
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+wait # finish background tasks

+ 17 - 0
ultralytics/data/scripts/get_coco128.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017)
+# Example usage: bash data/scripts/get_coco128.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco128  ← downloads here
+
+# Download/unzip images and labels
+d='../datasets' # unzip directory
+url=https://github.com/ultralytics/assets/releases/download/v0.0.0/
+f='coco128.zip' # or 'coco128-segments.zip', 68 MB
+echo 'Downloading' $url$f ' ...'
+curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+
+wait # finish background tasks

+ 51 - 0
ultralytics/data/scripts/get_imagenet.sh

@@ -0,0 +1,51 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download ILSVRC2012 ImageNet dataset https://image-net.org
+# Example usage: bash data/scripts/get_imagenet.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── imagenet  ← downloads here
+
+# Arguments (optional) Usage: bash data/scripts/get_imagenet.sh --train --val
+if [ "$#" -gt 0 ]; then
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    esac
+  done
+else
+  train=true
+  val=true
+fi
+
+# Make dir
+d='../datasets/imagenet' # unzip directory
+mkdir -p $d && cd $d
+
+# Download/unzip train
+if [ "$train" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar # download 138G, 1281167 images
+  mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+  tar -xf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+  find . -name "*.tar" | while read NAME; do
+    mkdir -p "${NAME%.tar}"
+    tar -xf "${NAME}" -C "${NAME%.tar}"
+    rm -f "${NAME}"
+  done
+  cd ..
+fi
+
+# Download/unzip val
+if [ "$val" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar # download 6.3G, 50000 images
+  mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xf ILSVRC2012_img_val.tar
+  wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash # move into subdirs
+fi
+
+# Delete corrupted image (optional: PNG under JPEG name that may cause dataloaders to fail)
+# rm train/n04266014/n04266014_10835.JPEG
+
+# TFRecords (optional)
+# wget https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt

+ 298 - 0
ultralytics/data/split_dota.py

@@ -0,0 +1,298 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import itertools
+from glob import glob
+from math import ceil
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+from ultralytics.data.utils import exif_size, img2label_paths
+from ultralytics.utils.checks import check_requirements
+
+
+def bbox_iof(polygon1, bbox2, eps=1e-6):
+    """
+    Calculate Intersection over Foreground (IoF) between polygons and bounding boxes.
+
+    Args:
+        polygon1 (np.ndarray): Polygon coordinates, shape (n, 8).
+        bbox2 (np.ndarray): Bounding boxes, shape (m, 4).
+        eps (float, optional): Small value to prevent division by zero. Defaults to 1e-6.
+
+    Returns:
+        (np.ndarray): IoF scores, shape (n, 1) or (n, m) if bbox2 is (m, 4).
+
+    Note:
+        Polygon format: [x1, y1, x2, y2, x3, y3, x4, y4].
+        Bounding box format: [x_min, y_min, x_max, y_max].
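+
+    Examples:
+        >>> # Illustrative values only: a unit-square polygon fully inside a 2x2 box gives an IoF of 1.0
+        >>> import numpy as np
+        >>> from ultralytics.data.split_dota import bbox_iof
+        >>> poly = np.array([[0, 0, 1, 0, 1, 1, 0, 1]], dtype=np.float32)
+        >>> boxes = np.array([[0, 0, 2, 2]], dtype=np.float32)
+        >>> float(bbox_iof(poly, boxes)[0, 0])
+        1.0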
+    """
+    check_requirements("shapely")
+    from shapely.geometry import Polygon
+
+    polygon1 = polygon1.reshape(-1, 4, 2)
+    lt_point = np.min(polygon1, axis=-2)  # left-top
+    rb_point = np.max(polygon1, axis=-2)  # right-bottom
+    bbox1 = np.concatenate([lt_point, rb_point], axis=-1)
+
+    lt = np.maximum(bbox1[:, None, :2], bbox2[..., :2])
+    rb = np.minimum(bbox1[:, None, 2:], bbox2[..., 2:])
+    wh = np.clip(rb - lt, 0, np.inf)
+    h_overlaps = wh[..., 0] * wh[..., 1]
+
+    left, top, right, bottom = (bbox2[..., i] for i in range(4))
+    polygon2 = np.stack([left, top, right, top, right, bottom, left, bottom], axis=-1).reshape(-1, 4, 2)
+
+    sg_polys1 = [Polygon(p) for p in polygon1]
+    sg_polys2 = [Polygon(p) for p in polygon2]
+    overlaps = np.zeros(h_overlaps.shape)
+    for p in zip(*np.nonzero(h_overlaps)):
+        overlaps[p] = sg_polys1[p[0]].intersection(sg_polys2[p[-1]]).area
+    unions = np.array([p.area for p in sg_polys1], dtype=np.float32)
+    unions = unions[..., None]
+
+    unions = np.clip(unions, eps, np.inf)
+    outputs = overlaps / unions
+    if outputs.ndim == 1:
+        outputs = outputs[..., None]
+    return outputs
+
+
+def load_yolo_dota(data_root, split="train"):
+    """
+    Load DOTA dataset.
+
+    Args:
+        data_root (str): Data root.
+        split (str): The dataset split, either `train` or `val`.
+
+    Returns:
+        (List[dict]): A list of annotation dicts, each with keys `ori_size`, `label`, and `filepath`.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
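+
+    Examples:
+        >>> # Illustrative only; assumes a local DOTA-format dataset rooted at "DOTAv2" with the layout above
+        >>> from ultralytics.data.split_dota import load_yolo_dota
+        >>> annos = load_yolo_dota("DOTAv2", split="train")
+        >>> list(annos[0].keys())
+        ['ori_size', 'label', 'filepath']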
+    """
+    assert split in {"train", "val"}, f"Split must be 'train' or 'val', not {split}."
+    im_dir = Path(data_root) / "images" / split
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(Path(data_root) / "images" / split / "*"))
+    lb_files = img2label_paths(im_files)
+    annos = []
+    for im_file, lb_file in zip(im_files, lb_files):
+        w, h = exif_size(Image.open(im_file))
+        with open(lb_file) as f:
+            lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+            lb = np.array(lb, dtype=np.float32)
+        annos.append(dict(ori_size=(h, w), label=lb, filepath=im_file))
+    return annos
+
+
+def get_windows(im_size, crop_sizes=(1024,), gaps=(200,), im_rate_thr=0.6, eps=0.01):
+    """
+    Get the coordinates of windows.
+
+    Args:
+        im_size (tuple): Original image size, (h, w).
+        crop_sizes (List[int]): Crop sizes of the windows.
+        gaps (List[int]): Overlap between adjacent windows; the window step is crop_size - gap.
+        im_rate_thr (float): Threshold on the ratio of the image area inside a window to the window area.
+        eps (float): Epsilon value for math operations.
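+
+    Examples:
+        >>> # Minimal illustration: a 1024x1024 image is covered by a single default 1024-pixel window
+        >>> from ultralytics.data.split_dota import get_windows
+        >>> get_windows((1024, 1024)).tolist()
+        [[0, 0, 1024, 1024]]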
+    """
+    h, w = im_size
+    windows = []
+    for crop_size, gap in zip(crop_sizes, gaps):
+        assert crop_size > gap, f"invalid crop_size gap pair [{crop_size} {gap}]"
+        step = crop_size - gap
+
+        xn = 1 if w <= crop_size else ceil((w - crop_size) / step + 1)
+        xs = [step * i for i in range(xn)]
+        if len(xs) > 1 and xs[-1] + crop_size > w:
+            xs[-1] = w - crop_size
+
+        yn = 1 if h <= crop_size else ceil((h - crop_size) / step + 1)
+        ys = [step * i for i in range(yn)]
+        if len(ys) > 1 and ys[-1] + crop_size > h:
+            ys[-1] = h - crop_size
+
+        start = np.array(list(itertools.product(xs, ys)), dtype=np.int64)
+        stop = start + crop_size
+        windows.append(np.concatenate([start, stop], axis=1))
+    windows = np.concatenate(windows, axis=0)
+
+    im_in_wins = windows.copy()
+    im_in_wins[:, 0::2] = np.clip(im_in_wins[:, 0::2], 0, w)
+    im_in_wins[:, 1::2] = np.clip(im_in_wins[:, 1::2], 0, h)
+    im_areas = (im_in_wins[:, 2] - im_in_wins[:, 0]) * (im_in_wins[:, 3] - im_in_wins[:, 1])
+    win_areas = (windows[:, 2] - windows[:, 0]) * (windows[:, 3] - windows[:, 1])
+    im_rates = im_areas / win_areas
+    if not (im_rates > im_rate_thr).any():
+        max_rate = im_rates.max()
+        im_rates[abs(im_rates - max_rate) < eps] = 1
+    return windows[im_rates > im_rate_thr]
+
+
+def get_window_obj(anno, windows, iof_thr=0.7):
+    """Get objects for each window."""
+    h, w = anno["ori_size"]
+    label = anno["label"]
+    if len(label):
+        label[:, 1::2] *= w
+        label[:, 2::2] *= h
+        iofs = bbox_iof(label[:, 1:], windows)
+        # Labels are now in absolute pixels but still relative to the full image, not to each window
+        return [(label[iofs[:, i] >= iof_thr]) for i in range(len(windows))]  # window_anns
+    else:
+        return [np.zeros((0, 9), dtype=np.float32) for _ in range(len(windows))]  # window_anns
+
+
+def crop_and_save(anno, windows, window_objs, im_dir, lb_dir, allow_background_images=True):
+    """
+    Crop images and save new labels.
+
+    Args:
+        anno (dict): Annotation dict, including `filepath`, `label`, `ori_size` as its keys.
+        windows (list): A list of windows coordinates.
+        window_objs (list): A list of labels inside each window.
+        im_dir (str): The output directory path of images.
+        lb_dir (str): The output directory path of labels.
+        allow_background_images (bool): Whether to include background images without labels.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
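+
+    Examples:
+        >>> # Illustrative pipeline sketch; assumes a DOTA-format dataset at "DOTAv2" and existing output dirs
+        >>> from ultralytics.data.split_dota import crop_and_save, get_window_obj, get_windows, load_yolo_dota
+        >>> anno = load_yolo_dota("DOTAv2", split="train")[0]
+        >>> windows = get_windows(anno["ori_size"], crop_sizes=(1024,), gaps=(200,))
+        >>> window_objs = get_window_obj(anno, windows)
+        >>> crop_and_save(anno, windows, window_objs, "DOTAv2-split/images/train", "DOTAv2-split/labels/train")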
+    """
+    im = cv2.imread(anno["filepath"])
+    name = Path(anno["filepath"]).stem
+    for i, window in enumerate(windows):
+        x_start, y_start, x_stop, y_stop = window.tolist()
+        new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+        patch_im = im[y_start:y_stop, x_start:x_stop]
+        ph, pw = patch_im.shape[:2]
+
+        label = window_objs[i]
+        if len(label) or allow_background_images:
+            cv2.imwrite(str(Path(im_dir) / f"{new_name}.jpg"), patch_im)
+        if len(label):
+            label[:, 1::2] -= x_start
+            label[:, 2::2] -= y_start
+            label[:, 1::2] /= pw
+            label[:, 2::2] /= ph
+
+            with open(Path(lb_dir) / f"{new_name}.txt", "w") as f:
+                for lb in label:
+                    formatted_coords = [f"{coord:.6g}" for coord in lb[1:]]
+                    f.write(f"{int(lb[0])} {' '.join(formatted_coords)}\n")
+
+
+def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=(1024,), gaps=(200,)):
+    """
+    Split both images and labels.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - split
+                - labels
+                    - split
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - split
+                - labels
+                    - split
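+
+    Examples:
+        >>> # Illustrative only; assumes a DOTA-format dataset at "DOTAv2"
+        >>> from ultralytics.data.split_dota import split_images_and_labels
+        >>> split_images_and_labels(data_root="DOTAv2", save_dir="DOTAv2-split", split="train")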
+    """
+    im_dir = Path(save_dir) / "images" / split
+    im_dir.mkdir(parents=True, exist_ok=True)
+    lb_dir = Path(save_dir) / "labels" / split
+    lb_dir.mkdir(parents=True, exist_ok=True)
+
+    annos = load_yolo_dota(data_root, split=split)
+    for anno in tqdm(annos, total=len(annos), desc=split):
+        windows = get_windows(anno["ori_size"], crop_sizes, gaps)
+        window_objs = get_window_obj(anno, windows)
+        crop_and_save(anno, windows, window_objs, str(im_dir), str(lb_dir))
+
+
+def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
+    """
+    Split train and val set of DOTA.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
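+
+    Examples:
+        >>> # Illustrative only; assumes a DOTA-format dataset at "DOTAv2"; rates control multi-scale cropping
+        >>> from ultralytics.data.split_dota import split_trainval
+        >>> split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split", rates=(0.5, 1.0, 1.5))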
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    for split in ["train", "val"]:
+        split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)
+
+
+def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
+    """
+    Split test set of DOTA; labels are not included within this set.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - test
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - test
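+
+    Examples:
+        >>> # Illustrative only; assumes DOTA-format test images at "DOTAv2/images/test"
+        >>> from ultralytics.data.split_dota import split_test
+        >>> split_test(data_root="DOTAv2", save_dir="DOTAv2-split")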
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    save_dir = Path(save_dir) / "images" / "test"
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    im_dir = Path(data_root) / "images" / "test"
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(im_dir / "*"))
+    for im_file in tqdm(im_files, total=len(im_files), desc="test"):
+        w, h = exif_size(Image.open(im_file))
+        windows = get_windows((h, w), crop_sizes=crop_sizes, gaps=gaps)
+        im = cv2.imread(im_file)
+        name = Path(im_file).stem
+        for window in windows:
+            x_start, y_start, x_stop, y_stop = window.tolist()
+            new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+            patch_im = im[y_start:y_stop, x_start:x_stop]
+            cv2.imwrite(str(save_dir / f"{new_name}.jpg"), patch_im)
+
+
+if __name__ == "__main__":
+    split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split")
+    split_test(data_root="DOTAv2", save_dir="DOTAv2-split")

+ 721 - 0
ultralytics/data/utils.py

@@ -0,0 +1,721 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import hashlib
+import json
+import os
+import random
+import subprocess
+import time
+import zipfile
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from tarfile import is_tarfile
+
+import cv2
+import numpy as np
+from PIL import Image, ImageOps
+
+from ultralytics.nn.autobackend import check_class_names
+from ultralytics.utils import (
+    DATASETS_DIR,
+    LOGGER,
+    NUM_THREADS,
+    ROOT,
+    SETTINGS_FILE,
+    TQDM,
+    clean_url,
+    colorstr,
+    emojis,
+    is_dir_writeable,
+    yaml_load,
+    yaml_save,
+)
+from ultralytics.utils.checks import check_file, check_font, is_ascii
+from ultralytics.utils.downloads import download, safe_download, unzip_file
+from ultralytics.utils.ops import segments2boxes
+
+HELP_URL = "See https://docs.ultralytics.com/datasets for dataset formatting guidance."
+IMG_FORMATS = {"bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm", "heic"}  # image suffixes
+VID_FORMATS = {"asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv", "webm"}  # video suffixes
+PIN_MEMORY = str(os.getenv("PIN_MEMORY", True)).lower() == "true"  # global pin_memory for dataloaders
+FORMATS_HELP_MSG = f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
+
+
+def img2label_paths(img_paths):
+    """Define label paths as a function of image paths."""
+    sa, sb = f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}"  # /images/, /labels/ substrings
+    return [sb.join(x.rsplit(sa, 1)).rsplit(".", 1)[0] + ".txt" for x in img_paths]
+
+
+def get_hash(paths):
+    """Returns a single hash value of a list of paths (files or dirs)."""
+    size = sum(os.path.getsize(p) for p in paths if os.path.exists(p))  # sizes
+    h = hashlib.sha256(str(size).encode())  # hash sizes
+    h.update("".join(paths).encode())  # hash paths
+    return h.hexdigest()  # return hash
+
+
+def exif_size(img: Image.Image):
+    """Returns exif-corrected PIL size."""
+    s = img.size  # (width, height)
+    if img.format == "JPEG":  # only support JPEG images
+        try:
+            if exif := img.getexif():
+                rotation = exif.get(274, None)  # the EXIF key for the orientation tag is 274
+                if rotation in {6, 8}:  # rotation 270 or 90
+                    s = s[1], s[0]
+        except Exception:
+            pass
+    return s
+
+
+def verify_image(args):
+    """Verify one image."""
+    (im_file, cls), prefix = args
+    # Number (found, corrupt), message
+    nf, nc, msg = 0, 0, ""
+    try:
+        im = Image.open(im_file)
+        im.verify()  # PIL verify
+        shape = exif_size(im)  # image size
+        shape = (shape[1], shape[0])  # hw
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"Invalid image format {im.format}. {FORMATS_HELP_MSG}"
+        if im.format.lower() in {"jpg", "jpeg"}:
+            with open(im_file, "rb") as f:
+                f.seek(-2, 2)
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
+        nf = 1
+    except Exception as e:
+        nc = 1
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
+    return (im_file, cls), nf, nc, msg
+
+
+def verify_image_label(args):
+    """Verify one image-label pair."""
+    im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
+    # Number (missing, found, empty, corrupt), message, segments, keypoints
+    nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, "", [], None
+    try:
+        # Verify images
+        im = Image.open(im_file)
+        im.verify()  # PIL verify
+        shape = exif_size(im)  # image size
+        shape = (shape[1], shape[0])  # hw
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}. {FORMATS_HELP_MSG}"
+        if im.format.lower() in {"jpg", "jpeg"}:
+            with open(im_file, "rb") as f:
+                f.seek(-2, 2)
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
+
+        # Verify labels
+        if os.path.isfile(lb_file):
+            nf = 1  # label found
+            with open(lb_file) as f:
+                lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+                if any(len(x) > 6 for x in lb) and (not keypoint):  # is segment
+                    classes = np.array([x[0] for x in lb], dtype=np.float32)
+                    segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb]  # (cls, xy1...)
+                    lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1)  # (cls, xywh)
+                lb = np.array(lb, dtype=np.float32)
+            if nl := len(lb):
+                if keypoint:
+                    assert lb.shape[1] == (5 + nkpt * ndim), f"labels require {(5 + nkpt * ndim)} columns each"
+                    points = lb[:, 5:].reshape(-1, ndim)[:, :2]
+                else:
+                    assert lb.shape[1] == 5, f"labels require 5 columns, {lb.shape[1]} columns detected"
+                    points = lb[:, 1:]
+                assert points.max() <= 1, f"non-normalized or out of bounds coordinates {points[points > 1]}"
+                assert lb.min() >= 0, f"negative label values {lb[lb < 0]}"
+
+                # All labels
+                max_cls = lb[:, 0].max()  # max class id
+                assert max_cls <= num_cls, (
+                    f"Label class {int(max_cls)} exceeds dataset class count {num_cls}. "
+                    f"Possible class labels are 0-{num_cls - 1}"
+                )
+                _, i = np.unique(lb, axis=0, return_index=True)
+                if len(i) < nl:  # duplicate row check
+                    lb = lb[i]  # remove duplicates
+                    if segments:
+                        segments = [segments[x] for x in i]
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed"
+            else:
+                ne = 1  # label empty
+                lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
+        else:
+            nm = 1  # label missing
+            lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
+        if keypoint:
+            keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
+            if ndim == 2:
+                kpt_mask = np.where((keypoints[..., 0] < 0) | (keypoints[..., 1] < 0), 0.0, 1.0).astype(np.float32)
+                keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1)  # (nl, nkpt, 3)
+        lb = lb[:, :5]
+        return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
+    except Exception as e:
+        nc = 1
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
+        return [None, None, None, None, None, nm, nf, ne, nc, msg]
+
+
+def visualize_image_annotations(image_path, txt_path, label_map):
+    """
+    Visualizes YOLO annotations (bounding boxes and class labels) on an image.
+
+    This function reads an image and its corresponding annotation file in YOLO format, then
+    draws bounding boxes around detected objects and labels them with their respective class names.
+    The bounding box colors are assigned based on the class ID, and the text color is dynamically
+    adjusted for readability, depending on the background color's luminance.
+
+    Args:
+        image_path (str): The path to the image file to annotate; any format supported by PIL (e.g., .jpg, .png).
+        txt_path (str): The path to the annotation file in YOLO format, which should contain one line per object with:
+                        - class_id (int): The class index.
+                        - x_center (float): The X center of the bounding box (relative to image width).
+                        - y_center (float): The Y center of the bounding box (relative to image height).
+                        - width (float): The width of the bounding box (relative to image width).
+                        - height (float): The height of the bounding box (relative to image height).
+        label_map (dict): A dictionary that maps class IDs (integers) to class labels (strings).
+
+    Example:
+        >>> label_map = {0: "cat", 1: "dog", 2: "bird"}  # Should map every class ID present in the annotations
+        >>> visualize_image_annotations("path/to/image.jpg", "path/to/annotations.txt", label_map)
+    """
+    import matplotlib.pyplot as plt
+
+    from ultralytics.utils.plotting import colors
+
+    img = np.array(Image.open(image_path))
+    img_height, img_width = img.shape[:2]
+    annotations = []
+    with open(txt_path) as file:
+        for line in file:
+            class_id, x_center, y_center, width, height = map(float, line.split())
+            x = (x_center - width / 2) * img_width
+            y = (y_center - height / 2) * img_height
+            w = width * img_width
+            h = height * img_height
+            annotations.append((x, y, w, h, int(class_id)))
+    fig, ax = plt.subplots(1)  # Plot the image and annotations
+    for x, y, w, h, label in annotations:
+        color = tuple(c / 255 for c in colors(label, True))  # Get and normalize the RGB color
+        rect = plt.Rectangle((x, y), w, h, linewidth=2, edgecolor=color, facecolor="none")  # Create a rectangle
+        ax.add_patch(rect)
+        luminance = 0.2126 * color[0] + 0.7152 * color[1] + 0.0722 * color[2]  # Formula for luminance
+        ax.text(x, y - 5, label_map[label], color="white" if luminance < 0.5 else "black", backgroundcolor=color)
+    ax.imshow(img)
+    plt.show()
+
+
+def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
+    """
+    Convert a list of polygons to a binary mask of the specified image size.
+
+    Args:
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is a flattened array of (x, y) coordinates
+                                     [x1, y1, x2, y2, ...] of even length M, so the list stacks to shape [N, M].
+        color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
+        downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A binary mask of the specified image size with the polygons filled in.
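+
+    Examples:
+        >>> # Illustrative only: rasterize one square polygon onto a 10x10 mask
+        >>> import numpy as np
+        >>> from ultralytics.data.utils import polygon2mask
+        >>> polygon = [np.array([1, 1, 8, 1, 8, 8, 1, 8], dtype=np.float32)]
+        >>> polygon2mask((10, 10), polygon, color=1).shape
+        (10, 10)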
+    """
+    mask = np.zeros(imgsz, dtype=np.uint8)
+    polygons = np.asarray(polygons, dtype=np.int32)
+    polygons = polygons.reshape((polygons.shape[0], -1, 2))
+    cv2.fillPoly(mask, polygons, color=color)
+    nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
+    # Note: fillPoly first then resize is trying to keep the same loss calculation method when mask-ratio=1
+    return cv2.resize(mask, (nw, nh))
+
+
+def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
+    """
+    Convert a list of polygons to a set of binary masks of the specified image size.
+
+    Args:
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is a flattened array of (x, y) coordinates
+                                     [x1, y1, x2, y2, ...] of even length M, so the list stacks to shape [N, M].
+        color (int): The color value to fill in the polygons on the masks.
+        downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
+    """
+    return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
+
+
+def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
+    """Return a (640, 640) overlap mask."""
+    masks = np.zeros(
+        (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
+        dtype=np.int32 if len(segments) > 255 else np.uint8,
+    )
+    areas = []
+    ms = []
+    for si in range(len(segments)):
+        mask = polygon2mask(imgsz, [segments[si].reshape(-1)], downsample_ratio=downsample_ratio, color=1)
+        ms.append(mask.astype(masks.dtype))
+        areas.append(mask.sum())
+    areas = np.asarray(areas)
+    index = np.argsort(-areas)
+    ms = np.array(ms)[index]
+    for i in range(len(segments)):
+        mask = ms[i] * (i + 1)
+        masks = masks + mask
+        masks = np.clip(masks, a_min=0, a_max=i + 1)
+    return masks, index
+
+
+def find_dataset_yaml(path: Path) -> Path:
+    """
+    Find and return the YAML file associated with a Detect, Segment or Pose dataset.
+
+    This function searches for a YAML file at the root level of the provided directory first, and if not found, it
+    performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
+    is raised if no YAML file is found or if multiple YAML files are found.
+
+    Args:
+        path (Path): The directory path to search for the YAML file.
+
+    Returns:
+        (Path): The path of the found YAML file.
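+
+    Examples:
+        >>> # Illustrative only; assumes an unzipped dataset directory containing a single "coco8.yaml"
+        >>> yaml_path = find_dataset_yaml(Path("../datasets/coco8"))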
+    """
+    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))  # try root level first and then recursive
+    assert files, f"No YAML file found in '{path.resolve()}'"
+    if len(files) > 1:
+        files = [f for f in files if f.stem == path.stem]  # prefer *.yaml files that match
+    assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', but found {len(files)}.\n{files}"
+    return files[0]
+
+
+def check_det_dataset(dataset, autodownload=True):
+    """
+    Download, verify, and/or unzip a dataset if not found locally.
+
+    This function checks the availability of a specified dataset, and if not found, it has the option to download and
+    unzip the dataset. It then reads and parses the accompanying YAML data, ensuring key requirements are met and also
+    resolves paths related to the dataset.
+
+    Args:
+        dataset (str): Path to the dataset or dataset descriptor (like a YAML file).
+        autodownload (bool, optional): Whether to automatically download the dataset if not found. Defaults to True.
+
+    Returns:
+        (dict): Parsed dataset information and paths.
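+
+    Examples:
+        >>> # Illustrative only; "coco8.yaml" is a small bundled dataset descriptor that is auto-downloaded if missing
+        >>> data = check_det_dataset("coco8.yaml")
+        >>> data["nc"]
+        80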
+    """
+    file = check_file(dataset)
+
+    # Download (optional)
+    extract_dir = ""
+    if zipfile.is_zipfile(file) or is_tarfile(file):
+        new_dir = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
+        file = find_dataset_yaml(DATASETS_DIR / new_dir)
+        extract_dir, autodownload = file.parent, False
+
+    # Read YAML
+    data = yaml_load(file, append_filename=True)  # dictionary
+
+    # Checks
+    for k in "train", "val":
+        if k not in data:
+            if k != "val" or "validation" not in data:
+                raise SyntaxError(
+                    emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs.")
+                )
+            LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
+            data["val"] = data.pop("validation")  # replace 'validation' key with 'val' key
+    if "names" not in data and "nc" not in data:
+        raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
+    if "names" in data and "nc" in data and len(data["names"]) != data["nc"]:
+        raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
+    if "names" not in data:
+        data["names"] = [f"class_{i}" for i in range(data["nc"])]
+    else:
+        data["nc"] = len(data["names"])
+
+    data["names"] = check_class_names(data["names"])
+
+    # Resolve paths
+    path = Path(extract_dir or data.get("path") or Path(data.get("yaml_file", "")).parent)  # dataset root
+    if not path.is_absolute():
+        path = (DATASETS_DIR / path).resolve()
+
+    # Set paths
+    data["path"] = path  # download scripts
+    for k in "train", "val", "test", "minival":
+        if data.get(k):  # prepend path
+            if isinstance(data[k], str):
+                x = (path / data[k]).resolve()
+                if not x.exists() and data[k].startswith("../"):
+                    x = (path / data[k][3:]).resolve()
+                data[k] = str(x)
+            else:
+                data[k] = [str((path / x).resolve()) for x in data[k]]
+
+    # Parse YAML
+    val, s = (data.get(x) for x in ("val", "download"))
+    if val:
+        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
+        if not all(x.exists() for x in val):
+            name = clean_url(dataset)  # dataset name with URL auth stripped
+            m = f"\nDataset '{name}' images not found ⚠️, missing path '{[x for x in val if not x.exists()][0]}'"
+            if s and autodownload:
+                LOGGER.warning(m)
+            else:
+                m += f"\nNote dataset download directory is '{DATASETS_DIR}'. You can update this in '{SETTINGS_FILE}'"
+                raise FileNotFoundError(m)
+            t = time.time()
+            r = None  # success
+            if s.startswith("http") and s.endswith(".zip"):  # URL
+                safe_download(url=s, dir=DATASETS_DIR, delete=True)
+            elif s.startswith("bash "):  # bash script
+                LOGGER.info(f"Running {s} ...")
+                r = os.system(s)
+            else:  # python script
+                exec(s, {"yaml": data})
+            dt = f"({round(time.time() - t, 1)}s)"
+            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in {0, None} else f"failure {dt} ❌"
+            LOGGER.info(f"Dataset download {s}\n")
+    check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf")  # download fonts
+
+    return data  # dictionary
+
+
+def check_cls_dataset(dataset, split=""):
+    """
+    Checks a classification dataset such as Imagenet.
+
+    This function accepts a `dataset` name and attempts to retrieve the corresponding dataset information.
+    If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.
+
+    Args:
+        dataset (str | Path): The name of the dataset.
+        split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
+
+    Returns:
+        (dict): A dictionary containing the following keys:
+            - 'train' (Path): The directory path containing the training set of the dataset.
+            - 'val' (Path): The directory path containing the validation set of the dataset.
+            - 'test' (Path): The directory path containing the test set of the dataset.
+            - 'nc' (int): The number of classes in the dataset.
+            - 'names' (dict): A dictionary of class names in the dataset.
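+
+    Examples:
+        >>> # Illustrative only; assumes the small "imagenet10" classification dataset can be downloaded
+        >>> data = check_cls_dataset("imagenet10")
+        >>> data["nc"]
+        10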
+    """
+    # Download (optional if dataset=https://file.zip is passed directly)
+    if str(dataset).startswith(("http:/", "https:/")):
+        dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
+    elif Path(dataset).suffix in {".zip", ".tar", ".gz"}:
+        file = check_file(dataset)
+        dataset = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
+
+    dataset = Path(dataset)
+    data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
+    if not data_dir.is_dir():
+        LOGGER.warning(f"\nDataset not found ⚠️, missing path {data_dir}, attempting download...")
+        t = time.time()
+        if str(dataset) == "imagenet":
+            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
+        else:
+            url = f"https://github.com/ultralytics/assets/releases/download/v0.0.0/{dataset}.zip"
+            download(url, dir=data_dir.parent)
+        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
+        LOGGER.info(s)
+    train_set = data_dir / "train"
+    val_set = (
+        data_dir / "val"
+        if (data_dir / "val").exists()
+        else data_dir / "validation"
+        if (data_dir / "validation").exists()
+        else None
+    )  # data/val or data/validation
+    test_set = data_dir / "test" if (data_dir / "test").exists() else None  # data/test if it exists
+    if split == "val" and not val_set:
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
+    elif split == "test" and not test_set:
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
+
+    nc = len([x for x in (data_dir / "train").glob("*") if x.is_dir()])  # number of classes
+    names = [x.name for x in (data_dir / "train").iterdir() if x.is_dir()]  # class names list
+    names = dict(enumerate(sorted(names)))
+
+    # Print to console
+    for k, v in {"train": train_set, "val": val_set, "test": test_set}.items():
+        prefix = f"{colorstr(f'{k}:')} {v}..."
+        if v is None:
+            LOGGER.info(prefix)
+        else:
+            files = [path for path in v.rglob("*.*") if path.suffix[1:].lower() in IMG_FORMATS]
+            nf = len(files)  # number of files
+            nd = len({file.parent for file in files})  # number of directories
+            if nf == 0:
+                if k == "train":
+                    raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
+                else:
+                    LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found")
+            elif nd != nc:
+                LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}")
+            else:
+                LOGGER.info(f"{prefix} found {nf} images in {nd} classes ✅ ")
+
+    return {"train": train_set, "val": val_set, "test": test_set, "nc": nc, "names": names}
+
+
+class HUBDatasetStats:
+    """
+    A class for generating HUB dataset JSON and `-hub` dataset directory.
+
+    Args:
+        path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco8.yaml'.
+        task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
+        autodownload (bool): Attempt to download dataset if not found locally. Default is False.
+
+    Example:
+        Download *.zip files from https://github.com/ultralytics/hub/tree/main/example_datasets
+            i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip for coco8.zip.
+        ```python
+        from ultralytics.data.utils import HUBDatasetStats
+
+        stats = HUBDatasetStats("path/to/coco8.zip", task="detect")  # detect dataset
+        stats = HUBDatasetStats("path/to/coco8-seg.zip", task="segment")  # segment dataset
+        stats = HUBDatasetStats("path/to/coco8-pose.zip", task="pose")  # pose dataset
+        stats = HUBDatasetStats("path/to/dota8.zip", task="obb")  # OBB dataset
+        stats = HUBDatasetStats("path/to/imagenet10.zip", task="classify")  # classification dataset
+
+        stats.get_json(save=True)
+        stats.process_images()
+        ```
+    """
+
+    def __init__(self, path="coco8.yaml", task="detect", autodownload=False):
+        """Initialize class."""
+        path = Path(path).resolve()
+        LOGGER.info(f"Starting HUB dataset checks for {path}....")
+
+        self.task = task  # detect, segment, pose, classify, obb
+        if self.task == "classify":
+            unzip_dir = unzip_file(path)
+            data = check_cls_dataset(unzip_dir)
+            data["path"] = unzip_dir
+        else:  # detect, segment, pose, obb
+            _, data_dir, yaml_path = self._unzip(Path(path))
+            try:
+                # Load YAML with checks
+                data = yaml_load(yaml_path)
+                data["path"] = ""  # strip path since YAML should be in dataset root for all HUB datasets
+                yaml_save(yaml_path, data)
+                data = check_det_dataset(yaml_path, autodownload)  # dict
+                data["path"] = data_dir  # YAML path should be set to '' (relative) or parent (absolute)
+            except Exception as e:
+                raise Exception("error/HUB/dataset_stats/init") from e
+
+        self.hub_dir = Path(f"{data['path']}-hub")
+        self.im_dir = self.hub_dir / "images"
+        self.stats = {"nc": len(data["names"]), "names": list(data["names"].values())}  # statistics dictionary
+        self.data = data
+
+    @staticmethod
+    def _unzip(path):
+        """Unzip data.zip."""
+        if not str(path).endswith(".zip"):  # path is data.yaml
+            return False, None, path
+        unzip_dir = unzip_file(path, path=path.parent)
+        assert unzip_dir.is_dir(), (
+            f"Error unzipping {path}, {unzip_dir} not found. path/to/abc.zip MUST unzip to path/to/abc/"
+        )
+        return True, str(unzip_dir), find_dataset_yaml(unzip_dir)  # zipped, data_dir, yaml_path
+
+    def _hub_ops(self, f):
+        """Saves a compressed image for HUB previews."""
+        compress_one_image(f, self.im_dir / Path(f).name)  # save to dataset-hub
+
+    def get_json(self, save=False, verbose=False):
+        """Return dataset JSON for Ultralytics HUB."""
+
+        def _round(labels):
+            """Update labels to integer class and 4 decimal place floats."""
+            if self.task == "detect":
+                coordinates = labels["bboxes"]
+            elif self.task in {"segment", "obb"}:  # Segment and OBB use segments. OBB segments are normalized xyxyxyxy
+                coordinates = [x.flatten() for x in labels["segments"]]
+            elif self.task == "pose":
+                n, nk, nd = labels["keypoints"].shape
+                coordinates = np.concatenate((labels["bboxes"], labels["keypoints"].reshape(n, nk * nd)), 1)
+            else:
+                raise ValueError(f"Undefined dataset task={self.task}.")
+            zipped = zip(labels["cls"], coordinates)
+            return [[int(c[0]), *(round(float(x), 4) for x in points)] for c, points in zipped]
+
+        for split in "train", "val", "test":
+            self.stats[split] = None  # predefine
+            path = self.data.get(split)
+
+            # Check split
+            if path is None:  # no split
+                continue
+            files = [f for f in Path(path).rglob("*.*") if f.suffix[1:].lower() in IMG_FORMATS]  # image files in split
+            if not files:  # no images
+                continue
+
+            # Get dataset statistics
+            if self.task == "classify":
+                from torchvision.datasets import ImageFolder  # scope for faster 'import ultralytics'
+
+                dataset = ImageFolder(self.data[split])
+
+                x = np.zeros(len(dataset.classes)).astype(int)
+                for im in dataset.imgs:
+                    x[im[1]] += 1
+
+                self.stats[split] = {
+                    "instance_stats": {"total": len(dataset), "per_class": x.tolist()},
+                    "image_stats": {"total": len(dataset), "unlabelled": 0, "per_class": x.tolist()},
+                    "labels": [{Path(k).name: v} for k, v in dataset.imgs],
+                }
+            else:
+                from ultralytics.data import YOLODataset
+
+                dataset = YOLODataset(img_path=self.data[split], data=self.data, task=self.task)
+                x = np.array(
+                    [
+                        np.bincount(label["cls"].astype(int).flatten(), minlength=self.data["nc"])
+                        for label in TQDM(dataset.labels, total=len(dataset), desc="Statistics")
+                    ]
+                )  # shape(128x80)
+                self.stats[split] = {
+                    "instance_stats": {"total": int(x.sum()), "per_class": x.sum(0).tolist()},
+                    "image_stats": {
+                        "total": len(dataset),
+                        "unlabelled": int(np.all(x == 0, 1).sum()),
+                        "per_class": (x > 0).sum(0).tolist(),
+                    },
+                    "labels": [{Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)],
+                }
+
+        # Save, print and return
+        if save:
+            self.hub_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/
+            stats_path = self.hub_dir / "stats.json"
+            LOGGER.info(f"Saving {stats_path.resolve()}...")
+            with open(stats_path, "w") as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        """Compress images for Ultralytics HUB."""
+        from ultralytics.data import YOLODataset  # ClassificationDataset
+
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/images/
+        for split in "train", "val", "test":
+            if self.data.get(split) is None:
+                continue
+            dataset = YOLODataset(img_path=self.data[split], data=self.data)
+            with ThreadPool(NUM_THREADS) as pool:
+                for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f"{split} images"):
+                    pass
+        LOGGER.info(f"Done. All images saved to {self.im_dir}")
+        return self.im_dir
+
+
+def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
+    """
+    Compresses a single image file to a reduced size while preserving its aspect ratio and quality using either the
+    Python Imaging Library (PIL) or the OpenCV library. If the input image is smaller than the maximum dimension, it
+    will not be resized.
+
+    Args:
+        f (str): The path to the input image file.
+        f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
+        max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
+        quality (int, optional): The image compression quality as a percentage. Default is 50%.
+
+    Example:
+        ```python
+        from pathlib import Path
+        from ultralytics.data.utils import compress_one_image
+
+        for f in Path("path/to/dataset").rglob("*.jpg"):
+            compress_one_image(f)
+        ```
+    """
+    try:  # use PIL
+        im = Image.open(f)
+        r = max_dim / max(im.height, im.width)  # ratio
+        if r < 1.0:  # image too large
+            im = im.resize((int(im.width * r), int(im.height * r)))
+        im.save(f_new or f, "JPEG", quality=quality, optimize=True)  # save
+    except Exception as e:  # use OpenCV
+        LOGGER.info(f"WARNING ⚠️ HUB ops PIL failure {f}: {e}")
+        im = cv2.imread(f)
+        im_height, im_width = im.shape[:2]
+        r = max_dim / max(im_height, im_width)  # ratio
+        if r < 1.0:  # image too large
+            im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
+        cv2.imwrite(str(f_new or f), im)
+
+
+def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annotated_only=False):
+    """
+    Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
+
+    Args:
+        path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco8/images'.
+        weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
+        annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
+
+    Example:
+        ```python
+        from ultralytics.data.utils import autosplit
+
+        autosplit()
+        ```
+    """
+    path = Path(path)  # images dir
+    files = sorted(x for x in path.rglob("*.*") if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
+    n = len(files)  # number of files
+    random.seed(0)  # for reproducibility
+    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split
+
+    txt = ["autosplit_train.txt", "autosplit_val.txt", "autosplit_test.txt"]  # 3 txt files
+    for x in txt:
+        if (path.parent / x).exists():
+            (path.parent / x).unlink()  # remove existing
+
+    LOGGER.info(f"Autosplitting images from {path}" + ", using *.txt labeled images only" * annotated_only)
+    for i, img in TQDM(zip(indices, files), total=n):
+        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
+            with open(path.parent / txt[i], "a") as f:
+                f.write(f"./{img.relative_to(path.parent).as_posix()}" + "\n")  # add image to txt file
+
+
+def load_dataset_cache_file(path):
+    """Load an Ultralytics *.cache dictionary from path."""
+    import gc
+
+    gc.disable()  # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
+    cache = np.load(str(path), allow_pickle=True).item()  # load dict
+    gc.enable()
+    return cache
+
+
+def save_dataset_cache_file(prefix, path, x, version):
+    """Save an Ultralytics dataset *.cache dictionary x to path."""
+    x["version"] = version  # add cache version
+    if is_dir_writeable(path.parent):
+        if path.exists():
+            path.unlink()  # remove *.cache file if exists
+        np.save(str(path), x)  # save cache for next time
+        path.with_suffix(".cache.npy").rename(path)  # remove .npy suffix
+        LOGGER.info(f"{prefix}New cache created: {path}")
+    else:
+        LOGGER.warning(f"{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.")