Mengqi Lei 2 months ago
parent
commit
035b97700e

+ 29 - 0
ultralytics/__init__.py

@@ -0,0 +1,29 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+__version__ = "8.3.63"
+
+import os
+
+# Set ENV variables (place before imports)
+if not os.environ.get("OMP_NUM_THREADS"):
+    os.environ["OMP_NUM_THREADS"] = "1"  # default for reduced CPU utilization during training
+
+from ultralytics.models import NAS, RTDETR, SAM, YOLO, FastSAM, YOLOWorld
+from ultralytics.utils import ASSETS, SETTINGS
+from ultralytics.utils.checks import check_yolo as checks
+from ultralytics.utils.downloads import download
+
+settings = SETTINGS
+__all__ = (
+    "__version__",
+    "ASSETS",
+    "YOLO",
+    "YOLOWorld",
+    "NAS",
+    "SAM",
+    "FastSAM",
+    "RTDETR",
+    "checks",
+    "download",
+    "settings",
+)
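
A minimal usage sketch of the public API re-exported above; the yolo11n.pt weight name is an assumption (it matches the defaults used in annotator.py below) and is expected to resolve via the usual auto-download:

    from ultralytics import YOLO, checks, settings

    checks()  # check_yolo: prints environment and dependency info
    print(settings)  # the shared SETTINGS object exposed as `settings`
    model = YOLO("yolo11n.pt")  # assumed detection weights
    results = model("ultralytics/assets/bus.jpg")  # predict on the bundled asset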

BIN
ultralytics/assets/bus.jpg


BIN
ultralytics/assets/zidane.jpg


+ 26 - 0
ultralytics/data/__init__.py

@@ -0,0 +1,26 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from .base import BaseDataset
+from .build import build_dataloader, build_grounding, build_yolo_dataset, load_inference_source
+from .dataset import (
+    ClassificationDataset,
+    GroundingDataset,
+    SemanticDataset,
+    YOLOConcatDataset,
+    YOLODataset,
+    YOLOMultiModalDataset,
+)
+
+__all__ = (
+    "BaseDataset",
+    "ClassificationDataset",
+    "SemanticDataset",
+    "YOLODataset",
+    "YOLOMultiModalDataset",
+    "YOLOConcatDataset",
+    "GroundingDataset",
+    "build_yolo_dataset",
+    "build_grounding",
+    "build_dataloader",
+    "load_inference_source",
+)

+ 72 - 0
ultralytics/data/annotator.py

@@ -0,0 +1,72 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from pathlib import Path
+
+from ultralytics import SAM, YOLO
+
+
+def auto_annotate(
+    data,
+    det_model="yolo11x.pt",
+    sam_model="sam_b.pt",
+    device="",
+    conf=0.25,
+    iou=0.45,
+    imgsz=640,
+    max_det=300,
+    classes=None,
+    output_dir=None,
+):
+    """
+    Automatically annotates images using a YOLO object detection model and a SAM segmentation model.
+
+    This function processes images in a specified directory, detects objects using a YOLO model, and then generates
+    segmentation masks using a SAM model. The resulting annotations are saved as text files.
+
+    Args:
+        data (str): Path to a folder containing images to be annotated.
+        det_model (str): Path or name of the pre-trained YOLO detection model.
+        sam_model (str): Path or name of the pre-trained SAM segmentation model.
+        device (str): Device to run the models on (e.g., 'cpu', 'cuda', '0').
+        conf (float): Confidence threshold for detection model; default is 0.25.
+        iou (float): IoU threshold for filtering overlapping boxes in detection results; default is 0.45.
+        imgsz (int): Input image resize dimension; default is 640.
+        max_det (int): Limits detections per image to control outputs in dense scenes.
+        classes (list): Filters predictions to specified class IDs, returning only relevant detections.
+        output_dir (str | None): Directory to save the annotated results. If None, a default directory is created.
+
+    Examples:
+        >>> from ultralytics.data.annotator import auto_annotate
+        >>> auto_annotate(data="ultralytics/assets", det_model="yolo11n.pt", sam_model="mobile_sam.pt")
+
+    Notes:
+        - The function creates a new directory for output if not specified.
+        - Annotation results are saved as text files with the same names as the input images.
+        - Each line in the output text file represents a detected object with its class ID and segmentation points.
+    """
+    det_model = YOLO(det_model)
+    sam_model = SAM(sam_model)
+
+    data = Path(data)
+    if not output_dir:
+        output_dir = data.parent / f"{data.stem}_auto_annotate_labels"
+    Path(output_dir).mkdir(exist_ok=True, parents=True)
+
+    det_results = det_model(
+        data, stream=True, device=device, conf=conf, iou=iou, imgsz=imgsz, max_det=max_det, classes=classes
+    )
+
+    for result in det_results:
+        class_ids = result.boxes.cls.int().tolist()  # noqa
+        if len(class_ids):
+            boxes = result.boxes.xyxy  # Boxes object for bbox outputs
+            sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
+            segments = sam_results[0].masks.xyn  # noqa
+
+            with open(f"{Path(output_dir) / Path(result.path).stem}.txt", "w") as f:
+                for i in range(len(segments)):
+                    s = segments[i]
+                    if len(s) == 0:
+                        continue
+                    segment = map(str, segments[i].reshape(-1).tolist())
+                    f.write(f"{class_ids[i]} " + " ".join(segment) + "\n")
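
A hedged sketch of how the label files written above can be read back; the path follows the default output-directory naming in auto_annotate ("<data stem>_auto_annotate_labels"), and the specific file name is hypothetical:

    from pathlib import Path

    label_file = Path("ultralytics/assets_auto_annotate_labels/bus.txt")  # assumed default output location
    for line in label_file.read_text().splitlines():
        values = line.split()
        class_id = int(values[0])  # class ID written first on each line
        coords = [float(v) for v in values[1:]]  # normalized segmentation points (xyn)
        polygon = list(zip(coords[0::2], coords[1::2]))  # (x, y) pairs in [0, 1]
        print(class_id, len(polygon), "points")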

+ 2744 - 0
ultralytics/data/augment.py

@@ -0,0 +1,2744 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import math
+import random
+from copy import deepcopy
+from typing import Tuple, Union
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from ultralytics.data.utils import polygons2masks, polygons2masks_overlap
+from ultralytics.utils import LOGGER, colorstr
+from ultralytics.utils.checks import check_version
+from ultralytics.utils.instance import Instances
+from ultralytics.utils.metrics import bbox_ioa
+from ultralytics.utils.ops import segment2box, xyxyxyxy2xywhr
+from ultralytics.utils.torch_utils import TORCHVISION_0_10, TORCHVISION_0_11, TORCHVISION_0_13
+
+DEFAULT_MEAN = (0.0, 0.0, 0.0)
+DEFAULT_STD = (1.0, 1.0, 1.0)
+DEFAULT_CROP_FRACTION = 1.0
+
+
+class BaseTransform:
+    """
+    Base class for image transformations in the Ultralytics library.
+
+    This class serves as a foundation for implementing various image processing operations, designed to be
+    compatible with both classification and semantic segmentation tasks.
+
+    Methods:
+        apply_image: Applies image transformations to labels.
+        apply_instances: Applies transformations to object instances in labels.
+        apply_semantic: Applies semantic segmentation to an image.
+        __call__: Applies all label transformations to an image, instances, and semantic masks.
+
+    Examples:
+        >>> transform = BaseTransform()
+        >>> labels = {"image": np.array(...), "instances": [...], "semantic": np.array(...)}
+        >>> transformed_labels = transform(labels)
+    """
+
+    def __init__(self) -> None:
+        """
+        Initializes the BaseTransform object.
+
+        This constructor sets up the base transformation object, which can be extended for specific image
+        processing tasks. It is designed to be compatible with both classification and semantic segmentation.
+
+        Examples:
+            >>> transform = BaseTransform()
+        """
+        pass
+
+    def apply_image(self, labels):
+        """
+        Applies image transformations to labels.
+
+        This method is intended to be overridden by subclasses to implement specific image transformation
+        logic. In its base form, it returns the input labels unchanged.
+
+        Args:
+            labels (Any): The input labels to be transformed. The exact type and structure of labels may
+                vary depending on the specific implementation.
+
+        Returns:
+            (Any): The transformed labels. In the base implementation, this is identical to the input.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> original_labels = [1, 2, 3]
+            >>> transformed_labels = transform.apply_image(original_labels)
+            >>> print(transformed_labels)
+            [1, 2, 3]
+        """
+        return labels
+
+    def apply_instances(self, labels):
+        """
+        Applies transformations to object instances in labels.
+
+        This method is responsible for applying various transformations to object instances within the given
+        labels. It is designed to be overridden by subclasses to implement specific instance transformation
+        logic.
+
+        Args:
+            labels (Dict): A dictionary containing label information, including object instances.
+
+        Returns:
+            (Dict): The modified labels dictionary with transformed object instances.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> labels = {"instances": Instances(xyxy=torch.rand(5, 4), cls=torch.randint(0, 80, (5,)))}
+            >>> transformed_labels = transform.apply_instances(labels)
+        """
+        pass
+
+    def apply_semantic(self, labels):
+        """
+        Applies semantic segmentation transformations to an image.
+
+        This method is intended to be overridden by subclasses to implement specific semantic segmentation
+        transformations. In its base form, it does not perform any operations.
+
+        Args:
+            labels (Any): The input labels or semantic segmentation mask to be transformed.
+
+        Returns:
+            (Any): The transformed semantic segmentation mask or labels.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> semantic_mask = np.zeros((100, 100), dtype=np.uint8)
+            >>> transformed_mask = transform.apply_semantic(semantic_mask)
+        """
+        pass
+
+    def __call__(self, labels):
+        """
+        Applies all label transformations to an image, instances, and semantic masks.
+
+        This method orchestrates the application of various transformations defined in the BaseTransform class
+        to the input labels. It sequentially calls the apply_image and apply_instances methods to process the
+        image and object instances, respectively.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys include 'img' for
+                the image data, and 'instances' for object instances.
+
+        Returns:
+            (Dict): The input labels dictionary with transformed image and instances.
+
+        Examples:
+            >>> transform = BaseTransform()
+            >>> labels = {"img": np.random.rand(640, 640, 3), "instances": []}
+            >>> transformed_labels = transform(labels)
+        """
+        self.apply_image(labels)
+        self.apply_instances(labels)
+        self.apply_semantic(labels)
+        return labels
+
+
+class Compose:
+    """
+    A class for composing multiple image transformations.
+
+    Attributes:
+        transforms (List[Callable]): A list of transformation functions to be applied sequentially.
+
+    Methods:
+        __call__: Applies a series of transformations to input data.
+        append: Appends a new transform to the existing list of transforms.
+        insert: Inserts a new transform at a specified index in the list of transforms.
+        __getitem__: Retrieves a specific transform or a set of transforms using indexing.
+        __setitem__: Sets a specific transform or a set of transforms using indexing.
+        tolist: Converts the list of transforms to a standard Python list.
+
+    Examples:
+        >>> transforms = [RandomFlip(), RandomPerspective(30)]
+        >>> compose = Compose(transforms)
+        >>> transformed_data = compose(data)
+        >>> compose.append(CenterCrop((224, 224)))
+        >>> compose.insert(0, RandomFlip())
+    """
+
+    def __init__(self, transforms):
+        """
+        Initializes the Compose object with a list of transforms.
+
+        Args:
+            transforms (List[Callable]): A list of callable transform objects to be applied sequentially.
+
+        Examples:
+            >>> from ultralytics.data.augment import Compose, RandomHSV, RandomFlip
+            >>> transforms = [RandomHSV(), RandomFlip()]
+            >>> compose = Compose(transforms)
+        """
+        self.transforms = transforms if isinstance(transforms, list) else [transforms]
+
+    def __call__(self, data):
+        """
+        Applies a series of transformations to input data. This method sequentially applies each transformation in the
+        Compose object's list of transforms to the input data.
+
+        Args:
+            data (Any): The input data to be transformed. This can be of any type, depending on the
+                transformations in the list.
+
+        Returns:
+            (Any): The transformed data after applying all transformations in sequence.
+
+        Examples:
+            >>> transforms = [Transform1(), Transform2(), Transform3()]
+            >>> compose = Compose(transforms)
+            >>> transformed_data = compose(input_data)
+        """
+        for t in self.transforms:
+            data = t(data)
+        return data
+
+    def append(self, transform):
+        """
+        Appends a new transform to the existing list of transforms.
+
+        Args:
+            transform (BaseTransform): The transformation to be added to the composition.
+
+        Examples:
+            >>> compose = Compose([RandomFlip(), RandomPerspective()])
+            >>> compose.append(RandomHSV())
+        """
+        self.transforms.append(transform)
+
+    def insert(self, index, transform):
+        """
+        Inserts a new transform at a specified index in the existing list of transforms.
+
+        Args:
+            index (int): The index at which to insert the new transform.
+            transform (BaseTransform): The transform object to be inserted.
+
+        Examples:
+            >>> compose = Compose([Transform1(), Transform2()])
+            >>> compose.insert(1, Transform3())
+            >>> len(compose.transforms)
+            3
+        """
+        self.transforms.insert(index, transform)
+
+    def __getitem__(self, index: Union[list, int]) -> "Compose":
+        """
+        Retrieves a specific transform or a set of transforms using indexing.
+
+        Args:
+            index (int | List[int]): Index or list of indices of the transforms to retrieve.
+
+        Returns:
+            (Compose): A new Compose object containing the selected transform(s).
+
+        Raises:
+            AssertionError: If the index is not of type int or list.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(10), RandomHSV(0.5, 0.5, 0.5)]
+            >>> compose = Compose(transforms)
+            >>> single_transform = compose[1]  # Returns a Compose object with only RandomPerspective
+            >>> multiple_transforms = compose[0:2]  # Returns a Compose object with RandomFlip and RandomPerspective
+        """
+        assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
+        index = [index] if isinstance(index, int) else index
+        return Compose([self.transforms[i] for i in index])
+
+    def __setitem__(self, index: Union[list, int], value: Union[list, int]) -> None:
+        """
+        Sets one or more transforms in the composition using indexing.
+
+        Args:
+            index (int | List[int]): Index or list of indices to set transforms at.
+            value (Any | List[Any]): Transform or list of transforms to set at the specified index(es).
+
+        Raises:
+            AssertionError: If index type is invalid, value type doesn't match index type, or index is out of range.
+
+        Examples:
+            >>> compose = Compose([Transform1(), Transform2(), Transform3()])
+            >>> compose[1] = NewTransform()  # Replace second transform
+            >>> compose[0:2] = [NewTransform1(), NewTransform2()]  # Replace first two transforms
+        """
+        assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
+        if isinstance(index, list):
+            assert isinstance(value, list), (
+                f"The indices should be the same type as values, but got {type(index)} and {type(value)}"
+            )
+        if isinstance(index, int):
+            index, value = [index], [value]
+        for i, v in zip(index, value):
+            assert i < len(self.transforms), f"list index {i} out of range {len(self.transforms)}."
+            self.transforms[i] = v
+
+    def tolist(self):
+        """
+        Converts the list of transforms to a standard Python list.
+
+        Returns:
+            (List): A list containing all the transform objects in the Compose instance.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(10), CenterCrop()]
+            >>> compose = Compose(transforms)
+            >>> transform_list = compose.tolist()
+            >>> print(len(transform_list))
+            3
+        """
+        return self.transforms
+
+    def __repr__(self):
+        """
+        Returns a string representation of the Compose object.
+
+        Returns:
+            (str): A string representation of the Compose object, including the list of transforms.
+
+        Examples:
+            >>> transforms = [RandomFlip(), RandomPerspective(degrees=10, translate=0.1, scale=0.1)]
+            >>> compose = Compose(transforms)
+            >>> print(compose)
+            Compose([
+                RandomFlip(),
+                RandomPerspective(degrees=10, translate=0.1, scale=0.1)
+            ])
+        """
+        return f"{self.__class__.__name__}({', '.join([f'{t}' for t in self.transforms])})"
+
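A minimal sketch of the Compose behaviour documented above, using two hypothetical stand-in callables instead of real transforms:

    from ultralytics.data.augment import Compose

    double = lambda x: x * 2
    increment = lambda x: x + 1

    pipeline = Compose([double, increment])
    print(pipeline(3))  # applied in order: (3 * 2) + 1 = 7
    print(pipeline[0](3))  # indexing returns a new Compose containing only `double` -> 6
    pipeline.append(lambda x: x - 1)
    print(len(pipeline.tolist()))  # 3 transforms after append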
+
+class BaseMixTransform:
+    """
+    Base class for mix transformations like MixUp and Mosaic.
+
+    This class provides a foundation for implementing mix transformations on datasets. It handles the
+    probability-based application of transforms and manages the mixing of multiple images and labels.
+
+    Attributes:
+        dataset (Any): The dataset object containing images and labels.
+        pre_transform (Callable | None): Optional transform to apply before mixing.
+        p (float): Probability of applying the mix transformation.
+
+    Methods:
+        __call__: Applies the mix transformation to the input labels.
+        _mix_transform: Abstract method to be implemented by subclasses for specific mix operations.
+        get_indexes: Abstract method to get indexes of images to be mixed.
+        _update_label_text: Updates label text for mixed images.
+
+    Examples:
+        >>> class CustomMixTransform(BaseMixTransform):
+        ...     def _mix_transform(self, labels):
+        ...         # Implement custom mix logic here
+        ...         return labels
+        ...
+        ...     def get_indexes(self):
+        ...         return [random.randint(0, len(self.dataset) - 1) for _ in range(3)]
+        >>> dataset = YourDataset()
+        >>> transform = CustomMixTransform(dataset, p=0.5)
+        >>> mixed_labels = transform(original_labels)
+    """
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """
+        Initializes the BaseMixTransform object for mix transformations like MixUp and Mosaic.
+
+        This class serves as a base for implementing mix transformations in image processing pipelines.
+
+        Args:
+            dataset (Any): The dataset object containing images and labels for mixing.
+            pre_transform (Callable | None): Optional transform to apply before mixing.
+            p (float): Probability of applying the mix transformation. Should be in the range [0.0, 1.0].
+
+        Examples:
+            >>> dataset = YOLODataset("path/to/data")
+            >>> pre_transform = Compose([RandomFlip(), RandomPerspective()])
+            >>> mix_transform = BaseMixTransform(dataset, pre_transform, p=0.5)
+        """
+        self.dataset = dataset
+        self.pre_transform = pre_transform
+        self.p = p
+
+    def __call__(self, labels):
+        """
+        Applies pre-processing transforms and mixup/mosaic transforms to labels data.
+
+        This method determines whether to apply the mix transform based on a probability factor. If applied, it
+        selects additional images, applies pre-transforms if specified, and then performs the mix transform.
+
+        Args:
+            labels (Dict): A dictionary containing label data for an image.
+
+        Returns:
+            (Dict): The transformed labels dictionary, which may include mixed data from other images.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset, pre_transform=None, p=0.5)
+            >>> result = transform({"image": img, "bboxes": boxes, "cls": classes})
+        """
+        if random.uniform(0, 1) > self.p:
+            return labels
+
+        # Get the indexes of one or more other images
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
+
+        # Get the image and label information to be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
+
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
+
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
+
+    def _mix_transform(self, labels):
+        """
+        Applies MixUp or Mosaic augmentation to the label dictionary.
+
+        This method should be implemented by subclasses to perform specific mix transformations like MixUp or
+        Mosaic. It modifies the input label dictionary in-place with the augmented data.
+
+        Args:
+            labels (Dict): A dictionary containing image and label data. Expected to have a 'mix_labels' key
+                with a list of additional image and label data for mixing.
+
+        Returns:
+            (Dict): The modified labels dictionary with augmented data after applying the mix transform.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset)
+            >>> labels = {"image": img, "bboxes": boxes, "mix_labels": [{"image": img2, "bboxes": boxes2}]}
+            >>> augmented_labels = transform._mix_transform(labels)
+        """
+        raise NotImplementedError
+
+    def get_indexes(self):
+        """
+        Gets a list of shuffled indexes for mosaic augmentation.
+
+        Returns:
+            (List[int]): A list of shuffled indexes from the dataset.
+
+        Examples:
+            >>> transform = BaseMixTransform(dataset)
+            >>> indexes = transform.get_indexes()
+            >>> print(indexes)  # [3, 18, 7, 2]
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def _update_label_text(labels):
+        """
+        Updates label text and class IDs for mixed labels in image augmentation.
+
+        This method processes the 'texts' and 'cls' fields of the input labels dictionary and any mixed labels,
+        creating a unified set of text labels and updating class IDs accordingly.
+
+        Args:
+            labels (Dict): A dictionary containing label information, including 'texts' and 'cls' fields,
+                and optionally a 'mix_labels' field with additional label dictionaries.
+
+        Returns:
+            (Dict): The updated labels dictionary with unified text labels and updated class IDs.
+
+        Examples:
+            >>> labels = {
+            ...     "texts": [["cat"], ["dog"]],
+            ...     "cls": torch.tensor([[0], [1]]),
+            ...     "mix_labels": [{"texts": [["bird"], ["fish"]], "cls": torch.tensor([[0], [1]])}],
+            ... }
+            >>> updated_labels = self._update_label_text(labels)
+            >>> print(updated_labels["texts"])
+            [['cat'], ['dog'], ['bird'], ['fish']]
+            >>> print(updated_labels["cls"])
+            tensor([[0],
+                    [1]])
+            >>> print(updated_labels["mix_labels"][0]["cls"])
+            tensor([[2],
+                    [3]])
+        """
+        if "texts" not in labels:
+            return labels
+
+        mix_texts = sum([labels["texts"]] + [x["texts"] for x in labels["mix_labels"]], [])
+        mix_texts = list({tuple(x) for x in mix_texts})
+        text2id = {text: i for i, text in enumerate(mix_texts)}
+
+        for label in [labels] + labels["mix_labels"]:
+            for i, cls in enumerate(label["cls"].squeeze(-1).tolist()):
+                text = label["texts"][int(cls)]
+                label["cls"][i] = text2id[tuple(text)]
+            label["texts"] = mix_texts
+        return labels
+
+
+class Mosaic(BaseMixTransform):
+    """
+    Mosaic augmentation for image datasets.
+
+    This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
+    The augmentation is applied to a dataset with a given probability.
+
+    Attributes:
+        dataset: The dataset on which the mosaic augmentation is applied.
+        imgsz (int): Image size (height and width) after mosaic pipeline of a single image.
+        p (float): Probability of applying the mosaic augmentation. Must be in the range 0-1.
+        n (int): The grid size, either 4 (for 2x2) or 9 (for 3x3).
+        border (Tuple[int, int]): Border size for width and height.
+
+    Methods:
+        get_indexes: Returns a list of random indexes from the dataset.
+        _mix_transform: Applies mosaic transformation to the input image and labels.
+        _mosaic3: Creates a 1x3 image mosaic.
+        _mosaic4: Creates a 2x2 image mosaic.
+        _mosaic9: Creates a 3x3 image mosaic.
+        _update_labels: Updates labels with padding.
+        _cat_labels: Concatenates labels and clips mosaic border instances.
+
+    Examples:
+        >>> from ultralytics.data.augment import Mosaic
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> mosaic_aug = Mosaic(dataset, imgsz=640, p=0.5, n=4)
+        >>> augmented_labels = mosaic_aug(original_labels)
+    """
+
+    def __init__(self, dataset, imgsz=640, p=1.0, n=4):
+        """
+        Initializes the Mosaic augmentation object.
+
+        This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
+        The augmentation is applied to a dataset with a given probability.
+
+        Args:
+            dataset (Any): The dataset on which the mosaic augmentation is applied.
+            imgsz (int): Image size (height and width) after mosaic pipeline of a single image.
+            p (float): Probability of applying the mosaic augmentation. Must be in the range 0-1.
+            n (int): The grid size, either 4 (for 2x2) or 9 (for 3x3).
+
+        Examples:
+            >>> from ultralytics.data.augment import Mosaic
+            >>> dataset = YourDataset(...)
+            >>> mosaic_aug = Mosaic(dataset, imgsz=640, p=0.5, n=4)
+        """
+        assert 0 <= p <= 1.0, f"The probability should be in range [0, 1], but got {p}."
+        assert n in {4, 9}, "grid must be equal to 4 or 9."
+        super().__init__(dataset=dataset, p=p)
+        self.imgsz = imgsz
+        self.border = (-imgsz // 2, -imgsz // 2)  # width, height
+        self.n = n
+
+    def get_indexes(self, buffer=True):
+        """
+        Returns a list of random indexes from the dataset for mosaic augmentation.
+
+        This method selects random image indexes either from a buffer or from the entire dataset, depending on
+        the 'buffer' parameter. It is used to choose images for creating mosaic augmentations.
+
+        Args:
+            buffer (bool): If True, selects images from the dataset buffer. If False, selects from the entire
+                dataset.
+
+        Returns:
+            (List[int]): A list of random image indexes. The length of the list is n-1, where n is the number
+                of images used in the mosaic (either 3 or 8, depending on whether n is 4 or 9).
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> indexes = mosaic.get_indexes()
+            >>> print(len(indexes))  # Output: 3
+        """
+        if buffer:  # select images from buffer
+            return random.choices(list(self.dataset.buffer), k=self.n - 1)
+        else:  # select any images
+            return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
+
+    def _mix_transform(self, labels):
+        """
+        Applies mosaic augmentation to the input image and labels.
+
+        This method combines multiple images (3, 4, or 9) into a single mosaic image based on the 'n' attribute.
+        It ensures that rectangular annotations are not present and that there are other images available for
+        mosaic augmentation.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys include:
+                - 'rect_shape': Should be None as rect and mosaic are mutually exclusive.
+                - 'mix_labels': A list of dictionaries containing data for other images to be used in the mosaic.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic-augmented image and updated annotations.
+
+        Raises:
+            AssertionError: If 'rect_shape' is not None or if 'mix_labels' is empty.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> augmented_data = mosaic._mix_transform(labels)
+        """
+        assert labels.get("rect_shape", None) is None, "rect and mosaic are mutually exclusive."
+        assert len(labels.get("mix_labels", [])), "There are no other images for mosaic augment."
+        return (
+            self._mosaic3(labels) if self.n == 3 else self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
+        )  # This code is modified for mosaic3 method.
+
+    def _mosaic3(self, labels):
+        """
+        Creates a 1x3 image mosaic by combining three images.
+
+        This method arranges three images in a horizontal layout, with the main image in the center and two
+        additional images on either side. It's part of the Mosaic augmentation technique used in object detection.
+
+        Args:
+            labels (Dict): A dictionary containing image and label information for the main (center) image.
+                Must include 'img' key with the image array, and 'mix_labels' key with a list of two
+                dictionaries containing information for the side images.
+
+        Returns:
+            (Dict): A dictionary with the mosaic image and updated labels. Keys include:
+                - 'img' (np.ndarray): The mosaic image array with shape (H, W, C).
+                - Other keys from the input labels, updated to reflect the new image dimensions.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=3)
+            >>> labels = {
+            ...     "img": np.random.rand(480, 640, 3),
+            ...     "mix_labels": [{"img": np.random.rand(480, 640, 3)} for _ in range(2)],
+            ... }
+            >>> result = mosaic._mosaic3(labels)
+            >>> print(result["img"].shape)
+            (640, 640, 3)
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        for i in range(3):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img3
+            if i == 0:  # center
+                img3 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 3 tiles
+                h0, w0 = h, w
+                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
+            elif i == 1:  # right
+                c = s + w0, s, s + w0 + w, s + h
+            elif i == 2:  # left
+                c = s - w, s + h0 - h, s, s + h0
+
+            padw, padh = c[:2]
+            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coordinates
+
+            img3[y1:y2, x1:x2] = img[y1 - padh :, x1 - padw :]  # img3[ymin:ymax, xmin:xmax]
+            # hp, wp = h, w  # height, width previous for next iteration
+
+            # Labels assuming imgsz*2 mosaic size
+            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+
+        final_labels["img"] = img3[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
+        return final_labels
+
+    def _mosaic4(self, labels):
+        """
+        Creates a 2x2 image mosaic from four input images.
+
+        This method combines four images into a single mosaic image by placing them in a 2x2 grid. It also
+        updates the corresponding labels for each image in the mosaic.
+
+        Args:
+            labels (Dict): A dictionary containing image data and labels for the base image (index 0) and three
+                additional images (indices 1-3) in the 'mix_labels' key.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic image and updated labels. The 'img' key contains the mosaic
+                image as a numpy array, and other keys contain the combined and adjusted labels for all four images.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=4)
+            >>> labels = {
+            ...     "img": np.random.rand(480, 640, 3),
+            ...     "mix_labels": [{"img": np.random.rand(480, 640, 3)} for _ in range(3)],
+            ... }
+            >>> result = mosaic._mosaic4(labels)
+            >>> assert result["img"].shape == (1280, 1280, 3)
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.border)  # mosaic center x, y
+        for i in range(4):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img4
+            if i == 0:  # top left
+                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
+                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
+            elif i == 1:  # top right
+                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
+                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+            elif i == 2:  # bottom left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+            elif i == 3:  # bottom right
+                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+
+            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
+            padw = x1a - x1b
+            padh = y1a - y1b
+
+            labels_patch = self._update_labels(labels_patch, padw, padh)
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+        final_labels["img"] = img4
+        return final_labels
+
+    def _mosaic9(self, labels):
+        """
+        Creates a 3x3 image mosaic from the input image and eight additional images.
+
+        This method combines nine images into a single mosaic image. The input image is placed at the center,
+        and eight additional images from the dataset are placed around it in a 3x3 grid pattern.
+
+        Args:
+            labels (Dict): A dictionary containing the input image and its associated labels. It should have
+                the following keys:
+                - 'img' (numpy.ndarray): The input image.
+                - 'resized_shape' (Tuple[int, int]): The shape of the resized image (height, width).
+                - 'mix_labels' (List[Dict]): A list of dictionaries containing information for the additional
+                  eight images, each with the same structure as the input labels.
+
+        Returns:
+            (Dict): A dictionary containing the mosaic image and updated labels. It includes the following keys:
+                - 'img' (numpy.ndarray): The final mosaic image.
+                - Other keys from the input labels, updated to reflect the new mosaic arrangement.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640, p=1.0, n=9)
+            >>> input_labels = dataset[0]
+            >>> mosaic_result = mosaic._mosaic9(input_labels)
+            >>> mosaic_image = mosaic_result["img"]
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        hp, wp = -1, -1  # height, width previous
+        for i in range(9):
+            labels_patch = labels if i == 0 else labels["mix_labels"][i - 1]
+            # Load image
+            img = labels_patch["img"]
+            h, w = labels_patch.pop("resized_shape")
+
+            # Place img in img9
+            if i == 0:  # center
+                img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 9 tiles
+                h0, w0 = h, w
+                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
+            elif i == 1:  # top
+                c = s, s - h, s + w, s
+            elif i == 2:  # top right
+                c = s + wp, s - h, s + wp + w, s
+            elif i == 3:  # right
+                c = s + w0, s, s + w0 + w, s + h
+            elif i == 4:  # bottom right
+                c = s + w0, s + hp, s + w0 + w, s + hp + h
+            elif i == 5:  # bottom
+                c = s + w0 - w, s + h0, s + w0, s + h0 + h
+            elif i == 6:  # bottom left
+                c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h
+            elif i == 7:  # left
+                c = s - w, s + h0 - h, s, s + h0
+            elif i == 8:  # top left
+                c = s - w, s + h0 - hp - h, s, s + h0 - hp
+
+            padw, padh = c[:2]
+            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coordinates
+
+            # Image
+            img9[y1:y2, x1:x2] = img[y1 - padh :, x1 - padw :]  # img9[ymin:ymax, xmin:xmax]
+            hp, wp = h, w  # height, width previous for next iteration
+
+            # Labels assuming imgsz*2 mosaic size
+            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+
+        final_labels["img"] = img9[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
+        return final_labels
+
+    @staticmethod
+    def _update_labels(labels, padw, padh):
+        """
+        Updates label coordinates with padding values.
+
+        This method adjusts the bounding box coordinates of object instances in the labels by adding padding
+        values. It also denormalizes the coordinates if they were previously normalized.
+
+        Args:
+            labels (Dict): A dictionary containing image and instance information.
+            padw (int): Padding width to be added to the x-coordinates.
+            padh (int): Padding height to be added to the y-coordinates.
+
+        Returns:
+            (Dict): Updated labels dictionary with adjusted instance coordinates.
+
+        Examples:
+            >>> labels = {"img": np.zeros((100, 100, 3)), "instances": Instances(...)}
+            >>> padw, padh = 50, 50
+            >>> updated_labels = Mosaic._update_labels(labels, padw, padh)
+        """
+        nh, nw = labels["img"].shape[:2]
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(nw, nh)
+        labels["instances"].add_padding(padw, padh)
+        return labels
+
+    def _cat_labels(self, mosaic_labels):
+        """
+        Concatenates and processes labels for mosaic augmentation.
+
+        This method combines labels from multiple images used in mosaic augmentation, clips instances to the
+        mosaic border, and removes zero-area boxes.
+
+        Args:
+            mosaic_labels (List[Dict]): A list of label dictionaries for each image in the mosaic.
+
+        Returns:
+            (Dict): A dictionary containing concatenated and processed labels for the mosaic image, including:
+                - im_file (str): File path of the first image in the mosaic.
+                - ori_shape (Tuple[int, int]): Original shape of the first image.
+                - resized_shape (Tuple[int, int]): Shape of the mosaic image (imgsz * 2, imgsz * 2).
+                - cls (np.ndarray): Concatenated class labels.
+                - instances (Instances): Concatenated instance annotations.
+                - mosaic_border (Tuple[int, int]): Mosaic border size.
+                - texts (List[str], optional): Text labels if present in the original labels.
+
+        Examples:
+            >>> mosaic = Mosaic(dataset, imgsz=640)
+            >>> mosaic_labels = [{"cls": np.array([0, 1]), "instances": Instances(...)} for _ in range(4)]
+            >>> result = mosaic._cat_labels(mosaic_labels)
+            >>> print(result.keys())
+            dict_keys(['im_file', 'ori_shape', 'resized_shape', 'cls', 'instances', 'mosaic_border'])
+        """
+        if len(mosaic_labels) == 0:
+            return {}
+        cls = []
+        instances = []
+        imgsz = self.imgsz * 2  # mosaic imgsz
+        for labels in mosaic_labels:
+            cls.append(labels["cls"])
+            instances.append(labels["instances"])
+        # Final labels
+        final_labels = {
+            "im_file": mosaic_labels[0]["im_file"],
+            "ori_shape": mosaic_labels[0]["ori_shape"],
+            "resized_shape": (imgsz, imgsz),
+            "cls": np.concatenate(cls, 0),
+            "instances": Instances.concatenate(instances, axis=0),
+            "mosaic_border": self.border,
+        }
+        final_labels["instances"].clip(imgsz, imgsz)
+        good = final_labels["instances"].remove_zero_area_boxes()
+        final_labels["cls"] = final_labels["cls"][good]
+        if "texts" in mosaic_labels[0]:
+            final_labels["texts"] = mosaic_labels[0]["texts"]
+        return final_labels
+
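A hedged numeric sketch of the centre sampling and top-left placement arithmetic used in _mosaic4 above; no dataset is needed, and the 480x640 resized image shape is hypothetical:

    import random

    s = 640
    border = (-s // 2, -s // 2)  # as set in Mosaic.__init__
    yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in border)  # centre sampled from [320, 960]
    h, w = 480, 640  # hypothetical resized image shape
    # Top-left tile: clip the target region on the 1280x1280 canvas, then take the matching source crop
    x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
    x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
    padw, padh = x1a - x1b, y1a - y1b  # padding later added to the instance coordinates
    print((xc, yc), (x1a, y1a, x2a, y2a), (padw, padh))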
+
+class MixUp(BaseMixTransform):
+    """
+    Applies MixUp augmentation to image datasets.
+
+    This class implements the MixUp augmentation technique as described in the paper "mixup: Beyond Empirical Risk
+    Minimization" (https://arxiv.org/abs/1710.09412). MixUp combines two images and their labels using a random weight.
+
+    Attributes:
+        dataset (Any): The dataset to which MixUp augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before MixUp.
+        p (float): Probability of applying MixUp augmentation.
+
+    Methods:
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies MixUp augmentation to the input labels.
+
+    Examples:
+        >>> from ultralytics.data.augment import MixUp
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> mixup = MixUp(dataset, p=0.5)
+        >>> augmented_labels = mixup(original_labels)
+    """
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """
+        Initializes the MixUp augmentation object.
+
+        MixUp is an image augmentation technique that combines two images by taking a weighted sum of their pixel
+        values and labels. This implementation is designed for use with the Ultralytics YOLO framework.
+
+        Args:
+            dataset (Any): The dataset to which MixUp augmentation will be applied.
+            pre_transform (Callable | None): Optional transform to apply to images before MixUp.
+            p (float): Probability of applying MixUp augmentation to an image. Must be in the range [0, 1].
+
+        Examples:
+            >>> from ultralytics.data.dataset import YOLODataset
+            >>> dataset = YOLODataset("path/to/data.yaml")
+            >>> mixup = MixUp(dataset, pre_transform=None, p=0.5)
+        """
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+
+    def get_indexes(self):
+        """
+        Get a random index from the dataset.
+
+        This method returns a single random index from the dataset, which is used to select an image for MixUp
+        augmentation.
+
+        Returns:
+            (int): A random integer index within the range of the dataset length.
+
+        Examples:
+            >>> mixup = MixUp(dataset)
+            >>> index = mixup.get_indexes()
+            >>> print(index)
+            42
+        """
+        return random.randint(0, len(self.dataset) - 1)
+
+    def _mix_transform(self, labels):
+        """
+        Applies MixUp augmentation to the input labels.
+
+        This method implements the MixUp augmentation technique as described in the paper
+        "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412).
+
+        Args:
+            labels (Dict): A dictionary containing the original image and label information.
+
+        Returns:
+            (Dict): A dictionary containing the mixed-up image and combined label information.
+
+        Examples:
+            >>> mixer = MixUp(dataset)
+            >>> mixed_labels = mixer._mix_transform(labels)
+        """
+        r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
+        labels2 = labels["mix_labels"][0]
+        labels["img"] = (labels["img"] * r + labels2["img"] * (1 - r)).astype(np.uint8)
+        labels["instances"] = Instances.concatenate([labels["instances"], labels2["instances"]], axis=0)
+        labels["cls"] = np.concatenate([labels["cls"], labels2["cls"]], 0)
+        return labels
+
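A tiny numeric sketch of the MixUp blend performed in _mix_transform above, with two constant-valued images standing in for real samples:

    import numpy as np

    rng = np.random.default_rng(0)
    img1 = np.full((4, 4, 3), 200, dtype=np.uint8)
    img2 = np.full((4, 4, 3), 50, dtype=np.uint8)
    r = rng.beta(32.0, 32.0)  # Beta(32, 32) is concentrated near 0.5
    mixed = (img1 * r + img2 * (1 - r)).astype(np.uint8)
    print(round(r, 3), mixed[0, 0])  # blended pixel is roughly the average of the two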
+
+class RandomPerspective:
+    """
+    Implements random perspective and affine transformations on images and corresponding annotations.
+
+    This class applies random rotations, translations, scaling, shearing, and perspective transformations
+    to images and their associated bounding boxes, segments, and keypoints. It can be used as part of an
+    augmentation pipeline for object detection and instance segmentation tasks.
+
+    Attributes:
+        degrees (float): Maximum absolute degree range for random rotations.
+        translate (float): Maximum translation as a fraction of the image size.
+        scale (float): Scaling factor range, e.g., scale=0.1 means 0.9-1.1.
+        shear (float): Maximum shear angle in degrees.
+        perspective (float): Perspective distortion factor.
+        border (Tuple[int, int]): Mosaic border size as (x, y).
+        pre_transform (Callable | None): Optional transform to apply before the random perspective.
+
+    Methods:
+        affine_transform: Applies affine transformations to the input image.
+        apply_bboxes: Transforms bounding boxes using the affine matrix.
+        apply_segments: Transforms segments and generates new bounding boxes.
+        apply_keypoints: Transforms keypoints using the affine matrix.
+        __call__: Applies the random perspective transformation to images and annotations.
+        box_candidates: Filters transformed bounding boxes based on size and aspect ratio.
+
+    Examples:
+        >>> transform = RandomPerspective(degrees=10, translate=0.1, scale=0.1, shear=10)
+        >>> image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+        >>> labels = {"img": image, "cls": np.array([0, 1]), "instances": Instances(...)}
+        >>> result = transform(labels)
+        >>> transformed_image = result["img"]
+        >>> transformed_instances = result["instances"]
+    """
+
+    def __init__(
+        self, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, border=(0, 0), pre_transform=None
+    ):
+        """
+        Initializes RandomPerspective object with transformation parameters.
+
+        This class implements random perspective and affine transformations on images and corresponding bounding boxes,
+        segments, and keypoints. Transformations include rotation, translation, scaling, and shearing.
+
+        Args:
+            degrees (float): Degree range for random rotations.
+            translate (float): Fraction of total width and height for random translation.
+            scale (float): Scaling factor interval, e.g., a scale factor of 0.5 allows a resize between 50%-150%.
+            shear (float): Shear intensity (angle in degrees).
+            perspective (float): Perspective distortion factor.
+            border (Tuple[int, int]): Tuple specifying mosaic border (top/bottom, left/right).
+            pre_transform (Callable | None): Function/transform to apply to the image before starting the random
+                transformation.
+
+        Examples:
+            >>> transform = RandomPerspective(degrees=10.0, translate=0.1, scale=0.5, shear=5.0)
+            >>> result = transform(labels)  # Apply random perspective to labels
+        """
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.perspective = perspective
+        self.border = border  # mosaic border
+        self.pre_transform = pre_transform
+
+    def affine_transform(self, img, border):
+        """
+        Applies a sequence of affine transformations centered around the image center.
+
+        This function performs a series of geometric transformations on the input image, including
+        translation, perspective change, rotation, scaling, and shearing. The transformations are
+        applied in a specific order to maintain consistency.
+
+        Args:
+            img (np.ndarray): Input image to be transformed.
+            border (Tuple[int, int]): Border dimensions for the transformed image.
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray, float]): A tuple containing:
+                - np.ndarray: Transformed image.
+                - np.ndarray: 3x3 transformation matrix.
+                - float: Scale factor applied during the transformation.
+
+        Examples:
+            >>> import numpy as np
+            >>> img = np.random.rand(100, 100, 3)
+            >>> border = (10, 10)
+            >>> transformed_img, matrix, scale = affine_transform(img, border)
+        """
+        # Center
+        C = np.eye(3, dtype=np.float32)
+
+        C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
+        C[1, 2] = -img.shape[0] / 2  # y translation (pixels)
+
+        # Perspective
+        P = np.eye(3, dtype=np.float32)
+        P[2, 0] = random.uniform(-self.perspective, self.perspective)  # x perspective (about y)
+        P[2, 1] = random.uniform(-self.perspective, self.perspective)  # y perspective (about x)
+
+        # Rotation and Scale
+        R = np.eye(3, dtype=np.float32)
+        a = random.uniform(-self.degrees, self.degrees)
+        # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
+        s = random.uniform(1 - self.scale, 1 + self.scale)
+        # s = 2 ** random.uniform(-scale, scale)
+        R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+        # Shear
+        S = np.eye(3, dtype=np.float32)
+        S[0, 1] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # x shear (deg)
+        S[1, 0] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # y shear (deg)
+
+        # Translation
+        T = np.eye(3, dtype=np.float32)
+        T[0, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[0]  # x translation (pixels)
+        T[1, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[1]  # y translation (pixels)
+
+        # Combined rotation matrix
+        M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
+        # Affine image
+        if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+            if self.perspective:
+                img = cv2.warpPerspective(img, M, dsize=self.size, borderValue=(114, 114, 114))
+            else:  # affine
+                img = cv2.warpAffine(img, M[:2], dsize=self.size, borderValue=(114, 114, 114))
+        return img, M, s
+
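A hedged sketch of the matrix composition above with rotation, shear, and perspective switched off, so M reduces to a scale about the image centre followed by a translation (all numbers are hypothetical):

    import numpy as np

    C = np.eye(3); C[0, 2], C[1, 2] = -320, -240  # move origin to the image centre
    R = np.eye(3); R[:2, :2] *= 1.2  # scale only, no rotation
    T = np.eye(3); T[0, 2], T[1, 2] = 320, 240  # translate back towards the output centre
    M = T @ R @ C  # P and S are identity here, so they are omitted
    corner = M @ np.array([0.0, 0.0, 1.0])  # where the original top-left pixel lands
    print(corner[:2])  # [-64. -48.]: scaling about the centre pushes the corner outward
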
+    def apply_bboxes(self, bboxes, M):
+        """
+        Apply affine transformation to bounding boxes.
+
+        This function applies an affine transformation to a set of bounding boxes using the provided
+        transformation matrix.
+
+        Args:
+            bboxes (np.ndarray): Bounding boxes in xyxy format with shape (N, 4), where N is the number
+                of bounding boxes.
+            M (np.ndarray): Affine transformation matrix with shape (3, 3).
+
+        Returns:
+            (np.ndarray): Transformed bounding boxes in xyxy format with shape (N, 4).
+
+        Examples:
+            >>> transform = RandomPerspective()
+            >>> bboxes = np.array([[10.0, 10.0, 20.0, 20.0], [30.0, 30.0, 40.0, 40.0]])
+            >>> M = np.eye(3)
+            >>> transformed_bboxes = transform.apply_bboxes(bboxes, M)
+        """
+        n = len(bboxes)
+        if n == 0:
+            return bboxes
+
+        xy = np.ones((n * 4, 3), dtype=bboxes.dtype)
+        xy[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = xy @ M.T  # transform
+        xy = (xy[:, :2] / xy[:, 2:3] if self.perspective else xy[:, :2]).reshape(n, 8)  # perspective rescale or affine
+
+        # Create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
+
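A short sketch of the corner-based box warp in apply_bboxes above: each xyxy box is expanded to its four corners, transformed by M, and re-boxed via per-box min/max (a pure translation is shown; the numbers are hypothetical):

    import numpy as np

    boxes = np.array([[10.0, 10.0, 20.0, 20.0]])
    M = np.eye(3, dtype=np.float32)
    M[0, 2], M[1, 2] = 5, -3  # translate +5 px in x, -3 px in y
    corners = np.ones((4, 3), dtype=boxes.dtype)
    corners[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2)  # x1y1, x2y2, x1y2, x2y1
    warped = corners @ M.T
    x, y = warped[:, 0], warped[:, 1]
    print(x.min(), y.min(), x.max(), y.max())  # 15.0 7.0 25.0 17.0
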
+    def apply_segments(self, segments, M):
+        """
+        Apply affine transformations to segments and generate new bounding boxes.
+
+        This function applies affine transformations to input segments and generates new bounding boxes based on
+        the transformed segments. It clips the transformed segments to fit within the new bounding boxes.
+
+        Args:
+            segments (np.ndarray): Input segments with shape (N, M, 2), where N is the number of segments and M is the
+                number of points in each segment.
+            M (np.ndarray): Affine transformation matrix with shape (3, 3).
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray]): A tuple containing:
+                - New bounding boxes with shape (N, 4) in xyxy format.
+                - Transformed and clipped segments with shape (N, M, 2).
+
+        Examples:
+            >>> segments = np.random.rand(10, 500, 2)  # 10 segments with 500 points each
+            >>> M = np.eye(3)  # Identity transformation matrix
+            >>> new_bboxes, new_segments = apply_segments(segments, M)
+        """
+        n, num = segments.shape[:2]
+        if n == 0:
+            return [], segments
+
+        xy = np.ones((n * num, 3), dtype=segments.dtype)
+        segments = segments.reshape(-1, 2)
+        xy[:, :2] = segments
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]
+        segments = xy.reshape(n, -1, 2)
+        bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0)
+        segments[..., 0] = segments[..., 0].clip(bboxes[:, 0:1], bboxes[:, 2:3])
+        segments[..., 1] = segments[..., 1].clip(bboxes[:, 1:2], bboxes[:, 3:4])
+        return bboxes, segments
+
+    def apply_keypoints(self, keypoints, M):
+        """
+        Applies affine transformation to keypoints.
+
+        This method transforms the input keypoints using the provided affine transformation matrix. It handles
+        perspective rescaling if necessary and updates the visibility of keypoints that fall outside the image
+        boundaries after transformation.
+
+        Args:
+            keypoints (np.ndarray): Array of keypoints with shape (N, 17, 3), where N is the number of instances,
+                17 is the number of keypoints per instance, and 3 represents (x, y, visibility).
+            M (np.ndarray): 3x3 affine transformation matrix.
+
+        Returns:
+            (np.ndarray): Transformed keypoints array with the same shape as input (N, 17, 3).
+
+        Examples:
+            >>> random_perspective = RandomPerspective()
+            >>> keypoints = np.random.rand(5, 17, 3)  # 5 instances, 17 keypoints each
+            >>> M = np.eye(3)  # Identity transformation
+            >>> transformed_keypoints = random_perspective.apply_keypoints(keypoints, M)
+        """
+        n, nkpt = keypoints.shape[:2]
+        if n == 0:
+            return keypoints
+        xy = np.ones((n * nkpt, 3), dtype=keypoints.dtype)
+        visible = keypoints[..., 2].reshape(n * nkpt, 1)
+        xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2)
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]  # perspective rescale or affine
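+        # Mark keypoints that land outside the output canvas as not visible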
+        out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1])
+        visible[out_mask] = 0
+        return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
+
+    def __call__(self, labels):
+        """
+        Applies random perspective and affine transformations to an image and its associated labels.
+
+        This method performs a series of transformations including rotation, translation, scaling, shearing,
+        and perspective distortion on the input image and adjusts the corresponding bounding boxes, segments,
+        and keypoints accordingly.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations.
+                Must include:
+                    'img' (ndarray): The input image.
+                    'cls' (ndarray): Class labels.
+                    'instances' (Instances): Object instances with bounding boxes, segments, and keypoints.
+                May include:
+                    'mosaic_border' (Tuple[int, int]): Border size for mosaic augmentation.
+
+        Returns:
+            (Dict): Transformed labels dictionary containing:
+                - 'img' (np.ndarray): The transformed image.
+                - 'cls' (np.ndarray): Updated class labels.
+                - 'instances' (Instances): Updated object instances.
+                - 'resized_shape' (Tuple[int, int]): New image shape after transformation.
+
+        Examples:
+            >>> transform = RandomPerspective()
+            >>> image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+            >>> labels = {
+            ...     "img": image,
+            ...     "cls": np.array([0, 1, 2]),
+            ...     "instances": Instances(bboxes=np.array([[10, 10, 50, 50], [100, 100, 150, 150]])),
+            ... }
+            >>> result = transform(labels)
+            >>> assert result["img"].shape[:2] == result["resized_shape"]
+        """
+        if self.pre_transform and "mosaic_border" not in labels:
+            labels = self.pre_transform(labels)
+        labels.pop("ratio_pad", None)  # do not need ratio pad
+
+        img = labels["img"]
+        cls = labels["cls"]
+        instances = labels.pop("instances")
+        # Make sure the coord formats are right
+        instances.convert_bbox(format="xyxy")
+        instances.denormalize(*img.shape[:2][::-1])
+
+        border = labels.pop("mosaic_border", self.border)
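+        # mosaic_border values are negative, shrinking the output canvas from mosaic size back to the target size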
+        self.size = img.shape[1] + border[1] * 2, img.shape[0] + border[0] * 2  # w, h
+        # M is affine matrix
+        # Scale for func:`box_candidates`
+        img, M, scale = self.affine_transform(img, border)
+
+        bboxes = self.apply_bboxes(instances.bboxes, M)
+
+        segments = instances.segments
+        keypoints = instances.keypoints
+        # Update bboxes if there are segments.
+        if len(segments):
+            bboxes, segments = self.apply_segments(segments, M)
+
+        if keypoints is not None:
+            keypoints = self.apply_keypoints(keypoints, M)
+        new_instances = Instances(bboxes, segments, keypoints, bbox_format="xyxy", normalized=False)
+        # Clip
+        new_instances.clip(*self.size)
+
+        # Filter instances
+        instances.scale(scale_w=scale, scale_h=scale, bbox_only=True)
+        # Make the bboxes have the same scale as new_bboxes
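+        # Use a looser area threshold (0.01) when segments exist; polygon-derived boxes can shrink more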
+        i = self.box_candidates(
+            box1=instances.bboxes.T, box2=new_instances.bboxes.T, area_thr=0.01 if len(segments) else 0.10
+        )
+        labels["instances"] = new_instances[i]
+        labels["cls"] = cls[i]
+        labels["img"] = img
+        labels["resized_shape"] = img.shape[:2]
+        return labels
+
+    @staticmethod
+    def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):
+        """
+        Compute candidate boxes for further processing based on size and aspect ratio criteria.
+
+        This method compares boxes before and after augmentation to determine if they meet specified
+        thresholds for width, height, aspect ratio, and area. It's used to filter out boxes that have
+        been overly distorted or reduced by the augmentation process.
+
+        Args:
+            box1 (numpy.ndarray): Original boxes before augmentation, shape (4, N) where N is the
+                number of boxes. Format is [x1, y1, x2, y2] in absolute coordinates.
+            box2 (numpy.ndarray): Augmented boxes after transformation, shape (4, N). Format is
+                [x1, y1, x2, y2] in absolute coordinates.
+            wh_thr (float): Width and height threshold in pixels. Boxes smaller than this in either
+                dimension are rejected.
+            ar_thr (float): Aspect ratio threshold. Boxes with an aspect ratio greater than this
+                value are rejected.
+            area_thr (float): Area ratio threshold. Boxes with an area ratio (new/old) less than
+                this value are rejected.
+            eps (float): Small epsilon value to prevent division by zero.
+
+        Returns:
+            (numpy.ndarray): Boolean array of shape (N,) indicating which boxes are candidates.
+                True values correspond to boxes that meet all criteria.
+
+        Examples:
+            >>> random_perspective = RandomPerspective()
+            >>> box1 = np.array([[0, 0, 100, 100], [0, 0, 50, 50]]).T
+            >>> box2 = np.array([[10, 10, 90, 90], [5, 5, 45, 45]]).T
+            >>> candidates = random_perspective.box_candidates(box1, box2)
+            >>> print(candidates)
+            [ True  True]
+        """
+        w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+        w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+        ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
+        return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr)  # candidates
+
+
+class RandomHSV:
+    """
+    Randomly adjusts the Hue, Saturation, and Value (HSV) channels of an image.
+
+    This class applies random HSV augmentation to images within predefined limits set by hgain, sgain, and vgain.
+
+    Attributes:
+        hgain (float): Maximum variation for hue. Range is typically [0, 1].
+        sgain (float): Maximum variation for saturation. Range is typically [0, 1].
+        vgain (float): Maximum variation for value. Range is typically [0, 1].
+
+    Methods:
+        __call__: Applies random HSV augmentation to an image.
+
+    Examples:
+        >>> import numpy as np
+        >>> from ultralytics.data.augment import RandomHSV
+        >>> augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+        >>> image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+        >>> labels = {"img": image}
+        >>> augmented_labels = augmenter(labels)
+        >>> augmented_image = augmented_labels["img"]
+    """
+
+    def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
+        """
+        Initializes the RandomHSV object for random HSV (Hue, Saturation, Value) augmentation.
+
+        This class applies random adjustments to the HSV channels of an image within specified limits.
+
+        Args:
+            hgain (float): Maximum variation for hue. Should be in the range [0, 1].
+            sgain (float): Maximum variation for saturation. Should be in the range [0, 1].
+            vgain (float): Maximum variation for value. Should be in the range [0, 1].
+
+        Examples:
+            >>> hsv_aug = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+            >>> augmented_labels = hsv_aug({"img": image})
+        """
+        self.hgain = hgain
+        self.sgain = sgain
+        self.vgain = vgain
+
+    def __call__(self, labels):
+        """
+        Applies random HSV augmentation to an image within predefined limits.
+
+        This method modifies the input image by randomly adjusting its Hue, Saturation, and Value (HSV) channels.
+        The adjustments are made within the limits set by hgain, sgain, and vgain during initialization.
+
+        Args:
+            labels (Dict): A dictionary containing image data and metadata. Must include an 'img' key with
+                the image as a numpy array.
+
+        Returns:
+            (Dict): The input 'labels' dictionary with the 'img' key updated in-place to the HSV-augmented image.
+
+        Examples:
+            >>> hsv_augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
+            >>> labels = {"img": np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)}
+            >>> hsv_augmenter(labels)
+            >>> augmented_img = labels["img"]
+        """
+        img = labels["img"]
+        if self.hgain or self.sgain or self.vgain:
+            r = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1  # random gains
+            hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+            dtype = img.dtype  # uint8
+
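+            # Build per-channel lookup tables; hue wraps modulo 180 since OpenCV uint8 hue lies in [0, 180)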
+            x = np.arange(0, 256, dtype=r.dtype)
+            lut_hue = ((x * r[0]) % 180).astype(dtype)
+            lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+            lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+            im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
+            cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed
+        return labels
+
+
+class RandomFlip:
+    """
+    Applies a random horizontal or vertical flip to an image with a given probability.
+
+    This class performs random image flipping and updates corresponding instance annotations such as
+    bounding boxes and keypoints.
+
+    Attributes:
+        p (float): Probability of applying the flip. Must be between 0 and 1.
+        direction (str): Direction of flip, either 'horizontal' or 'vertical'.
+        flip_idx (array-like): Index mapping for flipping keypoints, if applicable.
+
+    Methods:
+        __call__: Applies the random flip transformation to an image and its annotations.
+
+    Examples:
+        >>> transform = RandomFlip(p=0.5, direction="horizontal")
+        >>> result = transform({"img": image, "instances": instances})
+        >>> flipped_image = result["img"]
+        >>> flipped_instances = result["instances"]
+    """
+
+    def __init__(self, p=0.5, direction="horizontal", flip_idx=None) -> None:
+        """
+        Initializes the RandomFlip class with probability and direction.
+
+        This class applies a random horizontal or vertical flip to an image with a given probability.
+        It also updates any instances (bounding boxes, keypoints, etc.) accordingly.
+
+        Args:
+            p (float): The probability of applying the flip. Must be between 0 and 1.
+            direction (str): The direction to apply the flip. Must be 'horizontal' or 'vertical'.
+            flip_idx (List[int] | None): Index mapping for flipping keypoints, if any.
+
+        Raises:
+            AssertionError: If direction is not 'horizontal' or 'vertical', or if p is not between 0 and 1.
+
+        Examples:
+            >>> flip = RandomFlip(p=0.5, direction="horizontal")
+            >>> flip_with_idx = RandomFlip(p=0.7, direction="vertical", flip_idx=[1, 0, 3, 2, 5, 4])
+        """
+        assert direction in {"horizontal", "vertical"}, f"direction must be 'horizontal' or 'vertical', got {direction}"
+        assert 0 <= p <= 1.0, f"The probability should be in range [0, 1], but got {p}."
+
+        self.p = p
+        self.direction = direction
+        self.flip_idx = flip_idx
+
+    def __call__(self, labels):
+        """
+        Applies random flip to an image and updates any instances like bounding boxes or keypoints accordingly.
+
+        This method randomly flips the input image either horizontally or vertically based on the initialized
+        probability and direction. It also updates the corresponding instances (bounding boxes, keypoints) to
+        match the flipped image.
+
+        Args:
+            labels (Dict): A dictionary containing the following keys:
+                'img' (numpy.ndarray): The image to be flipped.
+                'instances' (ultralytics.utils.instance.Instances): An object containing bounding boxes and
+                    optionally keypoints.
+
+        Returns:
+            (Dict): The same dictionary with the flipped image and updated instances:
+                'img' (numpy.ndarray): The flipped image.
+                'instances' (ultralytics.utils.instance.Instances): Updated instances matching the flipped image.
+
+        Examples:
+            >>> labels = {"img": np.random.rand(640, 640, 3), "instances": Instances(...)}
+            >>> random_flip = RandomFlip(p=0.5, direction="horizontal")
+            >>> flipped_labels = random_flip(labels)
+        """
+        img = labels["img"]
+        instances = labels.pop("instances")
+        instances.convert_bbox(format="xywh")
+        h, w = img.shape[:2]
+        h = 1 if instances.normalized else h
+        w = 1 if instances.normalized else w
+
+        # Flip up-down
+        if self.direction == "vertical" and random.random() < self.p:
+            img = np.flipud(img)
+            instances.flipud(h)
+        if self.direction == "horizontal" and random.random() < self.p:
+            img = np.fliplr(img)
+            instances.fliplr(w)
+            # For keypoints
+            if self.flip_idx is not None and instances.keypoints is not None:
+                instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :])
+        labels["img"] = np.ascontiguousarray(img)
+        labels["instances"] = instances
+        return labels
+
+
+class LetterBox:
+    """
+    Resize image and pad for detection, instance segmentation, and pose estimation.
+
+    This class resizes and pads images to a specified shape while preserving aspect ratio. It also updates
+    corresponding labels and bounding boxes.
+
+    Attributes:
+        new_shape (tuple): Target shape (height, width) for resizing.
+        auto (bool): Whether to use minimum rectangle.
+        scaleFill (bool): Whether to stretch the image to new_shape.
+        scaleup (bool): Whether to allow scaling up. If False, only scale down.
+        stride (int): Stride for rounding padding.
+        center (bool): Whether to center the image or align to top-left.
+
+    Methods:
+        __call__: Resize and pad image, update labels and bounding boxes.
+
+    Examples:
+        >>> transform = LetterBox(new_shape=(640, 640))
+        >>> result = transform(labels)
+        >>> resized_img = result["img"]
+        >>> updated_instances = result["instances"]
+    """
+
+    def __init__(self, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, center=True, stride=32):
+        """
+        Initialize LetterBox object for resizing and padding images.
+
+        This class is designed to resize and pad images for object detection, instance segmentation, and pose estimation
+        tasks. It supports various resizing modes including auto-sizing, scale-fill, and letterboxing.
+
+        Args:
+            new_shape (Tuple[int, int]): Target size (height, width) for the resized image.
+            auto (bool): If True, use minimum rectangle to resize. If False, use new_shape directly.
+            scaleFill (bool): If True, stretch the image to new_shape without padding.
+            scaleup (bool): If True, allow scaling up. If False, only scale down.
+            center (bool): If True, center the placed image. If False, place image in top-left corner.
+            stride (int): Stride of the model (e.g., 32 for YOLOv5).
+
+        Attributes:
+            new_shape (Tuple[int, int]): Target size for the resized image.
+            auto (bool): Flag for using minimum rectangle resizing.
+            scaleFill (bool): Flag for stretching image without padding.
+            scaleup (bool): Flag for allowing upscaling.
+            stride (int): Stride value for ensuring image size is divisible by stride.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, stride=32)
+            >>> resized_img = letterbox(image=original_img)
+        """
+        self.new_shape = new_shape
+        self.auto = auto
+        self.scaleFill = scaleFill
+        self.scaleup = scaleup
+        self.stride = stride
+        self.center = center  # Put the image in the middle or top-left
+
+    def __call__(self, labels=None, image=None):
+        """
+        Resizes and pads an image for object detection, instance segmentation, or pose estimation tasks.
+
+        This method applies letterboxing to the input image, which involves resizing the image while maintaining its
+        aspect ratio and adding padding to fit the new shape. It also updates any associated labels accordingly.
+
+        Args:
+            labels (Dict | None): A dictionary containing image data and associated labels, or empty dict if None.
+            image (np.ndarray | None): The input image as a numpy array. If None, the image is taken from 'labels'.
+
+        Returns:
+            (Dict | np.ndarray): If 'labels' is provided, returns an updated dictionary with the resized and padded
+                image, updated labels, and additional metadata. If 'labels' is empty or None, returns only the resized
+                and padded image as a numpy array.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640))
+            >>> result = letterbox(labels={"img": np.zeros((480, 640, 3)), "instances": Instances(...)})
+            >>> resized_img = result["img"]
+            >>> updated_instances = result["instances"]
+        """
+        if labels is None:
+            labels = {}
+        img = labels.get("img") if image is None else image
+        shape = img.shape[:2]  # current shape [height, width]
+        new_shape = labels.pop("rect_shape", self.new_shape)
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+
+        # Scale ratio (new / old)
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+        if not self.scaleup:  # only scale down, do not scale up (for better val mAP)
+            r = min(r, 1.0)
+
+        # Compute padding
+        ratio = r, r  # width, height ratios
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+        if self.auto:  # minimum rectangle
+            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)  # wh padding
+        elif self.scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (new_shape[1], new_shape[0])
+            ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
+
+        if self.center:
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+
+        if shape[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
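+        # The -0.1/+0.1 offsets split half-pixel padding so the two sides sum to the full dw/dh padding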
+        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+        )  # add border
+        if labels.get("ratio_pad"):
+            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation
+
+        if len(labels):
+            labels = self._update_labels(labels, ratio, left, top)
+            labels["img"] = img
+            labels["resized_shape"] = new_shape
+            return labels
+        else:
+            return img
+
+    @staticmethod
+    def _update_labels(labels, ratio, padw, padh):
+        """
+        Updates labels after applying letterboxing to an image.
+
+        This method modifies the bounding box coordinates of instances in the labels
+        to account for resizing and padding applied during letterboxing.
+
+        Args:
+            labels (Dict): A dictionary containing image labels and instances.
+            ratio (Tuple[float, float]): Scaling ratios (width, height) applied to the image.
+            padw (float): Padding width added to the image.
+            padh (float): Padding height added to the image.
+
+        Returns:
+            (Dict): Updated labels dictionary with modified instance coordinates.
+
+        Examples:
+            >>> letterbox = LetterBox(new_shape=(640, 640))
+            >>> labels = {"instances": Instances(...)}
+            >>> ratio = (0.5, 0.5)
+            >>> padw, padh = 10, 20
+            >>> updated_labels = letterbox._update_labels(labels, ratio, padw, padh)
+        """
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])
+        labels["instances"].scale(*ratio)
+        labels["instances"].add_padding(padw, padh)
+        return labels
+
+
+class CopyPaste(BaseMixTransform):
+    """
+    CopyPaste class for applying Copy-Paste augmentation to image datasets.
+
+    This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
+    Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
+    different images to create new training samples.
+
+    Attributes:
+        dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
+        p (float): Probability of applying Copy-Paste augmentation.
+
+    Methods:
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies Copy-Paste augmentation to the input labels.
+        __call__: Applies the Copy-Paste transformation to images and annotations.
+
+    Examples:
+        >>> from ultralytics.data.augment import CopyPaste
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> copypaste = CopyPaste(dataset, p=0.5)
+        >>> augmented_labels = copypaste(original_labels)
+    """
+
+    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+        """Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+        assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
+        self.mode = mode
+
+    def get_indexes(self):
+        """Returns a list of random indexes from the dataset for CopyPaste augmentation."""
+        return random.randint(0, len(self.dataset) - 1)
+
+    def _mix_transform(self, labels):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        labels2 = labels["mix_labels"][0]
+        return self._transform(labels, labels2)
+
+    def __call__(self, labels):
+        """Applies Copy-Paste augmentation to an image and its labels."""
+        if len(labels["instances"].segments) == 0 or self.p == 0:
+            return labels
+        if self.mode == "flip":
+            return self._transform(labels)
+
+        # Get index of one other image to mix objects from
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
+
+        # Get images information will be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
+
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
+
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
+
+    def _transform(self, labels1, labels2={}):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        im = labels1["img"]
+        cls = labels1["cls"]
+        h, w = im.shape[:2]
+        instances = labels1.pop("instances")
+        instances.convert_bbox(format="xyxy")
+        instances.denormalize(w, h)
+
+        im_new = np.zeros(im.shape, np.uint8)
+        instances2 = labels2.pop("instances", None)
+        if instances2 is None:
+            instances2 = deepcopy(instances)
+            instances2.fliplr(w)
+        ioa = bbox_ioa(instances2.bboxes, instances.bboxes)  # intersection over area, (N, M)
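+        # Keep only source objects whose overlap with every existing object is below 30% of their own area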
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        sorted_idx = np.argsort(ioa.max(1)[indexes])
+        indexes = indexes[sorted_idx]
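+        # Paste up to p * n least-overlapping objects: append their cls/instances and draw their masks into im_new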
+        for j in indexes[: round(self.p * n)]:
+            cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
+            instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
+            cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+        result = labels2.get("img", cv2.flip(im, 1))  # source image to paste from (flipped im in 'flip' mode)
+        i = im_new.astype(bool)
+        im[i] = result[i]
+
+        labels1["img"] = im
+        labels1["cls"] = cls
+        labels1["instances"] = instances
+        return labels1
+
+
+class Albumentations:
+    """
+    Albumentations transformations for image augmentation.
+
+    This class applies various image transformations using the Albumentations library. It includes operations such as
+    Blur, Median Blur, conversion to grayscale, Contrast Limited Adaptive Histogram Equalization (CLAHE), random changes
+    in brightness and contrast, RandomGamma, and image quality reduction through compression.
+
+    Attributes:
+        p (float): Probability of applying the transformations.
+        transform (albumentations.Compose): Composed Albumentations transforms.
+        contains_spatial (bool): Indicates if the transforms include spatial operations.
+
+    Methods:
+        __call__: Applies the Albumentations transformations to the input labels.
+
+    Examples:
+        >>> transform = Albumentations(p=0.5)
+        >>> augmented_labels = transform(labels)
+
+    Notes:
+        - The Albumentations package must be installed to use this class.
+        - If the package is not installed or an error occurs during initialization, the transform will be set to None.
+        - Spatial transforms are handled differently and require special processing for bounding boxes.
+    """
+
+    def __init__(self, p=1.0):
+        """
+        Initialize the Albumentations transform object for YOLO bbox formatted parameters.
+
+        This class applies various image augmentations using the Albumentations library, including Blur, Median Blur,
+        conversion to grayscale, Contrast Limited Adaptive Histogram Equalization, random changes of brightness and
+        contrast, RandomGamma, and image quality reduction through compression.
+
+        Args:
+            p (float): Probability of applying the augmentations. Must be between 0 and 1.
+
+        Attributes:
+            p (float): Probability of applying the augmentations.
+            transform (albumentations.Compose): Composed Albumentations transforms.
+            contains_spatial (bool): Indicates if the transforms include spatial transformations.
+
+        Raises:
+            ImportError: If the Albumentations package is not installed.
+            Exception: For any other errors during initialization.
+
+        Examples:
+            >>> transform = Albumentations(p=0.5)
+            >>> augmented = transform(labels)  # labels dict with 'img', 'cls' and 'instances' keys
+            >>> augmented_image = augmented["img"]
+            >>> augmented_bboxes = augmented["instances"].bboxes
+
+        Notes:
+            - Requires Albumentations version 1.0.3 or higher.
+            - Spatial transforms are handled differently to ensure bbox compatibility.
+            - Some transforms are applied with very low probability (0.01) by default.
+        """
+        self.p = p
+        self.transform = None
+        prefix = colorstr("albumentations: ")
+
+        try:
+            import albumentations as A
+
+            check_version(A.__version__, "1.0.3", hard=True)  # version requirement
+
+            # List of possible spatial transforms
+            spatial_transforms = {
+                "Affine",
+                "BBoxSafeRandomCrop",
+                "CenterCrop",
+                "CoarseDropout",
+                "Crop",
+                "CropAndPad",
+                "CropNonEmptyMaskIfExists",
+                "D4",
+                "ElasticTransform",
+                "Flip",
+                "GridDistortion",
+                "GridDropout",
+                "HorizontalFlip",
+                "Lambda",
+                "LongestMaxSize",
+                "MaskDropout",
+                "MixUp",
+                "Morphological",
+                "NoOp",
+                "OpticalDistortion",
+                "PadIfNeeded",
+                "Perspective",
+                "PiecewiseAffine",
+                "PixelDropout",
+                "RandomCrop",
+                "RandomCropFromBorders",
+                "RandomGridShuffle",
+                "RandomResizedCrop",
+                "RandomRotate90",
+                "RandomScale",
+                "RandomSizedBBoxSafeCrop",
+                "RandomSizedCrop",
+                "Resize",
+                "Rotate",
+                "SafeRotate",
+                "ShiftScaleRotate",
+                "SmallestMaxSize",
+                "Transpose",
+                "VerticalFlip",
+                "XYMasking",
+            }  # from https://albumentations.ai/docs/getting_started/transforms_and_targets/#spatial-level-transforms
+
+            # Transforms
+            T = [
+                A.Blur(p=0.01),
+                A.MedianBlur(p=0.01),
+                A.ToGray(p=0.01),
+                A.CLAHE(p=0.01),
+                A.RandomBrightnessContrast(p=0.0),
+                A.RandomGamma(p=0.0),
+                A.ImageCompression(quality_range=(75, 100), p=0.5),
+            ]
+
+            # Compose transforms
+            self.contains_spatial = any(transform.__class__.__name__ in spatial_transforms for transform in T)
+            self.transform = (
+                A.Compose(T, bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"]))
+                if self.contains_spatial
+                else A.Compose(T)
+            )
+            if hasattr(self.transform, "set_random_seed"):
+                # Required for deterministic transforms in albumentations>=1.4.21
+                self.transform.set_random_seed(torch.initial_seed())
+            LOGGER.info(prefix + ", ".join(f"{x}".replace("always_apply=False, ", "") for x in T if x.p))
+        except ImportError:  # package not installed, skip
+            pass
+        except Exception as e:
+            LOGGER.info(f"{prefix}{e}")
+
+    def __call__(self, labels):
+        """
+        Applies Albumentations transformations to input labels.
+
+        This method applies a series of image augmentations using the Albumentations library. It can perform both
+        spatial and non-spatial transformations on the input image and its corresponding labels.
+
+        Args:
+            labels (Dict): A dictionary containing image data and annotations. Expected keys are:
+                - 'img': numpy.ndarray representing the image
+                - 'cls': numpy.ndarray of class labels
+                - 'instances': object containing bounding boxes and other instance information
+
+        Returns:
+            (Dict): The input dictionary with augmented image and updated annotations.
+
+        Examples:
+            >>> transform = Albumentations(p=0.5)
+            >>> labels = {
+            ...     "img": np.random.rand(640, 640, 3),
+            ...     "cls": np.array([0, 1]),
+            ...     "instances": Instances(bboxes=np.array([[0, 0, 1, 1], [0.5, 0.5, 0.8, 0.8]])),
+            ... }
+            >>> augmented = transform(labels)
+            >>> assert augmented["img"].shape == (640, 640, 3)
+
+        Notes:
+            - The method applies transformations with probability self.p.
+            - Spatial transforms update bounding boxes, while non-spatial transforms only modify the image.
+            - Requires the Albumentations library to be installed.
+        """
+        if self.transform is None or random.random() > self.p:
+            return labels
+
+        if self.contains_spatial:
+            cls = labels["cls"]
+            if len(cls):
+                im = labels["img"]
+                labels["instances"].convert_bbox("xywh")
+                labels["instances"].normalize(*im.shape[:2][::-1])
+                bboxes = labels["instances"].bboxes
+                # TODO: add supports of segments and keypoints
+                new = self.transform(image=im, bboxes=bboxes, class_labels=cls)  # transformed
+                if len(new["class_labels"]) > 0:  # skip update if no bbox in new im
+                    labels["img"] = new["image"]
+                    labels["cls"] = np.array(new["class_labels"])
+                    bboxes = np.array(new["bboxes"], dtype=np.float32)
+                labels["instances"].update(bboxes=bboxes)
+        else:
+            labels["img"] = self.transform(image=labels["img"])["image"]  # transformed
+
+        return labels
+
+
+class Format:
+    """
+    A class for formatting image annotations for object detection, instance segmentation, and pose estimation tasks.
+
+    This class standardizes image and instance annotations to be used by the `collate_fn` in PyTorch DataLoader.
+
+    Attributes:
+        bbox_format (str): Format for bounding boxes. Options are 'xywh' or 'xyxy'.
+        normalize (bool): Whether to normalize bounding boxes.
+        return_mask (bool): Whether to return instance masks for segmentation.
+        return_keypoint (bool): Whether to return keypoints for pose estimation.
+        return_obb (bool): Whether to return oriented bounding boxes.
+        mask_ratio (int): Downsample ratio for masks.
+        mask_overlap (bool): Whether to overlap masks.
+        batch_idx (bool): Whether to keep batch indexes.
+        bgr (float): The probability to return BGR images.
+
+    Methods:
+        __call__: Formats labels dictionary with image, classes, bounding boxes, and optionally masks and keypoints.
+        _format_img: Converts image from Numpy array to PyTorch tensor.
+        _format_segments: Converts polygon points to bitmap masks.
+
+    Examples:
+        >>> formatter = Format(bbox_format="xywh", normalize=True, return_mask=True)
+        >>> formatted_labels = formatter(labels)
+        >>> img = formatted_labels["img"]
+        >>> bboxes = formatted_labels["bboxes"]
+        >>> masks = formatted_labels["masks"]
+    """
+
+    def __init__(
+        self,
+        bbox_format="xywh",
+        normalize=True,
+        return_mask=False,
+        return_keypoint=False,
+        return_obb=False,
+        mask_ratio=4,
+        mask_overlap=True,
+        batch_idx=True,
+        bgr=0.0,
+    ):
+        """
+        Initializes the Format class with given parameters for image and instance annotation formatting.
+
+        This class standardizes image and instance annotations for object detection, instance segmentation, and pose
+        estimation tasks, preparing them for use in PyTorch DataLoader's `collate_fn`.
+
+        Args:
+            bbox_format (str): Format for bounding boxes. Options are 'xywh', 'xyxy', etc.
+            normalize (bool): Whether to normalize bounding boxes to [0,1].
+            return_mask (bool): If True, returns instance masks for segmentation tasks.
+            return_keypoint (bool): If True, returns keypoints for pose estimation tasks.
+            return_obb (bool): If True, returns oriented bounding boxes.
+            mask_ratio (int): Downsample ratio for masks.
+            mask_overlap (bool): If True, allows mask overlap.
+            batch_idx (bool): If True, keeps batch indexes.
+            bgr (float): Probability of returning BGR images instead of RGB.
+
+        Attributes:
+            bbox_format (str): Format for bounding boxes.
+            normalize (bool): Whether bounding boxes are normalized.
+            return_mask (bool): Whether to return instance masks.
+            return_keypoint (bool): Whether to return keypoints.
+            return_obb (bool): Whether to return oriented bounding boxes.
+            mask_ratio (int): Downsample ratio for masks.
+            mask_overlap (bool): Whether masks can overlap.
+            batch_idx (bool): Whether to keep batch indexes.
+            bgr (float): The probability to return BGR images.
+
+        Examples:
+            >>> format = Format(bbox_format="xyxy", return_mask=True, return_keypoint=False)
+            >>> print(format.bbox_format)
+            xyxy
+        """
+        self.bbox_format = bbox_format
+        self.normalize = normalize
+        self.return_mask = return_mask  # set False when training detection only
+        self.return_keypoint = return_keypoint
+        self.return_obb = return_obb
+        self.mask_ratio = mask_ratio
+        self.mask_overlap = mask_overlap
+        self.batch_idx = batch_idx  # keep the batch indexes
+        self.bgr = bgr
+
+    def __call__(self, labels):
+        """
+        Formats image annotations for object detection, instance segmentation, and pose estimation tasks.
+
+        This method standardizes the image and instance annotations to be used by the `collate_fn` in PyTorch
+        DataLoader. It processes the input labels dictionary, converting annotations to the specified format and
+        applying normalization if required.
+
+        Args:
+            labels (Dict): A dictionary containing image and annotation data with the following keys:
+                - 'img': The input image as a numpy array.
+                - 'cls': Class labels for instances.
+                - 'instances': An Instances object containing bounding boxes, segments, and keypoints.
+
+        Returns:
+            (Dict): A dictionary with formatted data, including:
+                - 'img': Formatted image tensor.
+                - 'cls': Class label's tensor.
+                - 'bboxes': Bounding boxes tensor in the specified format.
+                - 'masks': Instance masks tensor (if return_mask is True).
+                - 'keypoints': Keypoints tensor (if return_keypoint is True).
+                - 'batch_idx': Batch index tensor (if batch_idx is True).
+
+        Examples:
+            >>> formatter = Format(bbox_format="xywh", normalize=True, return_mask=True)
+            >>> labels = {"img": np.random.rand(640, 640, 3), "cls": np.array([0, 1]), "instances": Instances(...)}
+            >>> formatted_labels = formatter(labels)
+            >>> print(formatted_labels.keys())
+        """
+        img = labels.pop("img")
+        h, w = img.shape[:2]
+        cls = labels.pop("cls")
+        instances = labels.pop("instances")
+        instances.convert_bbox(format=self.bbox_format)
+        instances.denormalize(w, h)
+        nl = len(instances)
+
+        if self.return_mask:
+            if nl:
+                masks, instances, cls = self._format_segments(instances, cls, w, h)
+                masks = torch.from_numpy(masks)
+            else:
+                masks = torch.zeros(
+                    1 if self.mask_overlap else nl, img.shape[0] // self.mask_ratio, img.shape[1] // self.mask_ratio
+                )
+            labels["masks"] = masks
+        labels["img"] = self._format_img(img)
+        labels["cls"] = torch.from_numpy(cls) if nl else torch.zeros(nl)
+        labels["bboxes"] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
+        if self.return_keypoint:
+            labels["keypoints"] = torch.from_numpy(instances.keypoints)
+            if self.normalize:
+                labels["keypoints"][..., 0] /= w
+                labels["keypoints"][..., 1] /= h
+        if self.return_obb:
+            labels["bboxes"] = (
+                xyxyxyxy2xywhr(torch.from_numpy(instances.segments)) if len(instances.segments) else torch.zeros((0, 5))
+            )
+        # NOTE: need to normalize obb in xywhr format for width-height consistency
+        if self.normalize:
+            labels["bboxes"][:, [0, 2]] /= w
+            labels["bboxes"][:, [1, 3]] /= h
+        # Then we can use collate_fn
+        if self.batch_idx:
+            labels["batch_idx"] = torch.zeros(nl)
+        return labels
+
+    def _format_img(self, img):
+        """
+        Formats an image for YOLO from a Numpy array to a PyTorch tensor.
+
+        This function performs the following operations:
+        1. Ensures the image has 3 dimensions (adds a channel dimension if needed).
+        2. Transposes the image from HWC to CHW format.
+        3. Optionally flips the color channels from RGB to BGR.
+        4. Converts the image to a contiguous array.
+        5. Converts the Numpy array to a PyTorch tensor.
+
+        Args:
+            img (np.ndarray): Input image as a Numpy array with shape (H, W, C) or (H, W).
+
+        Returns:
+            (torch.Tensor): Formatted image as a PyTorch tensor with shape (C, H, W).
+
+        Examples:
+            >>> import numpy as np
+            >>> img = np.random.rand(100, 100, 3)
+            >>> formatted_img = self._format_img(img)
+            >>> print(formatted_img.shape)
+            torch.Size([3, 100, 100])
+        """
+        if len(img.shape) < 3:
+            img = np.expand_dims(img, -1)
+        img = img.transpose(2, 0, 1)
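+        # Reverse channel order (e.g. BGR -> RGB) unless the random draw falls below self.bgr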
+        img = np.ascontiguousarray(img[::-1] if random.uniform(0, 1) > self.bgr else img)
+        img = torch.from_numpy(img)
+        return img
+
+    def _format_segments(self, instances, cls, w, h):
+        """
+        Converts polygon segments to bitmap masks.
+
+        Args:
+            instances (Instances): Object containing segment information.
+            cls (numpy.ndarray): Class labels for each instance.
+            w (int): Width of the image.
+            h (int): Height of the image.
+
+        Returns:
+            masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
+            instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
+            cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
+
+        Notes:
+            - If self.mask_overlap is True, masks are overlapped and sorted by area.
+            - If self.mask_overlap is False, each mask is represented separately.
+            - Masks are downsampled according to self.mask_ratio.
+        """
+        segments = instances.segments
+        if self.mask_overlap:
+            masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio)
+            masks = masks[None]  # (640, 640) -> (1, 640, 640)
+            instances = instances[sorted_idx]
+            cls = cls[sorted_idx]
+        else:
+            masks = polygons2masks((h, w), segments, color=1, downsample_ratio=self.mask_ratio)
+
+        return masks, instances, cls
+
+
+class RandomLoadText:
+    """
+    Randomly samples positive and negative texts and updates class indices accordingly.
+
+    This class is responsible for sampling texts from a given set of class texts, including both positive
+    (present in the image) and negative (not present in the image) samples. It updates the class indices
+    to reflect the sampled texts and can optionally pad the text list to a fixed length.
+
+    Attributes:
+        prompt_format (str): Format string for text prompts.
+        neg_samples (Tuple[int, int]): Range for randomly sampling negative texts.
+        max_samples (int): Maximum number of different text samples in one image.
+        padding (bool): Whether to pad texts to max_samples.
+        padding_value (str): The text used for padding when padding is True.
+
+    Methods:
+        __call__: Processes the input labels and returns updated classes and texts.
+
+    Examples:
+        >>> loader = RandomLoadText(prompt_format="Object: {}", neg_samples=(5, 10), max_samples=20)
+        >>> labels = {"cls": [0, 1, 2], "texts": [["cat"], ["dog"], ["bird"]], "instances": [...]}
+        >>> updated_labels = loader(labels)
+        >>> print(updated_labels["texts"])
+        ['Object: cat', 'Object: dog', 'Object: bird', 'Object: elephant', 'Object: car']
+    """
+
+    def __init__(
+        self,
+        prompt_format: str = "{}",
+        neg_samples: Tuple[int, int] = (80, 80),
+        max_samples: int = 80,
+        padding: bool = False,
+        padding_value: str = "",
+    ) -> None:
+        """
+        Initializes the RandomLoadText class for randomly sampling positive and negative texts.
+
+        This class is designed to randomly sample positive texts and negative texts, and update the class
+        indices accordingly to the number of samples. It can be used for text-based object detection tasks.
+
+        Args:
+            prompt_format (str): Format string for the prompt. Default is '{}'. The format string should
+                contain a single pair of curly braces {} where the text will be inserted.
+            neg_samples (Tuple[int, int]): A range to randomly sample negative texts. The first integer
+                specifies the minimum number of negative samples, and the second integer specifies the
+                maximum. Default is (80, 80).
+            max_samples (int): The maximum number of different text samples in one image. Default is 80.
+            padding (bool): Whether to pad texts to max_samples. If True, the number of texts will always
+                be equal to max_samples. Default is False.
+            padding_value (str): The padding text to use when padding is True. Default is an empty string.
+
+        Attributes:
+            prompt_format (str): The format string for the prompt.
+            neg_samples (Tuple[int, int]): The range for sampling negative texts.
+            max_samples (int): The maximum number of text samples.
+            padding (bool): Whether padding is enabled.
+            padding_value (str): The value used for padding.
+
+        Examples:
+            >>> random_load_text = RandomLoadText(prompt_format="Object: {}", neg_samples=(50, 100), max_samples=120)
+            >>> random_load_text.prompt_format
+            'Object: {}'
+            >>> random_load_text.neg_samples
+            (50, 100)
+            >>> random_load_text.max_samples
+            120
+        """
+        self.prompt_format = prompt_format
+        self.neg_samples = neg_samples
+        self.max_samples = max_samples
+        self.padding = padding
+        self.padding_value = padding_value
+
+    def __call__(self, labels: dict) -> dict:
+        """
+        Randomly samples positive and negative texts and updates class indices accordingly.
+
+        This method samples positive texts based on the existing class labels in the image, and randomly
+        selects negative texts from the remaining classes. It then updates the class indices to match the
+        new sampled text order.
+
+        Args:
+            labels (Dict): A dictionary containing image labels and metadata. Must include 'texts' and 'cls' keys.
+
+        Returns:
+            (Dict): Updated labels dictionary with new 'cls' and 'texts' entries.
+
+        Examples:
+            >>> loader = RandomLoadText(prompt_format="A photo of {}", neg_samples=(5, 10), max_samples=20)
+            >>> labels = {"cls": np.array([[0], [1], [2]]), "texts": [["dog"], ["cat"], ["bird"]]}
+            >>> updated_labels = loader(labels)
+        """
+        assert "texts" in labels, "No texts found in labels."
+        class_texts = labels["texts"]
+        num_classes = len(class_texts)
+        cls = np.asarray(labels.pop("cls"), dtype=int)
+        pos_labels = np.unique(cls).tolist()
+
+        if len(pos_labels) > self.max_samples:
+            pos_labels = random.sample(pos_labels, k=self.max_samples)
+
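+        # Negative texts fill remaining capacity up to max_samples, bounded by the sampled neg_samples range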
+        neg_samples = min(min(num_classes, self.max_samples) - len(pos_labels), random.randint(*self.neg_samples))
+        neg_labels = [i for i in range(num_classes) if i not in pos_labels]
+        neg_labels = random.sample(neg_labels, k=neg_samples)
+
+        sampled_labels = pos_labels + neg_labels
+        random.shuffle(sampled_labels)
+
+        label2ids = {label: i for i, label in enumerate(sampled_labels)}
+        valid_idx = np.zeros(len(labels["instances"]), dtype=bool)
+        new_cls = []
+        for i, label in enumerate(cls.squeeze(-1).tolist()):
+            if label not in label2ids:
+                continue
+            valid_idx[i] = True
+            new_cls.append([label2ids[label]])
+        labels["instances"] = labels["instances"][valid_idx]
+        labels["cls"] = np.array(new_cls)
+
+        # Randomly select one prompt when there is more than one prompt per class
+        texts = []
+        for label in sampled_labels:
+            prompts = class_texts[label]
+            assert len(prompts) > 0
+            prompt = self.prompt_format.format(prompts[random.randrange(len(prompts))])
+            texts.append(prompt)
+
+        if self.padding:
+            valid_labels = len(pos_labels) + len(neg_labels)
+            num_padding = self.max_samples - valid_labels
+            if num_padding > 0:
+                texts += [self.padding_value] * num_padding
+
+        labels["texts"] = texts
+        return labels
+
+
+def v8_transforms(dataset, imgsz, hyp, stretch=False):
+    """
+    Applies a series of image transformations for training.
+
+    This function creates a composition of image augmentation techniques to prepare images for YOLO training.
+    It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
+
+    Args:
+        dataset (Dataset): The dataset object containing image data and annotations.
+        imgsz (int): The target image size for resizing.
+        hyp (Namespace): A dictionary of hyperparameters controlling various aspects of the transformations.
+        stretch (bool): If True, applies stretching to the image. If False, uses LetterBox resizing.
+
+    Returns:
+        (Compose): A composition of image transformations to be applied to the dataset.
+
+    Examples:
+        >>> from ultralytics.data.dataset import YOLODataset
+        >>> from ultralytics.utils import IterableSimpleNamespace
+        >>> dataset = YOLODataset(img_path="path/to/images", imgsz=640)
+        >>> hyp = IterableSimpleNamespace(mosaic=1.0, copy_paste=0.5, degrees=10.0, translate=0.2, scale=0.9)
+        >>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
+        >>> augmented_data = transforms(dataset[0])
+    """
+    mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
+    affine = RandomPerspective(
+        degrees=hyp.degrees,
+        translate=hyp.translate,
+        scale=hyp.scale,
+        shear=hyp.shear,
+        perspective=hyp.perspective,
+        pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
+    )
+
+    pre_transform = Compose([mosaic, affine])
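+    # 'flip' mode mirrors objects within the same image; 'mixup' mode pastes objects from another sampled image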
+    if hyp.copy_paste_mode == "flip":
+        pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
+    else:
+        pre_transform.append(
+            CopyPaste(
+                dataset,
+                pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
+                p=hyp.copy_paste,
+                mode=hyp.copy_paste_mode,
+            )
+        )
+    flip_idx = dataset.data.get("flip_idx", [])  # for keypoints augmentation
+    if dataset.use_keypoints:
+        kpt_shape = dataset.data.get("kpt_shape", None)
+        if len(flip_idx) == 0 and hyp.fliplr > 0.0:
+            hyp.fliplr = 0.0
+            LOGGER.warning("WARNING ⚠️ No 'flip_idx' array defined in data.yaml, setting augmentation 'fliplr=0.0'")
+        elif flip_idx and (len(flip_idx) != kpt_shape[0]):
+            raise ValueError(f"data.yaml flip_idx={flip_idx} length must be equal to kpt_shape[0]={kpt_shape[0]}")
+
+    return Compose(
+        [
+            pre_transform,
+            MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup),
+            Albumentations(p=1.0),
+            RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v),
+            RandomFlip(direction="vertical", p=hyp.flipud),
+            RandomFlip(direction="horizontal", p=hyp.fliplr, flip_idx=flip_idx),
+        ]
+    )  # transforms
+
+
+# Classification augmentations -----------------------------------------------------------------------------------------
+def classify_transforms(
+    size=224,
+    mean=DEFAULT_MEAN,
+    std=DEFAULT_STD,
+    interpolation="BILINEAR",
+    crop_fraction: float = DEFAULT_CROP_FRACTION,
+):
+    """
+    Creates a composition of image transforms for classification tasks.
+
+    This function generates a sequence of torchvision transforms suitable for preprocessing images
+    for classification models during evaluation or inference. The transforms include resizing,
+    center cropping, conversion to tensor, and normalization.
+
+    Args:
+        size (int | tuple): The target size for the transformed image. If an int, it defines the shortest edge. If a
+            tuple, it defines (height, width).
+        mean (tuple): Mean values for each RGB channel used in normalization.
+        std (tuple): Standard deviation values for each RGB channel used in normalization.
+        interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
+        crop_fraction (float): Fraction of the image to be cropped.
+
+    Returns:
+        (torchvision.transforms.Compose): A composition of torchvision transforms.
+
+    Examples:
+        >>> transforms = classify_transforms(size=224)
+        >>> img = Image.open("path/to/image.jpg")
+        >>> transformed_img = transforms(img)
+    """
+    import torchvision.transforms as T  # scope for faster 'import ultralytics'
+
+    if isinstance(size, (tuple, list)):
+        assert len(size) == 2, f"'size' tuples must be length 2, not length {len(size)}"
+        scale_size = tuple(math.floor(x / crop_fraction) for x in size)
+    else:
+        scale_size = math.floor(size / crop_fraction)
+        scale_size = (scale_size, scale_size)
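+    # Resize target is size / crop_fraction so the subsequent CenterCrop keeps crop_fraction of the resized image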
+
+    # Aspect ratio is preserved; a center crop is taken and image content outside the crop is lost, no borders added
+    if scale_size[0] == scale_size[1]:
+        # Simple case, use torchvision built-in Resize with the shortest edge mode (scalar size arg)
+        tfl = [T.Resize(scale_size[0], interpolation=getattr(T.InterpolationMode, interpolation))]
+    else:
+        # Resize the shortest edge to matching target dim for non-square target
+        tfl = [T.Resize(scale_size)]
+    tfl.extend(
+        [
+            T.CenterCrop(size),
+            T.ToTensor(),
+            T.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)),
+        ]
+    )
+    return T.Compose(tfl)
+
+
+# Classification training augmentations --------------------------------------------------------------------------------
+def classify_augmentations(
+    size=224,
+    mean=DEFAULT_MEAN,
+    std=DEFAULT_STD,
+    scale=None,
+    ratio=None,
+    hflip=0.5,
+    vflip=0.0,
+    auto_augment=None,
+    hsv_h=0.015,  # image HSV-Hue augmentation (fraction)
+    hsv_s=0.4,  # image HSV-Saturation augmentation (fraction)
+    hsv_v=0.4,  # image HSV-Value augmentation (fraction)
+    force_color_jitter=False,
+    erasing=0.0,
+    interpolation="BILINEAR",
+):
+    """
+    Creates a composition of image augmentation transforms for classification tasks.
+
+    This function generates a set of image transformations suitable for training classification models. It includes
+    options for resizing, flipping, color jittering, auto augmentation, and random erasing.
+
+    Args:
+        size (int): Target size for the image after transformations.
+        mean (tuple): Mean values for normalization, one per channel.
+        std (tuple): Standard deviation values for normalization, one per channel.
+        scale (tuple | None): Range of size of the origin size cropped.
+        ratio (tuple | None): Range of aspect ratio of the origin aspect ratio cropped.
+        hflip (float): Probability of horizontal flip.
+        vflip (float): Probability of vertical flip.
+        auto_augment (str | None): Auto augmentation policy. Can be 'randaugment', 'augmix', 'autoaugment' or None.
+        hsv_h (float): Image HSV-Hue augmentation factor.
+        hsv_s (float): Image HSV-Saturation augmentation factor.
+        hsv_v (float): Image HSV-Value augmentation factor.
+        force_color_jitter (bool): Whether to apply color jitter even if auto augment is enabled.
+        erasing (float): Probability of random erasing.
+        interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
+
+    Returns:
+        (torchvision.transforms.Compose): A composition of image augmentation transforms.
+
+    Examples:
+        >>> transforms = classify_augmentations(size=224, auto_augment="randaugment")
+        >>> augmented_image = transforms(original_image)
+    """
+    # Torchvision-based transforms (applied here regardless of whether Albumentations is installed)
+    import torchvision.transforms as T  # scope for faster 'import ultralytics'
+
+    if not isinstance(size, int):
+        raise TypeError(f"classify_augmentations() size {size} must be an integer, not a list or tuple")
+    scale = tuple(scale or (0.08, 1.0))  # default imagenet scale range
+    ratio = tuple(ratio or (3.0 / 4.0, 4.0 / 3.0))  # default imagenet ratio range
+    interpolation = getattr(T.InterpolationMode, interpolation)
+    primary_tfl = [T.RandomResizedCrop(size, scale=scale, ratio=ratio, interpolation=interpolation)]
+    if hflip > 0.0:
+        primary_tfl.append(T.RandomHorizontalFlip(p=hflip))
+    if vflip > 0.0:
+        primary_tfl.append(T.RandomVerticalFlip(p=vflip))
+
+    secondary_tfl = []
+    disable_color_jitter = False
+    if auto_augment:
+        assert isinstance(auto_augment, str), f"Provided argument should be string, but got type {type(auto_augment)}"
+        # Color jitter is typically disabled when AutoAugment/RandAugment is enabled;
+        # this allows overriding that behavior without breaking old hyperparameter configs
+        disable_color_jitter = not force_color_jitter
+
+        if auto_augment == "randaugment":
+            if TORCHVISION_0_11:
+                secondary_tfl.append(T.RandAugment(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=randaugment" requires torchvision >= 0.11.0. Disabling it.')
+
+        elif auto_augment == "augmix":
+            if TORCHVISION_0_13:
+                secondary_tfl.append(T.AugMix(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=augmix" requires torchvision >= 0.13.0. Disabling it.')
+
+        elif auto_augment == "autoaugment":
+            if TORCHVISION_0_10:
+                secondary_tfl.append(T.AutoAugment(interpolation=interpolation))
+            else:
+                LOGGER.warning('"auto_augment=autoaugment" requires torchvision >= 0.10.0. Disabling it.')
+
+        else:
+            raise ValueError(
+                f'Invalid auto_augment policy: {auto_augment}. Should be one of "randaugment", '
+                f'"augmix", "autoaugment" or None'
+            )
+
+    if not disable_color_jitter:
+        secondary_tfl.append(T.ColorJitter(brightness=hsv_v, contrast=hsv_v, saturation=hsv_s, hue=hsv_h))
+
+    final_tfl = [
+        T.ToTensor(),
+        T.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)),
+        T.RandomErasing(p=erasing, inplace=True),
+    ]
+
+    return T.Compose(primary_tfl + secondary_tfl + final_tfl)
+
+
+# NOTE: keep this class for backward compatibility
+class ClassifyLetterBox:
+    """
+    A class for resizing and padding images for classification tasks.
+
+    This class is designed to be part of a transformation pipeline, e.g., T.Compose([LetterBox(size), ToTensor()]).
+    It resizes and pads images to a specified size while maintaining the original aspect ratio.
+
+    Attributes:
+        h (int): Target height of the image.
+        w (int): Target width of the image.
+        auto (bool): If True, automatically calculates the short side using stride.
+        stride (int): The stride value, used when 'auto' is True.
+
+    Methods:
+        __call__: Applies the letterbox transformation to an input image.
+
+    Examples:
+        >>> transform = ClassifyLetterBox(size=(640, 640), auto=False, stride=32)
+        >>> img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
+        >>> result = transform(img)
+        >>> print(result.shape)
+        (640, 640, 3)
+    """
+
+    def __init__(self, size=(640, 640), auto=False, stride=32):
+        """
+        Initializes the ClassifyLetterBox object for image preprocessing.
+
+        This class is designed to be part of a transformation pipeline for image classification tasks. It resizes and
+        pads images to a specified size while maintaining the original aspect ratio.
+
+        Args:
+            size (int | Tuple[int, int]): Target size for the letterboxed image. If an int, a square image of
+                (size, size) is created. If a tuple, it should be (height, width).
+            auto (bool): If True, automatically calculates the short side based on stride. Default is False.
+            stride (int): The stride value, used when 'auto' is True. Default is 32.
+
+        Attributes:
+            h (int): Target height of the letterboxed image.
+            w (int): Target width of the letterboxed image.
+            auto (bool): Flag indicating whether to automatically calculate short side.
+            stride (int): Stride value for automatic short side calculation.
+
+        Examples:
+            >>> transform = ClassifyLetterBox(size=224)
+            >>> img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
+            >>> result = transform(img)
+            >>> print(result.shape)
+            (224, 224, 3)
+        """
+        super().__init__()
+        self.h, self.w = (size, size) if isinstance(size, int) else size
+        self.auto = auto  # pass max size integer, automatically solve for short side using stride
+        self.stride = stride  # used with auto
+
+    def __call__(self, im):
+        """
+        Resizes and pads an image using the letterbox method.
+
+        This method resizes the input image to fit within the specified dimensions while maintaining its aspect ratio,
+        then pads the resized image to match the target size.
+
+        Args:
+            im (numpy.ndarray): Input image as a numpy array with shape (H, W, C).
+
+        Returns:
+            (numpy.ndarray): Resized and padded image as a numpy array with shape (hs, ws, 3), where hs and ws are
+                the target height and width respectively.
+
+        Examples:
+            >>> letterbox = ClassifyLetterBox(size=(640, 640))
+            >>> image = np.random.randint(0, 255, (720, 1280, 3), dtype=np.uint8)
+            >>> resized_image = letterbox(image)
+            >>> print(resized_image.shape)
+            (640, 640, 3)
+        """
+        imh, imw = im.shape[:2]
+        r = min(self.h / imh, self.w / imw)  # ratio of new/old dimensions
+        h, w = round(imh * r), round(imw * r)  # resized image dimensions
+
+        # Calculate padding dimensions
+        hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else (self.h, self.w)
+        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
+
+        # Create padded image
+        im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)
+        im_out[top : top + h, left : left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
+        return im_out
+
+
+# NOTE: keep this class for backward compatibility
+class CenterCrop:
+    """
+    Applies center cropping to images for classification tasks.
+
+    This class performs center cropping on input images, resizing them to a specified size while maintaining the aspect
+    ratio. It is designed to be part of a transformation pipeline, e.g., T.Compose([CenterCrop(size), ToTensor()]).
+
+    Attributes:
+        h (int): Target height of the cropped image.
+        w (int): Target width of the cropped image.
+
+    Methods:
+        __call__: Applies the center crop transformation to an input image.
+
+    Examples:
+        >>> transform = CenterCrop(640)
+        >>> image = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
+        >>> cropped_image = transform(image)
+        >>> print(cropped_image.shape)
+        (640, 640, 3)
+    """
+
+    def __init__(self, size=640):
+        """
+        Initializes the CenterCrop object for image preprocessing.
+
+        This class is designed to be part of a transformation pipeline, e.g., T.Compose([CenterCrop(size), ToTensor()]).
+        It performs a center crop on input images to a specified size.
+
+        Args:
+            size (int | Tuple[int, int]): The desired output size of the crop. If size is an int, a square crop
+                (size, size) is made. If size is a sequence like (h, w), it is used as the output size.
+
+        Returns:
+            (None): This method initializes the object and does not return anything.
+
+        Examples:
+            >>> transform = CenterCrop(224)
+            >>> img = np.random.rand(300, 300, 3)
+            >>> cropped_img = transform(img)
+            >>> print(cropped_img.shape)
+            (224, 224, 3)
+        """
+        super().__init__()
+        self.h, self.w = (size, size) if isinstance(size, int) else size
+
+    def __call__(self, im):
+        """
+        Applies center cropping to an input image.
+
+        This method resizes and crops the center of the image using a letterbox method. It maintains the aspect
+        ratio of the original image while fitting it into the specified dimensions.
+
+        Args:
+            im (numpy.ndarray | PIL.Image.Image): The input image as a numpy array of shape (H, W, C) or a
+                PIL Image object.
+
+        Returns:
+            (numpy.ndarray): The center-cropped and resized image as a numpy array of shape (self.h, self.w, C).
+
+        Examples:
+            >>> transform = CenterCrop(size=224)
+            >>> image = np.random.randint(0, 255, (640, 480, 3), dtype=np.uint8)
+            >>> cropped_image = transform(image)
+            >>> assert cropped_image.shape == (224, 224, 3)
+        """
+        if isinstance(im, Image.Image):  # convert from PIL to numpy array if required
+            im = np.asarray(im)
+        imh, imw = im.shape[:2]
+        m = min(imh, imw)  # min dimension
+        top, left = (imh - m) // 2, (imw - m) // 2
+        return cv2.resize(im[top : top + m, left : left + m], (self.w, self.h), interpolation=cv2.INTER_LINEAR)
+
+
+# NOTE: keep this class for backward compatibility
+class ToTensor:
+    """
+    Converts an image from a numpy array to a PyTorch tensor.
+
+    This class is designed to be part of a transformation pipeline, e.g., T.Compose([LetterBox(size), ToTensor()]).
+
+    Attributes:
+        half (bool): If True, converts the image to half precision (float16).
+
+    Methods:
+        __call__: Applies the tensor conversion to an input image.
+
+    Examples:
+        >>> transform = ToTensor(half=True)
+        >>> img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+        >>> tensor_img = transform(img)
+        >>> print(tensor_img.shape, tensor_img.dtype)
+        torch.Size([3, 640, 640]) torch.float16
+
+    Notes:
+        The input image is expected to be in BGR format with shape (H, W, C).
+        The output tensor will be in RGB format with shape (C, H, W), normalized to [0, 1].
+    """
+
+    def __init__(self, half=False):
+        """
+        Initializes the ToTensor object for converting images to PyTorch tensors.
+
+        This class is designed to be used as part of a transformation pipeline for image preprocessing in the
+        Ultralytics YOLO framework. It converts numpy arrays or PIL Images to PyTorch tensors, with an option
+        for half-precision (float16) conversion.
+
+        Args:
+            half (bool): If True, converts the tensor to half precision (float16). Default is False.
+
+        Examples:
+            >>> transform = ToTensor(half=True)
+            >>> img = np.random.rand(640, 640, 3)
+            >>> tensor_img = transform(img)
+            >>> print(tensor_img.dtype)
+            torch.float16
+        """
+        super().__init__()
+        self.half = half
+
+    def __call__(self, im):
+        """
+        Transforms an image from a numpy array to a PyTorch tensor.
+
+        This method converts the input image from a numpy array to a PyTorch tensor, applying optional
+        half-precision conversion and normalization. The image is transposed from HWC to CHW format and
+        the color channels are reversed from BGR to RGB.
+
+        Args:
+            im (numpy.ndarray): Input image as a numpy array with shape (H, W, C) in BGR order.
+
+        Returns:
+            (torch.Tensor): The transformed image as a PyTorch tensor in float32 or float16, normalized
+                to [0, 1] with shape (C, H, W) in RGB order.
+
+        Examples:
+            >>> transform = ToTensor(half=True)
+            >>> img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+            >>> tensor_img = transform(img)
+            >>> print(tensor_img.shape, tensor_img.dtype)
+            torch.Size([3, 640, 640]) torch.float16
+        """
+        im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1])  # HWC to CHW -> BGR to RGB -> contiguous
+        im = torch.from_numpy(im)  # to torch
+        im = im.half() if self.half else im.float()  # uint8 to fp16/32
+        im /= 255.0  # 0-255 to 0.0-1.0
+        return im

+ 346 - 0
ultralytics/data/base.py

@@ -0,0 +1,346 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import glob
+import math
+import os
+import random
+from copy import deepcopy
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import psutil
+from torch.utils.data import Dataset
+
+from ultralytics.data.utils import FORMATS_HELP_MSG, HELP_URL, IMG_FORMATS
+from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
+
+
+class BaseDataset(Dataset):
+    """
+    Base dataset class for loading and processing image data.
+
+    Args:
+        img_path (str): Path to the folder containing images.
+        imgsz (int, optional): Image size. Defaults to 640.
+        cache (bool | str, optional): Cache images to RAM (True or "ram") or disk ("disk") during training. Defaults to False.
+        augment (bool, optional): If True, data augmentation is applied. Defaults to True.
+        hyp (dict, optional): Hyperparameters to apply data augmentation. Defaults to None.
+        prefix (str, optional): Prefix to print in log messages. Defaults to ''.
+        rect (bool, optional): If True, rectangular training is used. Defaults to False.
+        batch_size (int, optional): Size of batches. Defaults to 16.
+        stride (int, optional): Stride. Defaults to 32.
+        pad (float, optional): Padding used for rectangular training. Defaults to 0.5.
+        single_cls (bool, optional): If True, single class training is used. Defaults to False.
+        classes (list): List of included classes. Default is None.
+        fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data).
+
+    Attributes:
+        im_files (list): List of image file paths.
+        labels (list): List of label data dictionaries.
+        ni (int): Number of images in the dataset.
+        ims (list): List of loaded images.
+        npy_files (list): List of numpy file paths.
+        transforms (callable): Image transformation function.
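+
+    Example:
+        A minimal subclassing sketch (illustrative; method bodies are placeholders):
+        ```python
+        class MyDataset(BaseDataset):
+            def get_labels(self):
+                ...  # return a list of label dicts; see get_labels() below for the expected keys
+
+            def build_transforms(self, hyp=None):
+                ...  # return a Compose of transforms; see build_transforms() below
+        ```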
+    """
+
+    def __init__(
+        self,
+        img_path,
+        imgsz=640,
+        cache=False,
+        augment=True,
+        hyp=DEFAULT_CFG,
+        prefix="",
+        rect=False,
+        batch_size=16,
+        stride=32,
+        pad=0.5,
+        single_cls=False,
+        classes=None,
+        fraction=1.0,
+    ):
+        """Initialize BaseDataset with given configuration and options."""
+        super().__init__()
+        self.img_path = img_path
+        self.imgsz = imgsz
+        self.augment = augment
+        self.single_cls = single_cls
+        self.prefix = prefix
+        self.fraction = fraction
+        self.im_files = self.get_img_files(self.img_path)
+        self.labels = self.get_labels()
+        self.update_labels(include_class=classes)  # single_cls and include_class
+        self.ni = len(self.labels)  # number of images
+        self.rect = rect
+        self.batch_size = batch_size
+        self.stride = stride
+        self.pad = pad
+        if self.rect:
+            assert self.batch_size is not None
+            self.set_rectangle()
+
+        # Buffer thread for mosaic images
+        self.buffer = []  # buffer size = batch size
+        self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
+
+        # Cache images (options are cache = True, False, None, "ram", "disk")
+        self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
+        self.npy_files = [Path(f).with_suffix(".npy") for f in self.im_files]
+        self.cache = cache.lower() if isinstance(cache, str) else "ram" if cache is True else None
+        if self.cache == "ram" and self.check_cache_ram():
+            if hyp.deterministic:
+                LOGGER.warning(
+                    "WARNING ⚠️ cache='ram' may produce non-deterministic training results. "
+                    "Consider cache='disk' as a deterministic alternative if your disk space allows."
+                )
+            self.cache_images()
+        elif self.cache == "disk" and self.check_cache_disk():
+            self.cache_images()
+
+        # Transforms
+        self.transforms = self.build_transforms(hyp=hyp)
+
+    def get_img_files(self, img_path):
+        """Read image files."""
+        try:
+            f = []  # image files
+            for p in img_path if isinstance(img_path, list) else [img_path]:
+                p = Path(p)  # os-agnostic
+                if p.is_dir():  # dir
+                    f += glob.glob(str(p / "**" / "*.*"), recursive=True)
+                    # F = list(p.rglob('*.*'))  # pathlib
+                elif p.is_file():  # file
+                    with open(p) as t:
+                        t = t.read().strip().splitlines()
+                        parent = str(p.parent) + os.sep
+                        f += [x.replace("./", parent) if x.startswith("./") else x for x in t]  # local to global path
+                        # F += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
+                else:
+                    raise FileNotFoundError(f"{self.prefix}{p} does not exist")
+            im_files = sorted(x.replace("/", os.sep) for x in f if x.split(".")[-1].lower() in IMG_FORMATS)
+            # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
+            assert im_files, f"{self.prefix}No images found in {img_path}. {FORMATS_HELP_MSG}"
+        except Exception as e:
+            raise FileNotFoundError(f"{self.prefix}Error loading data from {img_path}\n{HELP_URL}") from e
+        if self.fraction < 1:
+            im_files = im_files[: round(len(im_files) * self.fraction)]  # retain a fraction of the dataset
+        return im_files
+
+    def update_labels(self, include_class: Optional[list]):
+        """Update labels to include only these classes (optional)."""
+        include_class_array = np.array(include_class).reshape(1, -1)
+        for i in range(len(self.labels)):
+            if include_class is not None:
+                cls = self.labels[i]["cls"]
+                bboxes = self.labels[i]["bboxes"]
+                segments = self.labels[i]["segments"]
+                keypoints = self.labels[i]["keypoints"]
+                j = (cls == include_class_array).any(1)
+                self.labels[i]["cls"] = cls[j]
+                self.labels[i]["bboxes"] = bboxes[j]
+                if segments:
+                    self.labels[i]["segments"] = [segments[si] for si, idx in enumerate(j) if idx]
+                if keypoints is not None:
+                    self.labels[i]["keypoints"] = keypoints[j]
+            if self.single_cls:
+                self.labels[i]["cls"][:, 0] = 0
+
+    def load_image(self, i, rect_mode=True):
+        """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
+        if im is None:  # not cached in RAM
+            if fn.exists():  # load npy
+                try:
+                    im = np.load(fn)
+                except Exception as e:
+                    LOGGER.warning(f"{self.prefix}WARNING ⚠️ Removing corrupt *.npy image file {fn} due to: {e}")
+                    Path(fn).unlink(missing_ok=True)
+                    im = cv2.imread(f)  # BGR
+            else:  # read image
+                im = cv2.imread(f)  # BGR
+            if im is None:
+                raise FileNotFoundError(f"Image Not Found {f}")
+
+            h0, w0 = im.shape[:2]  # orig hw
+            if rect_mode:  # resize long side to imgsz while maintaining aspect ratio
+                r = self.imgsz / max(h0, w0)  # ratio
+                if r != 1:  # if sizes are not equal
+                    w, h = (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz))
+                    im = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
+            elif not (h0 == w0 == self.imgsz):  # resize by stretching image to square imgsz
+                im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR)
+
+            # Add to buffer if training with augmentations
+            if self.augment:
+                self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
+                self.buffer.append(i)
+                if 1 < len(self.buffer) >= self.max_buffer_length:  # pop the oldest entry once the buffer is full, but never empty it
+                    j = self.buffer.pop(0)
+                    if self.cache != "ram":
+                        self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
+
+            return im, (h0, w0), im.shape[:2]
+
+        return self.ims[i], self.im_hw0[i], self.im_hw[i]
+
+    def cache_images(self):
+        """Cache images to memory or disk."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        fcn, storage = (self.cache_images_to_disk, "Disk") if self.cache == "disk" else (self.load_image, "RAM")
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(fcn, range(self.ni))
+            pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
+            for i, x in pbar:
+                if self.cache == "disk":
+                    b += self.npy_files[i].stat().st_size
+                else:  # 'ram'
+                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
+                    b += self.ims[i].nbytes
+                pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {storage})"
+            pbar.close()
+
+    def cache_images_to_disk(self, i):
+        """Saves an image as an *.npy file for faster loading."""
+        f = self.npy_files[i]
+        if not f.exists():
+            np.save(f.as_posix(), cv2.imread(self.im_files[i]), allow_pickle=False)
+
+    def check_cache_disk(self, safety_margin=0.5):
+        """Check image caching requirements vs available disk space."""
+        import shutil
+
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        n = min(self.ni, 30)  # extrapolate from 30 random images
+        for _ in range(n):
+            im_file = random.choice(self.im_files)
+            im = cv2.imread(im_file)
+            if im is None:
+                continue
+            b += im.nbytes
+            if not os.access(Path(im_file).parent, os.W_OK):
+                self.cache = None
+                LOGGER.info(f"{self.prefix}Skipping caching images to disk, directory not writeable ⚠️")
+                return False
+        disk_required = b * self.ni / n * (1 + safety_margin)  # bytes required to cache dataset to disk
+        total, used, free = shutil.disk_usage(Path(self.im_files[0]).parent)
+        if disk_required > free:
+            self.cache = None
+            LOGGER.info(
+                f"{self.prefix}{disk_required / gb:.1f}GB disk space required, "
+                f"with {int(safety_margin * 100)}% safety margin but only "
+                f"{free / gb:.1f}/{total / gb:.1f}GB free, not caching images to disk ⚠️"
+            )
+            return False
+        return True
+
+    def check_cache_ram(self, safety_margin=0.5):
+        """Check image caching requirements vs available memory."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        n = min(self.ni, 30)  # extrapolate from 30 random images
+        for _ in range(n):
+            im = cv2.imread(random.choice(self.im_files))  # sample image
+            if im is None:
+                continue
+            ratio = self.imgsz / max(im.shape[0], im.shape[1])  # ratio of target imgsz to max(h, w)
+            b += im.nbytes * ratio**2
+        mem_required = b * self.ni / n * (1 + safety_margin)  # bytes required to cache dataset into RAM
+        mem = psutil.virtual_memory()
+        if mem_required > mem.available:
+            self.cache = None
+            LOGGER.info(
+                f"{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images "
+                f"with {int(safety_margin * 100)}% safety margin but only "
+                f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, not caching images ⚠️"
+            )
+            return False
+        return True
+
+    def set_rectangle(self):
+        """Sets the shape of bounding boxes for YOLO detections as rectangles."""
+        bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int)  # batch index
+        nb = bi[-1] + 1  # number of batches
+
+        s = np.array([x.pop("shape") for x in self.labels])  # hw
+        ar = s[:, 0] / s[:, 1]  # aspect ratio
+        irect = ar.argsort()
+        self.im_files = [self.im_files[i] for i in irect]
+        self.labels = [self.labels[i] for i in irect]
+        ar = ar[irect]
+
+        # Set training image shapes
+        shapes = [[1, 1]] * nb
+        for i in range(nb):
+            ari = ar[bi == i]
+            mini, maxi = ari.min(), ari.max()
+            if maxi < 1:
+                shapes[i] = [maxi, 1]
+            elif mini > 1:
+                shapes[i] = [1, 1 / mini]
+
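+        # Illustrative arithmetic (example values): with imgsz=640, stride=32, pad=0.5, a batch of wide images
+        # whose largest h/w aspect ratio is 0.75 gets batch_shapes = ceil([0.75, 1] * 640 / 32 + 0.5) * 32 = [512, 672]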
+        self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
+        self.batch = bi  # batch index of image
+
+    def __getitem__(self, index):
+        """Returns transformed label information for given index."""
+        return self.transforms(self.get_image_and_label(index))
+
+    def get_image_and_label(self, index):
+        """Get and return label information from the dataset."""
+        label = deepcopy(self.labels[index])  # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
+        label.pop("shape", None)  # shape is for rect, remove it
+        label["img"], label["ori_shape"], label["resized_shape"] = self.load_image(index)
+        label["ratio_pad"] = (
+            label["resized_shape"][0] / label["ori_shape"][0],
+            label["resized_shape"][1] / label["ori_shape"][1],
+        )  # for evaluation
+        if self.rect:
+            label["rect_shape"] = self.batch_shapes[self.batch[index]]
+        return self.update_labels_info(label)
+
+    def __len__(self):
+        """Returns the length of the labels list for the dataset."""
+        return len(self.labels)
+
+    def update_labels_info(self, label):
+        """Custom your label format here."""
+        return label
+
+    def build_transforms(self, hyp=None):
+        """
+        Users can customize augmentations here.
+
+        Example:
+            ```python
+            if self.augment:
+                # Training transforms
+                return Compose([])
+            else:
+                # Val transforms
+                return Compose([])
+            ```
+        """
+        raise NotImplementedError
+
+    def get_labels(self):
+        """
+        Users can customize their own format here.
+
+        Note:
+            Ensure output is a dictionary with the following keys:
+            ```python
+            dict(
+                im_file=im_file,
+                shape=shape,  # format: (height, width)
+                cls=cls,
+                bboxes=bboxes,  # xywh
+                segments=segments,  # xy
+                keypoints=keypoints,  # xy
+                normalized=True,  # or False
+                bbox_format="xyxy",  # or xywh, ltwh
+            )
+            ```
+        """
+        raise NotImplementedError

+ 215 - 0
ultralytics/data/build.py

@@ -0,0 +1,215 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import dataloader, distributed
+
+from ultralytics.data.dataset import GroundingDataset, YOLODataset, YOLOMultiModalDataset
+from ultralytics.data.loaders import (
+    LOADERS,
+    LoadImagesAndVideos,
+    LoadPilAndNumpy,
+    LoadScreenshots,
+    LoadStreams,
+    LoadTensor,
+    SourceTypes,
+    autocast_list,
+)
+from ultralytics.data.utils import IMG_FORMATS, PIN_MEMORY, VID_FORMATS
+from ultralytics.utils import RANK, colorstr
+from ultralytics.utils.checks import check_file
+
+
+class InfiniteDataLoader(dataloader.DataLoader):
+    """
+    Dataloader that reuses workers.
+
+    Uses same syntax as vanilla DataLoader.
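+
+    Examples:
+        >>> # Illustrative; `train_dataset` stands in for any map-style torch Dataset
+        >>> loader = InfiniteDataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
+        >>> batch = next(iter(loader))  # worker processes persist across epochs instead of being respawned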
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Dataloader that infinitely recycles workers, inherits from DataLoader."""
+        super().__init__(*args, **kwargs)
+        object.__setattr__(self, "batch_sampler", _RepeatSampler(self.batch_sampler))
+        self.iterator = super().__iter__()
+
+    def __len__(self):
+        """Returns the length of the batch sampler's sampler."""
+        return len(self.batch_sampler.sampler)
+
+    def __iter__(self):
+        """Creates a sampler that repeats indefinitely."""
+        for _ in range(len(self)):
+            yield next(self.iterator)
+
+    def __del__(self):
+        """Ensure that workers are terminated."""
+        if hasattr(self.iterator, "_workers"):
+            for w in self.iterator._workers:  # force terminate
+                if w.is_alive():
+                    w.terminate()
+            self.iterator._shutdown_workers()  # cleanup
+
+    def reset(self):
+        """
+        Reset iterator.
+
+        This is useful when we want to modify settings of dataset while training.
+        """
+        self.iterator = self._get_iterator()
+
+
+class _RepeatSampler:
+    """
+    Sampler that repeats forever.
+
+    Args:
+        sampler (Dataset.sampler): The sampler to repeat.
+    """
+
+    def __init__(self, sampler):
+        """Initializes an object that repeats a given sampler indefinitely."""
+        self.sampler = sampler
+
+    def __iter__(self):
+        """Iterates over the 'sampler' and yields its contents."""
+        while True:
+            yield from iter(self.sampler)
+
+
+def seed_worker(worker_id):  # noqa
+    """Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
+    worker_seed = torch.initial_seed() % 2**32
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+
+def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
+    """Build YOLO Dataset."""
+    dataset = YOLOMultiModalDataset if multi_modal else YOLODataset
+    return dataset(
+        img_path=img_path,
+        imgsz=cfg.imgsz,
+        batch_size=batch,
+        augment=mode == "train",  # augmentation
+        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
+        rect=cfg.rect or rect,  # rectangular batches
+        cache=cfg.cache or None,
+        single_cls=cfg.single_cls or False,
+        stride=int(stride),
+        pad=0.0 if mode == "train" else 0.5,
+        prefix=colorstr(f"{mode}: "),
+        task=cfg.task,
+        classes=cfg.classes,
+        data=data,
+        fraction=cfg.fraction if mode == "train" else 1.0,
+    )
+
+
+def build_grounding(cfg, img_path, json_file, batch, mode="train", rect=False, stride=32):
+    """Build YOLO Dataset."""
+    return GroundingDataset(
+        img_path=img_path,
+        json_file=json_file,
+        imgsz=cfg.imgsz,
+        batch_size=batch,
+        augment=mode == "train",  # augmentation
+        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
+        rect=cfg.rect or rect,  # rectangular batches
+        cache=cfg.cache or None,
+        single_cls=cfg.single_cls or False,
+        stride=int(stride),
+        pad=0.0 if mode == "train" else 0.5,
+        prefix=colorstr(f"{mode}: "),
+        task=cfg.task,
+        classes=cfg.classes,
+        fraction=cfg.fraction if mode == "train" else 1.0,
+    )
+
+
+def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
+    """Return an InfiniteDataLoader or DataLoader for training or validation set."""
+    batch = min(batch, len(dataset))
+    nd = torch.cuda.device_count()  # number of CUDA devices
+    nw = min(os.cpu_count() // max(nd, 1), workers)  # number of workers
+    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
+    generator = torch.Generator()
+    generator.manual_seed(6148914691236517205 + RANK)
+    return InfiniteDataLoader(
+        dataset=dataset,
+        batch_size=batch,
+        shuffle=shuffle and sampler is None,
+        num_workers=nw,
+        sampler=sampler,
+        pin_memory=PIN_MEMORY,
+        collate_fn=getattr(dataset, "collate_fn", None),
+        worker_init_fn=seed_worker,
+        generator=generator,
+    )
+
+
+def check_source(source):
+    """Check source type and return corresponding flag values."""
+    webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
+    if isinstance(source, (str, int, Path)):  # int for local usb camera
+        source = str(source)
+        is_file = Path(source).suffix[1:] in (IMG_FORMATS | VID_FORMATS)
+        is_url = source.lower().startswith(("https://", "http://", "rtsp://", "rtmp://", "tcp://"))
+        webcam = source.isnumeric() or source.endswith(".streams") or (is_url and not is_file)
+        screenshot = source.lower() == "screen"
+        if is_url and is_file:
+            source = check_file(source)  # download
+    elif isinstance(source, LOADERS):
+        in_memory = True
+    elif isinstance(source, (list, tuple)):
+        source = autocast_list(source)  # convert all list elements to PIL or np arrays
+        from_img = True
+    elif isinstance(source, (Image.Image, np.ndarray)):
+        from_img = True
+    elif isinstance(source, torch.Tensor):
+        tensor = True
+    else:
+        raise TypeError("Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict")
+
+    return source, webcam, screenshot, from_img, in_memory, tensor
+
+
+def load_inference_source(source=None, batch=1, vid_stride=1, buffer=False):
+    """
+    Loads an inference source for object detection and applies necessary transformations.
+
+    Args:
+        source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
+        batch (int, optional): Batch size for dataloaders. Default is 1.
+        vid_stride (int, optional): The frame interval for video sources. Default is 1.
+        buffer (bool, optional): Determines whether stream frames will be buffered. Default is False.
+
+    Returns:
+        dataset (Dataset): A dataset object for the specified input source.
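+
+    Examples:
+        >>> dataset = load_inference_source("path/to/image.jpg")  # illustrative path
+        >>> dataset = load_inference_source("rtsp://example.com/stream", vid_stride=2, buffer=True)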
+    """
+    source, stream, screenshot, from_img, in_memory, tensor = check_source(source)
+    source_type = source.source_type if in_memory else SourceTypes(stream, screenshot, from_img, tensor)
+
+    # Dataloader
+    if tensor:
+        dataset = LoadTensor(source)
+    elif in_memory:
+        dataset = source
+    elif stream:
+        dataset = LoadStreams(source, vid_stride=vid_stride, buffer=buffer)
+    elif screenshot:
+        dataset = LoadScreenshots(source)
+    elif from_img:
+        dataset = LoadPilAndNumpy(source)
+    else:
+        dataset = LoadImagesAndVideos(source, batch=batch, vid_stride=vid_stride)
+
+    # Attach source types to the dataset
+    setattr(dataset, "source_type", source_type)
+
+    return dataset

+ 702 - 0
ultralytics/data/converter.py

@@ -0,0 +1,702 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import json
+import random
+import shutil
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from ultralytics.utils import DATASETS_DIR, LOGGER, NUM_THREADS, TQDM
+from ultralytics.utils.downloads import download
+from ultralytics.utils.files import increment_path
+
+
+def coco91_to_coco80_class():
+    """
+    Converts 91-index COCO class IDs to 80-index COCO class IDs.
+
+    Returns:
+        (list): A list of 91 entries indexed by (91-index class ID - 1), where each value is the corresponding
+            80-index class ID, or None if the 91-index class has no 80-index equivalent.
+    """
+    return [
+        0,
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        None,
+        11,
+        12,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        None,
+        24,
+        25,
+        None,
+        None,
+        26,
+        27,
+        28,
+        29,
+        30,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        None,
+        40,
+        41,
+        42,
+        43,
+        44,
+        45,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        None,
+        60,
+        None,
+        None,
+        61,
+        None,
+        62,
+        63,
+        64,
+        65,
+        66,
+        67,
+        68,
+        69,
+        70,
+        71,
+        72,
+        None,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        None,
+    ]
+
+
+def coco80_to_coco91_class():
+    r"""
+    Converts 80-index (val2014) to 91-index (paper).
+    For details see https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/.
+
+    Example:
+        ```python
+        import numpy as np
+
+        a = np.loadtxt("data/coco.names", dtype="str", delimiter="\n")
+        b = np.loadtxt("data/coco_paper.names", dtype="str", delimiter="\n")
+        x1 = [list(a[i] == b).index(True) + 1 for i in range(80)]  # darknet to coco
+        x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)]  # coco to darknet
+        ```
+    """
+    return [
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        11,
+        13,
+        14,
+        15,
+        16,
+        17,
+        18,
+        19,
+        20,
+        21,
+        22,
+        23,
+        24,
+        25,
+        27,
+        28,
+        31,
+        32,
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        40,
+        41,
+        42,
+        43,
+        44,
+        46,
+        47,
+        48,
+        49,
+        50,
+        51,
+        52,
+        53,
+        54,
+        55,
+        56,
+        57,
+        58,
+        59,
+        60,
+        61,
+        62,
+        63,
+        64,
+        65,
+        67,
+        70,
+        72,
+        73,
+        74,
+        75,
+        76,
+        77,
+        78,
+        79,
+        80,
+        81,
+        82,
+        84,
+        85,
+        86,
+        87,
+        88,
+        89,
+        90,
+    ]
+
+
+def convert_coco(
+    labels_dir="../coco/annotations/",
+    save_dir="coco_converted/",
+    use_segments=False,
+    use_keypoints=False,
+    cls91to80=True,
+    lvis=False,
+):
+    """
+    Converts COCO dataset annotations to a YOLO annotation format suitable for training YOLO models.
+
+    Args:
+        labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
+        save_dir (str, optional): Path to directory to save results to.
+        use_segments (bool, optional): Whether to include segmentation masks in the output.
+        use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
+        cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
+        lvis (bool, optional): Whether to convert data in lvis dataset way.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_coco
+
+        convert_coco("../datasets/coco/annotations/", use_segments=True, use_keypoints=False, cls91to80=False)
+        convert_coco(
+            "../datasets/lvis/annotations/", use_segments=True, use_keypoints=False, cls91to80=False, lvis=True
+        )
+        ```
+
+    Output:
+        Generates output files in the specified output directory.
+    """
+    # Create dataset directory
+    save_dir = increment_path(save_dir)  # increment if save directory already exists
+    for p in save_dir / "labels", save_dir / "images":
+        p.mkdir(parents=True, exist_ok=True)  # make dir
+
+    # Convert classes
+    coco80 = coco91_to_coco80_class()
+
+    # Import json
+    for json_file in sorted(Path(labels_dir).resolve().glob("*.json")):
+        lname = "" if lvis else json_file.stem.replace("instances_", "")
+        fn = Path(save_dir) / "labels" / lname  # folder name
+        fn.mkdir(parents=True, exist_ok=True)
+        if lvis:
+            # NOTE: create folders for both train and val in advance,
+            # since LVIS val set contains images from COCO 2017 train in addition to the COCO 2017 val split.
+            (fn / "train2017").mkdir(parents=True, exist_ok=True)
+            (fn / "val2017").mkdir(parents=True, exist_ok=True)
+        with open(json_file, encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Create image dict
+        images = {f"{x['id']:d}": x for x in data["images"]}
+        # Create image-annotations dict
+        imgToAnns = defaultdict(list)
+        for ann in data["annotations"]:
+            imgToAnns[ann["image_id"]].append(ann)
+
+        image_txt = []
+        # Write labels file
+        for img_id, anns in TQDM(imgToAnns.items(), desc=f"Annotations {json_file}"):
+            img = images[f"{img_id:d}"]
+            h, w = img["height"], img["width"]
+            f = str(Path(img["coco_url"]).relative_to("http://images.cocodataset.org")) if lvis else img["file_name"]
+            if lvis:
+                image_txt.append(str(Path("./images") / f))
+
+            bboxes = []
+            segments = []
+            keypoints = []
+            for ann in anns:
+                if ann.get("iscrowd", False):
+                    continue
+                # The COCO box format is [top left x, top left y, width, height]
+                box = np.array(ann["bbox"], dtype=np.float64)
+                box[:2] += box[2:] / 2  # xy top-left corner to center
+                box[[0, 2]] /= w  # normalize x
+                box[[1, 3]] /= h  # normalize y
+                if box[2] <= 0 or box[3] <= 0:  # skip boxes with non-positive width or height
+                    continue
+
+                cls = coco80[ann["category_id"] - 1] if cls91to80 else ann["category_id"] - 1  # class
+                box = [cls] + box.tolist()
+                if box not in bboxes:
+                    bboxes.append(box)
+                    if use_segments and ann.get("segmentation") is not None:
+                        if len(ann["segmentation"]) == 0:
+                            segments.append([])
+                            continue
+                        elif len(ann["segmentation"]) > 1:
+                            s = merge_multi_segment(ann["segmentation"])
+                            s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
+                        else:
+                            s = [j for i in ann["segmentation"] for j in i]  # all segments concatenated
+                            s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
+                        s = [cls] + s
+                        segments.append(s)
+                    if use_keypoints and ann.get("keypoints") is not None:
+                        keypoints.append(
+                            box + (np.array(ann["keypoints"]).reshape(-1, 3) / np.array([w, h, 1])).reshape(-1).tolist()
+                        )
+
+            # Write
+            with open((fn / f).with_suffix(".txt"), "a") as file:
+                for i in range(len(bboxes)):
+                    if use_keypoints:
+                        line = (*(keypoints[i]),)  # cls, box, keypoints
+                    else:
+                        line = (
+                            *(segments[i] if use_segments and len(segments[i]) > 0 else bboxes[i]),
+                        )  # cls, box or segments
+                    file.write(("%g " * len(line)).rstrip() % line + "\n")
+
+        if lvis:
+            with open((Path(save_dir) / json_file.name.replace("lvis_v1_", "").replace(".json", ".txt")), "a") as f:
+                f.writelines(f"{line}\n" for line in image_txt)
+
+    LOGGER.info(f"{'LVIS' if lvis else 'COCO'} data converted successfully.\nResults saved to {save_dir.resolve()}")
+
+
+def convert_segment_masks_to_yolo_seg(masks_dir, output_dir, classes):
+    """
+    Converts a dataset of segmentation mask images to the YOLO segmentation format.
+
+    This function takes the directory containing the binary format mask images and converts them into YOLO segmentation format.
+    The converted masks are saved in the specified output directory.
+
+    Args:
+        masks_dir (str): The path to the directory where all mask images (png, jpg) are stored.
+        output_dir (str): The path to the directory where the converted YOLO segmentation masks will be stored.
+        classes (int): Total number of classes in the dataset, e.g. classes=80 for COCO.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_segment_masks_to_yolo_seg
+
+        # The classes here is the total classes in the dataset, for COCO dataset we have 80 classes
+        convert_segment_masks_to_yolo_seg("path/to/masks_directory", "path/to/output/directory", classes=80)
+        ```
+
+    Notes:
+        The expected directory structure for the masks is:
+
+            - masks
+                ├─ mask_image_01.png or mask_image_01.jpg
+                ├─ mask_image_02.png or mask_image_02.jpg
+                ├─ mask_image_03.png or mask_image_03.jpg
+                └─ mask_image_04.png or mask_image_04.jpg
+
+        After execution, the labels will be organized in the following structure:
+
+            - output_dir
+                ├─ mask_image_01.txt
+                ├─ mask_image_02.txt
+                ├─ mask_image_03.txt
+                └─ mask_image_04.txt
+    """
+    pixel_to_class_mapping = {i + 1: i for i in range(classes)}
+    for mask_path in Path(masks_dir).iterdir():
+        if mask_path.suffix in {".png", ".jpg"}:
+            mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)  # Read the mask image in grayscale
+            img_height, img_width = mask.shape  # Get image dimensions
+            LOGGER.info(f"Processing {mask_path} imgsz = {img_height} x {img_width}")
+
+            unique_values = np.unique(mask)  # Get unique pixel values representing different classes
+            yolo_format_data = []
+
+            for value in unique_values:
+                if value == 0:
+                    continue  # Skip background
+                class_index = pixel_to_class_mapping.get(value, -1)
+                if class_index == -1:
+                    LOGGER.warning(f"Unknown class for pixel value {value} in file {mask_path}, skipping.")
+                    continue
+
+                # Create a binary mask for the current class and find contours
+                contours, _ = cv2.findContours(
+                    (mask == value).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+                )  # Find contours
+
+                for contour in contours:
+                    if len(contour) >= 3:  # YOLO requires at least 3 points for a valid segmentation
+                        contour = contour.squeeze()  # Remove single-dimensional entries
+                        yolo_format = [class_index]
+                        for point in contour:
+                            # Normalize the coordinates
+                            yolo_format.append(round(point[0] / img_width, 6))  # Rounding to 6 decimal places
+                            yolo_format.append(round(point[1] / img_height, 6))
+                        yolo_format_data.append(yolo_format)
+            # Save Ultralytics YOLO format data to file
+            output_path = Path(output_dir) / f"{mask_path.stem}.txt"
+            with open(output_path, "w") as file:
+                for item in yolo_format_data:
+                    line = " ".join(map(str, item))
+                    file.write(line + "\n")
+            LOGGER.info(f"Processed and stored at {output_path} imgsz = {img_height} x {img_width}")
+
+
+def convert_dota_to_yolo_obb(dota_root_path: str):
+    """
+    Converts DOTA dataset annotations to YOLO OBB (Oriented Bounding Box) format.
+
+    The function processes images in the 'train' and 'val' folders of the DOTA dataset. For each image, it reads the
+    associated label from the original labels directory and writes new labels in YOLO OBB format to a new directory.
+
+    Args:
+        dota_root_path (str): The root directory path of the DOTA dataset.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_dota_to_yolo_obb
+
+        convert_dota_to_yolo_obb("path/to/DOTA")
+        ```
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+
+            - DOTA
+                ├─ images
+                │   ├─ train
+                │   └─ val
+                └─ labels
+                    ├─ train_original
+                    └─ val_original
+
+        After execution, the function will organize the labels into:
+
+            - DOTA
+                └─ labels
+                    ├─ train
+                    └─ val
+    """
+    dota_root_path = Path(dota_root_path)
+
+    # Class names to indices mapping
+    class_mapping = {
+        "plane": 0,
+        "ship": 1,
+        "storage-tank": 2,
+        "baseball-diamond": 3,
+        "tennis-court": 4,
+        "basketball-court": 5,
+        "ground-track-field": 6,
+        "harbor": 7,
+        "bridge": 8,
+        "large-vehicle": 9,
+        "small-vehicle": 10,
+        "helicopter": 11,
+        "roundabout": 12,
+        "soccer-ball-field": 13,
+        "swimming-pool": 14,
+        "container-crane": 15,
+        "airport": 16,
+        "helipad": 17,
+    }
+
+    def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
+        """Converts a single image's DOTA annotation to YOLO OBB format and saves it to a specified directory."""
+        orig_label_path = orig_label_dir / f"{image_name}.txt"
+        save_path = save_dir / f"{image_name}.txt"
+
+        with orig_label_path.open("r") as f, save_path.open("w") as g:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.strip().split()
+                if len(parts) < 9:
+                    continue
+                class_name = parts[8]
+                class_idx = class_mapping[class_name]
+                coords = [float(p) for p in parts[:8]]
+                normalized_coords = [
+                    coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)
+                ]
+                formatted_coords = [f"{coord:.6g}" for coord in normalized_coords]
+                g.write(f"{class_idx} {' '.join(formatted_coords)}\n")
+
+    for phase in ["train", "val"]:
+        image_dir = dota_root_path / "images" / phase
+        orig_label_dir = dota_root_path / "labels" / f"{phase}_original"
+        save_dir = dota_root_path / "labels" / phase
+
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        image_paths = list(image_dir.iterdir())
+        for image_path in TQDM(image_paths, desc=f"Processing {phase} images"):
+            if image_path.suffix != ".png":
+                continue
+            image_name_without_ext = image_path.stem
+            img = cv2.imread(str(image_path))
+            h, w = img.shape[:2]
+            convert_label(image_name_without_ext, w, h, orig_label_dir, save_dir)
+
+
+def min_index(arr1, arr2):
+    """
+    Find a pair of indexes with the shortest distance between two arrays of 2D points.
+
+    Args:
+        arr1 (np.ndarray): A NumPy array of shape (N, 2) representing N 2D points.
+        arr2 (np.ndarray): A NumPy array of shape (M, 2) representing M 2D points.
+
+    Returns:
+        (tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
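+
+    Example:
+        ```python
+        import numpy as np
+
+        arr1 = np.array([[0, 0], [10, 10]])  # illustrative points
+        arr2 = np.array([[9, 9], [100, 100]])
+        i, j = min_index(arr1, arr2)  # -> (1, 0): arr1[1] and arr2[0] are the closest pair
+        ```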
+    """
+    dis = ((arr1[:, None, :] - arr2[None, :, :]) ** 2).sum(-1)
+    return np.unravel_index(np.argmin(dis, axis=None), dis.shape)
+
+
+def merge_multi_segment(segments):
+    """
+    Merge multiple segments into one list by connecting the coordinates with the minimum distance between each segment.
+    This function connects these coordinates with a thin line to merge all segments into one.
+
+    Args:
+        segments (List[List]): Original segmentations in COCO's JSON file.
+                               Each element is a list of coordinates, like [segmentation1, segmentation2,...].
+
+    Returns:
+        s (List[np.ndarray]): A list of connected segments represented as NumPy arrays.
+    """
+    s = []
+    segments = [np.array(i).reshape(-1, 2) for i in segments]
+    idx_list = [[] for _ in range(len(segments))]
+
+    # Record the indexes with min distance between each segment
+    for i in range(1, len(segments)):
+        idx1, idx2 = min_index(segments[i - 1], segments[i])
+        idx_list[i - 1].append(idx1)
+        idx_list[i].append(idx2)
+
+    # Use two rounds (a forward and a backward pass) to connect all the segments
+    for k in range(2):
+        # Forward connection
+        if k == 0:
+            for i, idx in enumerate(idx_list):
+                # Middle segments have two indexes, reverse the index of middle segments
+                if len(idx) == 2 and idx[0] > idx[1]:
+                    idx = idx[::-1]
+                    segments[i] = segments[i][::-1, :]
+
+                segments[i] = np.roll(segments[i], -idx[0], axis=0)
+                segments[i] = np.concatenate([segments[i], segments[i][:1]])
+                # Deal with the first segment and the last one
+                if i in {0, len(idx_list) - 1}:
+                    s.append(segments[i])
+                else:
+                    idx = [0, idx[1] - idx[0]]
+                    s.append(segments[i][idx[0] : idx[1] + 1])
+
+        else:
+            for i in range(len(idx_list) - 1, -1, -1):
+                if i not in {0, len(idx_list) - 1}:
+                    idx = idx_list[i]
+                    nidx = abs(idx[1] - idx[0])
+                    s.append(segments[i][nidx:])
+    return s
+
+
+def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt", device=None):
+    """
+    Converts an existing object detection dataset (bounding boxes) to a segmentation or oriented bounding box (OBB)
+    dataset in YOLO format. Generates segmentation data using the SAM auto-annotator as needed.
+
+    Args:
+        im_dir (str | Path): Path to image directory to convert.
+        save_dir (str | Path): Path to save the generated labels. If None, labels are saved to a `labels-segment`
+            directory at the same level as `im_dir`. Default: None.
+        sam_model (str): Segmentation model to use for intermediate segmentation data; optional.
+        device (int | str): The specific device to run SAM models. Default: None.
+
+    Notes:
+        The input directory structure assumed for dataset:
+
+            - im_dir
+                ├─ 001.jpg
+                ├─ ...
+                └─ NNN.jpg
+            - labels
+                ├─ 001.txt
+                ├─ ...
+                └─ NNN.txt
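+
+    Examples:
+        A hedged usage sketch; "path/to/images" below is a placeholder directory, not a bundled dataset:
+
+        >>> from ultralytics.data.converter import yolo_bbox2segment
+        >>> yolo_bbox2segment("path/to/images")  # labels are written to path/to/labels-segment by default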
+    """
+    from ultralytics import SAM
+    from ultralytics.data import YOLODataset
+    from ultralytics.utils import LOGGER
+    from ultralytics.utils.ops import xywh2xyxy
+
+    # NOTE: add placeholder to pass class index check
+    dataset = YOLODataset(im_dir, data=dict(names=list(range(1000))))
+    if len(dataset.labels[0]["segments"]) > 0:  # if it's segment data
+        LOGGER.info("Segmentation labels detected, no need to generate new ones!")
+        return
+
+    LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
+    sam_model = SAM(sam_model)
+    for label in TQDM(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
+        h, w = label["shape"]
+        boxes = label["bboxes"]
+        if len(boxes) == 0:  # skip empty labels
+            continue
+        boxes[:, [0, 2]] *= w
+        boxes[:, [1, 3]] *= h
+        im = cv2.imread(label["im_file"])
+        sam_results = sam_model(im, bboxes=xywh2xyxy(boxes), verbose=False, save=False, device=device)
+        label["segments"] = sam_results[0].masks.xyn
+
+    save_dir = Path(save_dir) if save_dir else Path(im_dir).parent / "labels-segment"
+    save_dir.mkdir(parents=True, exist_ok=True)
+    for label in dataset.labels:
+        texts = []
+        lb_name = Path(label["im_file"]).with_suffix(".txt").name
+        txt_file = save_dir / lb_name
+        cls = label["cls"]
+        for i, s in enumerate(label["segments"]):
+            if len(s) == 0:
+                continue
+            line = (int(cls[i]), *s.reshape(-1))
+            texts.append(("%g " * len(line)).rstrip() % line)
+        with open(txt_file, "a") as f:
+            f.writelines(text + "\n" for text in texts)
+    LOGGER.info(f"Generated segment labels saved in {save_dir}")
+
+
+def create_synthetic_coco_dataset():
+    """
+    Creates a synthetic COCO dataset with random images based on filenames from label lists.
+
+    This function downloads COCO labels, reads image filenames from label list files,
+    creates synthetic images for train2017 and val2017 subsets, and organizes
+    them in the COCO dataset structure. It uses multithreading to generate images efficiently.
+
+    Examples:
+        >>> from ultralytics.data.converter import create_synthetic_coco_dataset
+        >>> create_synthetic_coco_dataset()
+
+    Notes:
+        - Requires internet connection to download label files.
+        - Generates random RGB images of varying sizes (480x480 to 640x640 pixels).
+        - Existing test2017 directory is removed as it's not needed.
+        - Reads image filenames from train2017.txt and val2017.txt files.
+    """
+
+    def create_synthetic_image(image_file):
+        """Generates synthetic images with random sizes and colors for dataset augmentation or testing purposes."""
+        if not image_file.exists():
+            size = (random.randint(480, 640), random.randint(480, 640))
+            Image.new(
+                "RGB",
+                size=size,
+                color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
+            ).save(image_file)
+
+    # Download labels
+    dir = DATASETS_DIR / "coco"
+    url = "https://github.com/ultralytics/assets/releases/download/v0.0.0/"
+    label_zip = "coco2017labels-segments.zip"
+    download([url + label_zip], dir=dir.parent)
+
+    # Create synthetic images
+    shutil.rmtree(dir / "labels" / "test2017", ignore_errors=True)  # Remove test2017 directory as not needed
+    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
+        for subset in ["train2017", "val2017"]:
+            subset_dir = dir / "images" / subset
+            subset_dir.mkdir(parents=True, exist_ok=True)
+
+            # Read image filenames from label list file
+            label_list_file = dir / f"{subset}.txt"
+            if label_list_file.exists():
+                with open(label_list_file) as f:
+                    image_files = [dir / line.strip() for line in f]
+
+                # Submit all tasks
+                futures = [executor.submit(create_synthetic_image, image_file) for image_file in image_files]
+                for _ in TQDM(as_completed(futures), total=len(futures), desc=f"Generating images for {subset}"):
+                    pass  # The actual work is done in the background
+            else:
+                print(f"Warning: Labels file {label_list_file} does not exist. Skipping image creation for {subset}.")
+
+    print("Synthetic COCO dataset created successfully.")

+ 521 - 0
ultralytics/data/dataset.py

@@ -0,0 +1,521 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import json
+from collections import defaultdict
+from itertools import repeat
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import ConcatDataset
+
+from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr
+from ultralytics.utils.ops import resample_segments
+from ultralytics.utils.torch_utils import TORCHVISION_0_18
+
+from .augment import (
+    Compose,
+    Format,
+    Instances,
+    LetterBox,
+    RandomLoadText,
+    classify_augmentations,
+    classify_transforms,
+    v8_transforms,
+)
+from .base import BaseDataset
+from .utils import (
+    HELP_URL,
+    LOGGER,
+    get_hash,
+    img2label_paths,
+    load_dataset_cache_file,
+    save_dataset_cache_file,
+    verify_image,
+    verify_image_label,
+)
+
+# Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
+DATASET_CACHE_VERSION = "1.0.3"
+
+
+class YOLODataset(BaseDataset):
+    """
+    Dataset class for loading object detection and/or segmentation labels in YOLO format.
+
+    Args:
+        data (dict, optional): A dataset YAML dictionary. Defaults to None.
+        task (str): An explicit arg to specify the current task ('detect', 'segment', 'pose' or 'obb'). Defaults to 'detect'.
+
+    Returns:
+        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
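+
+    Examples:
+        A hedged sketch; the image path and class names below are placeholders:
+
+        >>> dataset = YOLODataset("path/to/images", data={"names": {0: "person"}}, task="detect")
+        >>> print(len(dataset))  # number of images found under the placeholder path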
+    """
+
+    def __init__(self, *args, data=None, task="detect", **kwargs):
+        """Initializes the YOLODataset with optional configurations for segments and keypoints."""
+        self.use_segments = task == "segment"
+        self.use_keypoints = task == "pose"
+        self.use_obb = task == "obb"
+        self.data = data
+        assert not (self.use_segments and self.use_keypoints), "Can not use both segments and keypoints."
+        super().__init__(*args, **kwargs)
+
+    def cache_labels(self, path=Path("./labels.cache")):
+        """
+        Cache dataset labels, check images and read shapes.
+
+        Args:
+            path (Path): Path where to save the cache file. Default is Path("./labels.cache").
+
+        Returns:
+            (dict): labels.
+        """
+        x = {"labels": []}
+        nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
+        desc = f"{self.prefix}Scanning {path.parent / path.stem}..."
+        total = len(self.im_files)
+        nkpt, ndim = self.data.get("kpt_shape", (0, 0))
+        if self.use_keypoints and (nkpt <= 0 or ndim not in {2, 3}):
+            raise ValueError(
+                "'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
+                "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'"
+            )
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(
+                func=verify_image_label,
+                iterable=zip(
+                    self.im_files,
+                    self.label_files,
+                    repeat(self.prefix),
+                    repeat(self.use_keypoints),
+                    repeat(len(self.data["names"])),
+                    repeat(nkpt),
+                    repeat(ndim),
+                ),
+            )
+            pbar = TQDM(results, desc=desc, total=total)
+            for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
+                nm += nm_f
+                nf += nf_f
+                ne += ne_f
+                nc += nc_f
+                if im_file:
+                    x["labels"].append(
+                        {
+                            "im_file": im_file,
+                            "shape": shape,
+                            "cls": lb[:, 0:1],  # n, 1
+                            "bboxes": lb[:, 1:],  # n, 4
+                            "segments": segments,
+                            "keypoints": keypoint,
+                            "normalized": True,
+                            "bbox_format": "xywh",
+                        }
+                    )
+                if msg:
+                    msgs.append(msg)
+                pbar.desc = f"{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt"
+            pbar.close()
+
+        if msgs:
+            LOGGER.info("\n".join(msgs))
+        if nf == 0:
+            LOGGER.warning(f"{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}")
+        x["hash"] = get_hash(self.label_files + self.im_files)
+        x["results"] = nf, nm, ne, nc, len(self.im_files)
+        x["msgs"] = msgs  # warnings
+        save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
+        return x
+
+    def get_labels(self):
+        """Returns dictionary of labels for YOLO training."""
+        self.label_files = img2label_paths(self.im_files)
+        cache_path = Path(self.label_files[0]).parent.with_suffix(".cache")
+        try:
+            cache, exists = load_dataset_cache_file(cache_path), True  # attempt to load a *.cache file
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash(self.label_files + self.im_files)  # identical hash
+        except (FileNotFoundError, AssertionError, AttributeError):
+            cache, exists = self.cache_labels(cache_path), False  # run cache ops
+
+        # Display cache
+        nf, nm, ne, nc, n = cache.pop("results")  # found, missing, empty, corrupt, total
+        if exists and LOCAL_RANK in {-1, 0}:
+            d = f"Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt"
+            TQDM(None, desc=self.prefix + d, total=n, initial=n)  # display results
+            if cache["msgs"]:
+                LOGGER.info("\n".join(cache["msgs"]))  # display warnings
+
+        # Read cache
+        [cache.pop(k) for k in ("hash", "version", "msgs")]  # remove items
+        labels = cache["labels"]
+        if not labels:
+            LOGGER.warning(f"WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}")
+        self.im_files = [lb["im_file"] for lb in labels]  # update im_files
+
+        # Check if the dataset is all boxes or all segments
+        lengths = ((len(lb["cls"]), len(lb["bboxes"]), len(lb["segments"])) for lb in labels)
+        len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
+        if len_segments and len_boxes != len_segments:
+            LOGGER.warning(
+                f"WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, "
+                f"len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. "
+                "To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset."
+            )
+            for lb in labels:
+                lb["segments"] = []
+        if len_cls == 0:
+            LOGGER.warning(f"WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}")
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Builds and appends transforms to the list."""
+        if self.augment:
+            hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
+            hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
+            transforms = v8_transforms(self, self.imgsz, hyp)
+        else:
+            transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
+        transforms.append(
+            Format(
+                bbox_format="xywh",
+                normalize=True,
+                return_mask=self.use_segments,
+                return_keypoint=self.use_keypoints,
+                return_obb=self.use_obb,
+                batch_idx=True,
+                mask_ratio=hyp.mask_ratio,
+                mask_overlap=hyp.overlap_mask,
+                bgr=hyp.bgr if self.augment else 0.0,  # only affect training.
+            )
+        )
+        return transforms
+
+    def close_mosaic(self, hyp):
+        """Sets mosaic, copy_paste and mixup options to 0.0 and builds transformations."""
+        hyp.mosaic = 0.0  # set mosaic ratio=0.0
+        hyp.copy_paste = 0.0  # keep the same behavior as previous v8 close-mosaic
+        hyp.mixup = 0.0  # keep the same behavior as previous v8 close-mosaic
+        self.transforms = self.build_transforms(hyp)
+
+    def update_labels_info(self, label):
+        """
+        Customize your label format here.
+
+        Note:
+            cls is no longer stored with bboxes; classification and semantic segmentation need an independent cls label.
+            Classification and semantic segmentation can also be supported by adding or removing dict keys here.
+        """
+        bboxes = label.pop("bboxes")
+        segments = label.pop("segments", [])
+        keypoints = label.pop("keypoints", None)
+        bbox_format = label.pop("bbox_format")
+        normalized = label.pop("normalized")
+
+        # NOTE: do NOT resample oriented boxes
+        segment_resamples = 100 if self.use_obb else 1000
+        if len(segments) > 0:
+            # make sure segments interpolate correctly if original length is greater than segment_resamples
+            max_len = max(len(s) for s in segments)
+            segment_resamples = (max_len + 1) if segment_resamples < max_len else segment_resamples
+            # list[np.array(segment_resamples, 2)] * num_samples
+            segments = np.stack(resample_segments(segments, n=segment_resamples), axis=0)
+        else:
+            segments = np.zeros((0, segment_resamples, 2), dtype=np.float32)
+        label["instances"] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
+        return label
+
+    @staticmethod
+    def collate_fn(batch):
+        """Collates data samples into batches."""
+        new_batch = {}
+        keys = batch[0].keys()
+        values = list(zip(*[list(b.values()) for b in batch]))
+        for i, k in enumerate(keys):
+            value = values[i]
+            if k == "img":
+                value = torch.stack(value, 0)
+            if k in {"masks", "keypoints", "bboxes", "cls", "segments", "obb"}:
+                value = torch.cat(value, 0)
+            new_batch[k] = value
+        new_batch["batch_idx"] = list(new_batch["batch_idx"])
+        for i in range(len(new_batch["batch_idx"])):
+            new_batch["batch_idx"][i] += i  # add target image index for build_targets()
+        new_batch["batch_idx"] = torch.cat(new_batch["batch_idx"], 0)
+        return new_batch
+
+
+class YOLOMultiModalDataset(YOLODataset):
+    """
+    Dataset class for multi-modal training, loading YOLO-format labels together with class text descriptions.
+
+    Args:
+        data (dict, optional): A dataset YAML dictionary. Defaults to None.
+        task (str): An explicit arg to specify the current task ('detect', 'segment', 'pose' or 'obb'). Defaults to 'detect'.
+
+    Returns:
+        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
+    """
+
+    def __init__(self, *args, data=None, task="detect", **kwargs):
+        """Initializes a dataset object for object detection tasks with optional specifications."""
+        super().__init__(*args, data=data, task=task, **kwargs)
+
+    def update_labels_info(self, label):
+        """Add texts information for multi-modal model training."""
+        labels = super().update_labels_info(label)
+        # NOTE: some categories are concatenated with its synonyms by `/`.
+        labels["texts"] = [v.split("/") for _, v in self.data["names"].items()]
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Enhances data transformations with optional text augmentation for multi-modal training."""
+        transforms = super().build_transforms(hyp)
+        if self.augment:
+            # NOTE: hard-coded the args for now.
+            transforms.insert(-1, RandomLoadText(max_samples=min(self.data["nc"], 80), padding=True))
+        return transforms
+
+
+class GroundingDataset(YOLODataset):
+    """Handles object detection tasks by loading annotations from a specified JSON file, supporting YOLO format."""
+
+    def __init__(self, *args, task="detect", json_file, **kwargs):
+        """Initializes a GroundingDataset for object detection, loading annotations from a specified JSON file."""
+        assert task == "detect", "`GroundingDataset` only supports `detect` task for now!"
+        self.json_file = json_file
+        super().__init__(*args, task=task, data={}, **kwargs)
+
+    def get_img_files(self, img_path):
+        """The image files would be read in `get_labels` function, return empty list here."""
+        return []
+
+    def get_labels(self):
+        """Loads annotations from a JSON file, filters, and normalizes bounding boxes for each image."""
+        labels = []
+        LOGGER.info("Loading annotation file...")
+        with open(self.json_file) as f:
+            annotations = json.load(f)
+        images = {f"{x['id']:d}": x for x in annotations["images"]}
+        img_to_anns = defaultdict(list)
+        for ann in annotations["annotations"]:
+            img_to_anns[ann["image_id"]].append(ann)
+        for img_id, anns in TQDM(img_to_anns.items(), desc=f"Reading annotations {self.json_file}"):
+            img = images[f"{img_id:d}"]
+            h, w, f = img["height"], img["width"], img["file_name"]
+            im_file = Path(self.img_path) / f
+            if not im_file.exists():
+                continue
+            self.im_files.append(str(im_file))
+            bboxes = []
+            cat2id = {}
+            texts = []
+            for ann in anns:
+                if ann["iscrowd"]:
+                    continue
+                box = np.array(ann["bbox"], dtype=np.float32)
+                box[:2] += box[2:] / 2
+                box[[0, 2]] /= float(w)
+                box[[1, 3]] /= float(h)
+                if box[2] <= 0 or box[3] <= 0:
+                    continue
+
+                caption = img["caption"]
+                cat_name = " ".join([caption[t[0] : t[1]] for t in ann["tokens_positive"]])
+                if cat_name not in cat2id:
+                    cat2id[cat_name] = len(cat2id)
+                    texts.append([cat_name])
+                cls = cat2id[cat_name]  # class
+                box = [cls] + box.tolist()
+                if box not in bboxes:
+                    bboxes.append(box)
+            lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32)
+            labels.append(
+                {
+                    "im_file": im_file,
+                    "shape": (h, w),
+                    "cls": lb[:, 0:1],  # n, 1
+                    "bboxes": lb[:, 1:],  # n, 4
+                    "normalized": True,
+                    "bbox_format": "xywh",
+                    "texts": texts,
+                }
+            )
+        return labels
+
+    def build_transforms(self, hyp=None):
+        """Configures augmentations for training with optional text loading; `hyp` adjusts augmentation intensity."""
+        transforms = super().build_transforms(hyp)
+        if self.augment:
+            # NOTE: hard-coded the args for now.
+            transforms.insert(-1, RandomLoadText(max_samples=80, padding=True))
+        return transforms
+
+
+class YOLOConcatDataset(ConcatDataset):
+    """
+    Dataset as a concatenation of multiple datasets.
+
+    This class is useful to assemble different existing datasets.
+    """
+
+    @staticmethod
+    def collate_fn(batch):
+        """Collates data samples into batches."""
+        return YOLODataset.collate_fn(batch)
+
+
+# TODO: support semantic segmentation
+class SemanticDataset(BaseDataset):
+    """
+    Semantic Segmentation Dataset.
+
+    This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
+    from the BaseDataset class.
+
+    Note:
+        This class is currently a placeholder and needs to be populated with methods and attributes for supporting
+        semantic segmentation tasks.
+    """
+
+    def __init__(self):
+        """Initialize a SemanticDataset object."""
+        super().__init__()
+
+
+class ClassificationDataset:
+    """
+    Wraps torchvision ImageFolder to support YOLO classification tasks, offering functionalities like image
+    augmentation, caching, and verification. It's designed to efficiently handle large datasets for training deep
+    learning models, with optional image transformations and caching mechanisms to speed up training.
+
+    This class allows for augmentations using both torchvision and Albumentations libraries, and supports caching images
+    in RAM or on disk to reduce IO overhead during training. Additionally, it implements a robust verification process
+    to ensure data integrity and consistency.
+
+    Attributes:
+        cache_ram (bool): Indicates if caching in RAM is enabled.
+        cache_disk (bool): Indicates if caching on disk is enabled.
+        samples (list): A list of tuples, each containing the path to an image, its class index, path to its .npy cache
+                        file (if caching on disk), and optionally the loaded image array (if caching in RAM).
+        torch_transforms (callable): PyTorch transforms to be applied to the images.
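+
+    Examples:
+        A hedged sketch using a minimal stand-in for the args namespace; a real run passes the full training args:
+
+        >>> from types import SimpleNamespace
+        >>> args = SimpleNamespace(cache=False, scale=0.5, imgsz=224, crop_fraction=1.0)  # assumed minimal fields
+        >>> dataset = ClassificationDataset(root="path/to/train", args=args, augment=False)
+        >>> sample = dataset[0]  # {"img": transformed image tensor, "cls": class index}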
+    """
+
+    def __init__(self, root, args, augment=False, prefix=""):
+        """
+        Initialize the ClassificationDataset with root directory, arguments, augmentations, and cache settings.
+
+        Args:
+            root (str): Path to the dataset directory where images are stored in a class-specific folder structure.
+            args (Namespace): Configuration containing dataset-related settings such as image size, augmentation
+                parameters, and cache settings. It includes attributes like `imgsz` (image size), `fraction` (fraction
+                of data to use), `scale`, `fliplr`, `flipud`, `cache` (disk or RAM caching for faster training),
+                `auto_augment`, `hsv_h`, `hsv_s`, `hsv_v`, and `crop_fraction`.
+            augment (bool, optional): Whether to apply augmentations to the dataset. Default is False.
+            prefix (str, optional): Prefix for logging and cache filenames, aiding in dataset identification and
+                debugging. Default is an empty string.
+        """
+        import torchvision  # scope for faster 'import ultralytics'
+
+        # Base class assigned as attribute rather than used as base class to allow for scoping slow torchvision import
+        if TORCHVISION_0_18:  # 'allow_empty' argument first introduced in torchvision 0.18
+            self.base = torchvision.datasets.ImageFolder(root=root, allow_empty=True)
+        else:
+            self.base = torchvision.datasets.ImageFolder(root=root)
+        self.samples = self.base.samples
+        self.root = self.base.root
+
+        # Initialize attributes
+        if augment and args.fraction < 1.0:  # reduce training fraction
+            self.samples = self.samples[: round(len(self.samples) * args.fraction)]
+        self.prefix = colorstr(f"{prefix}: ") if prefix else ""
+        self.cache_ram = args.cache is True or str(args.cache).lower() == "ram"  # cache images into RAM
+        if self.cache_ram:
+            LOGGER.warning(
+                "WARNING ⚠️ Classification `cache_ram` training has known memory leak in "
+                "https://github.com/ultralytics/ultralytics/issues/9824, setting `cache_ram=False`."
+            )
+            self.cache_ram = False
+        self.cache_disk = str(args.cache).lower() == "disk"  # cache images on hard drive as uncompressed *.npy files
+        self.samples = self.verify_images()  # filter out bad images
+        self.samples = [list(x) + [Path(x[0]).with_suffix(".npy"), None] for x in self.samples]  # file, index, npy, im
+        scale = (1.0 - args.scale, 1.0)  # (0.08, 1.0)
+        self.torch_transforms = (
+            classify_augmentations(
+                size=args.imgsz,
+                scale=scale,
+                hflip=args.fliplr,
+                vflip=args.flipud,
+                erasing=args.erasing,
+                auto_augment=args.auto_augment,
+                hsv_h=args.hsv_h,
+                hsv_s=args.hsv_s,
+                hsv_v=args.hsv_v,
+            )
+            if augment
+            else classify_transforms(size=args.imgsz, crop_fraction=args.crop_fraction)
+        )
+
+    def __getitem__(self, i):
+        """Returns subset of data and targets corresponding to given indices."""
+        f, j, fn, im = self.samples[i]  # filename, index, filename.with_suffix('.npy'), image
+        if self.cache_ram:
+            if im is None:  # Warning: two separate if statements required here, do not combine this with previous line
+                im = self.samples[i][3] = cv2.imread(f)
+        elif self.cache_disk:
+            if not fn.exists():  # load npy
+                np.save(fn.as_posix(), cv2.imread(f), allow_pickle=False)
+            im = np.load(fn)
+        else:  # read image
+            im = cv2.imread(f)  # BGR
+        # Convert NumPy array to PIL image
+        im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
+        sample = self.torch_transforms(im)
+        return {"img": sample, "cls": j}
+
+    def __len__(self) -> int:
+        """Return the total number of samples in the dataset."""
+        return len(self.samples)
+
+    def verify_images(self):
+        """Verify all images in dataset."""
+        desc = f"{self.prefix}Scanning {self.root}..."
+        path = Path(self.root).with_suffix(".cache")  # *.cache file path
+
+        try:
+            cache = load_dataset_cache_file(path)  # attempt to load a *.cache file
+            assert cache["version"] == DATASET_CACHE_VERSION  # matches current version
+            assert cache["hash"] == get_hash([x[0] for x in self.samples])  # identical hash
+            nf, nc, n, samples = cache.pop("results")  # found, corrupt, total (n), samples
+            if LOCAL_RANK in {-1, 0}:
+                d = f"{desc} {nf} images, {nc} corrupt"
+                TQDM(None, desc=d, total=n, initial=n)
+                if cache["msgs"]:
+                    LOGGER.info("\n".join(cache["msgs"]))  # display warnings
+            return samples
+
+        except (FileNotFoundError, AssertionError, AttributeError):
+            # Run scan if *.cache retrieval failed
+            nf, nc, msgs, samples, x = 0, 0, [], [], {}
+            with ThreadPool(NUM_THREADS) as pool:
+                results = pool.imap(func=verify_image, iterable=zip(self.samples, repeat(self.prefix)))
+                pbar = TQDM(results, desc=desc, total=len(self.samples))
+                for sample, nf_f, nc_f, msg in pbar:
+                    if nf_f:
+                        samples.append(sample)
+                    if msg:
+                        msgs.append(msg)
+                    nf += nf_f
+                    nc += nc_f
+                    pbar.desc = f"{desc} {nf} images, {nc} corrupt"
+                pbar.close()
+            if msgs:
+                LOGGER.info("\n".join(msgs))
+            x["hash"] = get_hash([x[0] for x in self.samples])
+            x["results"] = nf, nc, len(samples), samples
+            x["msgs"] = msgs  # warnings
+            save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
+            return samples

+ 658 - 0
ultralytics/data/loaders.py

@@ -0,0 +1,658 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import glob
+import math
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Thread
+from urllib.parse import urlparse
+
+import cv2
+import numpy as np
+import requests
+import torch
+from PIL import Image
+
+from ultralytics.data.utils import FORMATS_HELP_MSG, IMG_FORMATS, VID_FORMATS
+from ultralytics.utils import IS_COLAB, IS_KAGGLE, LOGGER, ops
+from ultralytics.utils.checks import check_requirements
+from ultralytics.utils.patches import imread
+
+
+@dataclass
+class SourceTypes:
+    """
+    Class to represent various types of input sources for predictions.
+
+    This class uses dataclass to define boolean flags for different types of input sources that can be used for
+    making predictions with YOLO models.
+
+    Attributes:
+        stream (bool): Flag indicating if the input source is a video stream.
+        screenshot (bool): Flag indicating if the input source is a screenshot.
+        from_img (bool): Flag indicating if the input source is an image file.
+        tensor (bool): Flag indicating if the input source is a torch.Tensor.
+
+    Examples:
+        >>> source_types = SourceTypes(stream=True, screenshot=False, from_img=False)
+        >>> print(source_types.stream)
+        True
+        >>> print(source_types.from_img)
+        False
+    """
+
+    stream: bool = False
+    screenshot: bool = False
+    from_img: bool = False
+    tensor: bool = False
+
+
+class LoadStreams:
+    """
+    Stream Loader for various types of video streams.
+
+    Supports RTSP, RTMP, HTTP, and TCP streams. This class handles the loading and processing of multiple video
+    streams simultaneously, making it suitable for real-time video analysis tasks.
+
+    Attributes:
+        sources (List[str]): The source input paths or URLs for the video streams.
+        vid_stride (int): Video frame-rate stride.
+        buffer (bool): Whether to buffer input streams.
+        running (bool): Flag to indicate if the streaming thread is running.
+        mode (str): Set to 'stream' indicating real-time capture.
+        imgs (List[List[np.ndarray]]): List of image frames for each stream.
+        fps (List[float]): List of FPS for each stream.
+        frames (List[int]): List of total frames for each stream.
+        threads (List[Thread]): List of threads for each stream.
+        shape (List[Tuple[int, int, int]]): List of shapes for each stream.
+        caps (List[cv2.VideoCapture]): List of cv2.VideoCapture objects for each stream.
+        bs (int): Batch size for processing.
+
+    Methods:
+        update: Read stream frames in daemon thread.
+        close: Close stream loader and release resources.
+        __iter__: Returns an iterator object for the class.
+        __next__: Returns the next batch of source paths, images, and metadata strings for processing.
+        __len__: Return the length of the sources object.
+
+    Examples:
+        >>> stream_loader = LoadStreams("rtsp://example.com/stream1.mp4")
+        >>> for sources, imgs, _ in stream_loader:
+        ...     # Process the images
+        ...     pass
+        >>> stream_loader.close()
+
+    Notes:
+        - The class uses threading to efficiently load frames from multiple streams simultaneously.
+        - It automatically handles YouTube links, converting them to the best available stream URL.
+        - The class implements a buffer system to manage frame storage and retrieval.
+    """
+
+    def __init__(self, sources="file.streams", vid_stride=1, buffer=False):
+        """Initialize stream loader for multiple video sources, supporting various stream types."""
+        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
+        self.buffer = buffer  # buffer input streams
+        self.running = True  # running flag for Thread
+        self.mode = "stream"
+        self.vid_stride = vid_stride  # video frame-rate stride
+
+        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
+        n = len(sources)
+        self.bs = n
+        self.fps = [0] * n  # frames per second
+        self.frames = [0] * n
+        self.threads = [None] * n
+        self.caps = [None] * n  # video capture objects
+        self.imgs = [[] for _ in range(n)]  # images
+        self.shape = [[] for _ in range(n)]  # image shapes
+        self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
+        for i, s in enumerate(sources):  # index, source
+            # Start thread to read frames from video stream
+            st = f"{i + 1}/{n}: {s}... "
+            if urlparse(s).hostname in {"www.youtube.com", "youtube.com", "youtu.be"}:  # if source is YouTube video
+                # YouTube format i.e. 'https://www.youtube.com/watch?v=Jsn8D3aC840' or 'https://youtu.be/Jsn8D3aC840'
+                s = get_best_youtube_url(s)
+            s = eval(s) if s.isnumeric() else s  # i.e. s = '0' local webcam
+            if s == 0 and (IS_COLAB or IS_KAGGLE):
+                raise NotImplementedError(
+                    "'source=0' webcam not supported in Colab and Kaggle notebooks. "
+                    "Try running 'source=0' in a local environment."
+                )
+            self.caps[i] = cv2.VideoCapture(s)  # store video capture object
+            if not self.caps[i].isOpened():
+                raise ConnectionError(f"{st}Failed to open {s}")
+            w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
+            h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = self.caps[i].get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
+            self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
+                "inf"
+            )  # infinite stream fallback
+            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback
+
+            success, im = self.caps[i].read()  # guarantee first frame
+            if not success or im is None:
+                raise ConnectionError(f"{st}Failed to read images from {s}")
+            self.imgs[i].append(im)
+            self.shape[i] = im.shape
+            self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
+            LOGGER.info(f"{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)")
+            self.threads[i].start()
+        LOGGER.info("")  # newline
+
+    def update(self, i, cap, stream):
+        """Read stream frames in daemon thread and update image buffer."""
+        n, f = 0, self.frames[i]  # frame number, frame array
+        while self.running and cap.isOpened() and n < (f - 1):
+            if len(self.imgs[i]) < 30:  # keep a <=30-image buffer
+                n += 1
+                cap.grab()  # .read() = .grab() followed by .retrieve()
+                if n % self.vid_stride == 0:
+                    success, im = cap.retrieve()
+                    if not success:
+                        im = np.zeros(self.shape[i], dtype=np.uint8)
+                        LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
+                        cap.open(stream)  # re-open stream if signal was lost
+                    if self.buffer:
+                        self.imgs[i].append(im)
+                    else:
+                        self.imgs[i] = [im]
+            else:
+                time.sleep(0.01)  # wait until there is free space in the buffer
+
+    def close(self):
+        """Terminates stream loader, stops threads, and releases video capture resources."""
+        self.running = False  # stop flag for Thread
+        for thread in self.threads:
+            if thread.is_alive():
+                thread.join(timeout=5)  # Add timeout
+        for cap in self.caps:  # Iterate through the stored VideoCapture objects
+            try:
+                cap.release()  # release video capture
+            except Exception as e:
+                LOGGER.warning(f"WARNING ⚠️ Could not release VideoCapture object: {e}")
+        cv2.destroyAllWindows()
+
+    def __iter__(self):
+        """Iterates through YOLO image feed and re-opens unresponsive streams."""
+        self.count = -1
+        return self
+
+    def __next__(self):
+        """Returns the next batch of frames from multiple video streams for processing."""
+        self.count += 1
+
+        images = []
+        for i, x in enumerate(self.imgs):
+            # Wait until a frame is available in each buffer
+            while not x:
+                if not self.threads[i].is_alive() or cv2.waitKey(1) == ord("q"):  # q to quit
+                    self.close()
+                    raise StopIteration
+                time.sleep(1 / min(self.fps))
+                x = self.imgs[i]
+                if not x:
+                    LOGGER.warning(f"WARNING ⚠️ Waiting for stream {i}")
+
+            # Get and remove the first frame from imgs buffer
+            if self.buffer:
+                images.append(x.pop(0))
+
+            # Get the last frame, and clear the rest from the imgs buffer
+            else:
+                images.append(x.pop(-1) if x else np.zeros(self.shape[i], dtype=np.uint8))
+                x.clear()
+
+        return self.sources, images, [""] * self.bs
+
+    def __len__(self):
+        """Return the number of video streams in the LoadStreams object."""
+        return self.bs  # 1E12 frames = 32 streams at 30 FPS for 30 years
+
+
+class LoadScreenshots:
+    """
+    Ultralytics screenshot dataloader for capturing and processing screen images.
+
+    This class manages the loading of screenshot images for processing with YOLO. It is suitable for use with
+    `yolo predict source=screen`.
+
+    Attributes:
+        source (str): The source input indicating which screen to capture.
+        screen (int): The screen number to capture.
+        left (int): The left coordinate for screen capture area.
+        top (int): The top coordinate for screen capture area.
+        width (int): The width of the screen capture area.
+        height (int): The height of the screen capture area.
+        mode (str): Set to 'stream' indicating real-time capture.
+        frame (int): Counter for captured frames.
+        sct (mss.mss): Screen capture object from `mss` library.
+        bs (int): Batch size, set to 1.
+        fps (int): Frames per second, set to 30.
+        monitor (Dict[str, int]): Monitor configuration details.
+
+    Methods:
+        __iter__: Returns an iterator object.
+        __next__: Captures the next screenshot and returns it.
+
+    Examples:
+        >>> loader = LoadScreenshots("0 100 100 640 480")  # screen 0, top-left (100,100), 640x480
+        >>> for source, im, s in loader:
+        ...     print(f"Captured frame: {im[0].shape}")
+    """
+
+    def __init__(self, source):
+        """Initialize screenshot capture with specified screen and region parameters."""
+        check_requirements("mss")
+        import mss  # noqa
+
+        source, *params = source.split()
+        self.screen, left, top, width, height = 0, None, None, None, None  # default to full screen 0
+        if len(params) == 1:
+            self.screen = int(params[0])
+        elif len(params) == 4:
+            left, top, width, height = (int(x) for x in params)
+        elif len(params) == 5:
+            self.screen, left, top, width, height = (int(x) for x in params)
+        self.mode = "stream"
+        self.frame = 0
+        self.sct = mss.mss()
+        self.bs = 1
+        self.fps = 30
+
+        # Parse monitor shape
+        monitor = self.sct.monitors[self.screen]
+        self.top = monitor["top"] if top is None else (monitor["top"] + top)
+        self.left = monitor["left"] if left is None else (monitor["left"] + left)
+        self.width = width or monitor["width"]
+        self.height = height or monitor["height"]
+        self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
+
+    def __iter__(self):
+        """Yields the next screenshot image from the specified screen or region for processing."""
+        return self
+
+    def __next__(self):
+        """Captures and returns the next screenshot as a numpy array using the mss library."""
+        im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3]  # BGRA to BGR
+        s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "
+
+        self.frame += 1
+        return [str(self.screen)], [im0], [s]  # screen, img, string
+
+
+class LoadImagesAndVideos:
+    """
+    A class for loading and processing images and videos for YOLO object detection.
+
+    This class manages the loading and pre-processing of image and video data from various sources, including
+    single image files, video files, and lists of image and video paths.
+
+    Attributes:
+        files (List[str]): List of image and video file paths.
+        nf (int): Total number of files (images and videos).
+        video_flag (List[bool]): Flags indicating whether a file is a video (True) or an image (False).
+        mode (str): Current mode, 'image' or 'video'.
+        vid_stride (int): Stride for video frame-rate.
+        bs (int): Batch size.
+        cap (cv2.VideoCapture): Video capture object for OpenCV.
+        frame (int): Frame counter for video.
+        frames (int): Total number of frames in the video.
+        count (int): Counter for iteration, initialized at 0 during __iter__().
+        ni (int): Number of images.
+
+    Methods:
+        __init__: Initialize the LoadImagesAndVideos object.
+        __iter__: Returns an iterator object for VideoStream or ImageFolder.
+        __next__: Returns the next batch of images or video frames along with their paths and metadata.
+        _new_video: Creates a new video capture object for the given path.
+        __len__: Returns the number of batches in the object.
+
+    Examples:
+        >>> loader = LoadImagesAndVideos("path/to/data", batch=32, vid_stride=1)
+        >>> for paths, imgs, info in loader:
+        ...     # Process batch of images or video frames
+        ...     pass
+
+    Notes:
+        - Supports various image formats including HEIC.
+        - Handles both local files and directories.
+        - Can read from a text file containing paths to images and videos.
+    """
+
+    def __init__(self, path, batch=1, vid_stride=1):
+        """Initialize dataloader for images and videos, supporting various input formats."""
+        parent = None
+        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
+            parent = Path(path).parent
+            path = Path(path).read_text().splitlines()  # list of sources
+        files = []
+        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
+            a = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
+            if "*" in a:
+                files.extend(sorted(glob.glob(a, recursive=True)))  # glob
+            elif os.path.isdir(a):
+                files.extend(sorted(glob.glob(os.path.join(a, "*.*"))))  # dir
+            elif os.path.isfile(a):
+                files.append(a)  # files (absolute or relative to CWD)
+            elif parent and (parent / p).is_file():
+                files.append(str((parent / p).absolute()))  # files (relative to *.txt file parent)
+            else:
+                raise FileNotFoundError(f"{p} does not exist")
+
+        # Define files as images or videos
+        images, videos = [], []
+        for f in files:
+            suffix = f.split(".")[-1].lower()  # Get file extension without the dot and lowercase
+            if suffix in IMG_FORMATS:
+                images.append(f)
+            elif suffix in VID_FORMATS:
+                videos.append(f)
+        ni, nv = len(images), len(videos)
+
+        self.files = images + videos
+        self.nf = ni + nv  # number of files
+        self.ni = ni  # number of images
+        self.video_flag = [False] * ni + [True] * nv
+        self.mode = "video" if ni == 0 else "image"  # default to video if no images
+        self.vid_stride = vid_stride  # video frame-rate stride
+        self.bs = batch
+        if any(videos):
+            self._new_video(videos[0])  # new video
+        else:
+            self.cap = None
+        if self.nf == 0:
+            raise FileNotFoundError(f"No images or videos found in {p}. {FORMATS_HELP_MSG}")
+
+    def __iter__(self):
+        """Iterates through image/video files, yielding source paths, images, and metadata."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Returns the next batch of images or video frames with their paths and metadata."""
+        paths, imgs, info = [], [], []
+        while len(imgs) < self.bs:
+            if self.count >= self.nf:  # end of file list
+                if imgs:
+                    return paths, imgs, info  # return last partial batch
+                else:
+                    raise StopIteration
+
+            path = self.files[self.count]
+            if self.video_flag[self.count]:
+                self.mode = "video"
+                if not self.cap or not self.cap.isOpened():
+                    self._new_video(path)
+
+                success = False
+                for _ in range(self.vid_stride):
+                    success = self.cap.grab()
+                    if not success:
+                        break  # end of video or failure
+
+                if success:
+                    success, im0 = self.cap.retrieve()
+                    if success:
+                        self.frame += 1
+                        paths.append(path)
+                        imgs.append(im0)
+                        info.append(f"video {self.count + 1}/{self.nf} (frame {self.frame}/{self.frames}) {path}: ")
+                        if self.frame == self.frames:  # end of video
+                            self.count += 1
+                            self.cap.release()
+                else:
+                    # Move to the next file if the current video ended or failed to open
+                    self.count += 1
+                    if self.cap:
+                        self.cap.release()
+                    if self.count < self.nf:
+                        self._new_video(self.files[self.count])
+            else:
+                # Handle image files (including HEIC)
+                self.mode = "image"
+                if path.split(".")[-1].lower() == "heic":
+                    # Load HEIC image using Pillow with pillow-heif
+                    check_requirements("pillow-heif")
+
+                    from pillow_heif import register_heif_opener
+
+                    register_heif_opener()  # Register HEIF opener with Pillow
+                    with Image.open(path) as img:
+                        im0 = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)  # convert image to BGR nparray
+                else:
+                    im0 = imread(path)  # BGR
+                if im0 is None:
+                    LOGGER.warning(f"WARNING ⚠️ Image Read Error {path}")
+                else:
+                    paths.append(path)
+                    imgs.append(im0)
+                    info.append(f"image {self.count + 1}/{self.nf} {path}: ")
+                self.count += 1  # move to the next file
+                if self.count >= self.ni:  # end of image list
+                    break
+
+        return paths, imgs, info
+
+    def _new_video(self, path):
+        """Creates a new video capture object for the given path and initializes video-related attributes."""
+        self.frame = 0
+        self.cap = cv2.VideoCapture(path)
+        self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
+        if not self.cap.isOpened():
+            raise FileNotFoundError(f"Failed to open video {path}")
+        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
+
+    def __len__(self):
+        """Returns the number of files (images and videos) in the dataset."""
+        return math.ceil(self.nf / self.bs)  # number of batches
+
+
+class LoadPilAndNumpy:
+    """
+    Load images from PIL and Numpy arrays for batch processing.
+
+    This class manages loading and pre-processing of image data from both PIL and Numpy formats. It performs basic
+    validation and format conversion to ensure that the images are in the required format for downstream processing.
+
+    Attributes:
+        paths (List[str]): List of image paths or autogenerated filenames.
+        im0 (List[np.ndarray]): List of images stored as Numpy arrays.
+        mode (str): Type of data being processed, set to 'image'.
+        bs (int): Batch size, equivalent to the length of `im0`.
+
+    Methods:
+        _single_check: Validate and format a single image to a Numpy array.
+
+    Examples:
+        >>> from PIL import Image
+        >>> import numpy as np
+        >>> pil_img = Image.new("RGB", (100, 100))
+        >>> np_img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+        >>> loader = LoadPilAndNumpy([pil_img, np_img])
+        >>> paths, images, _ = next(iter(loader))
+        >>> print(f"Loaded {len(images)} images")
+        Loaded 2 images
+    """
+
+    def __init__(self, im0):
+        """Initializes a loader for PIL and Numpy images, converting inputs to a standardized format."""
+        if not isinstance(im0, list):
+            im0 = [im0]
+        # use `image{i}.jpg` when Image.filename returns an empty path.
+        self.paths = [getattr(im, "filename", "") or f"image{i}.jpg" for i, im in enumerate(im0)]
+        self.im0 = [self._single_check(im) for im in im0]
+        self.mode = "image"
+        self.bs = len(self.im0)
+
+    @staticmethod
+    def _single_check(im):
+        """Validate and format an image to numpy array, ensuring RGB order and contiguous memory."""
+        assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
+        if isinstance(im, Image.Image):
+            if im.mode != "RGB":
+                im = im.convert("RGB")
+            im = np.asarray(im)[:, :, ::-1]
+            im = np.ascontiguousarray(im)  # contiguous
+        return im
+
+    def __len__(self):
+        """Returns the length of the 'im0' attribute, representing the number of loaded images."""
+        return len(self.im0)
+
+    def __next__(self):
+        """Returns the next batch of images, paths, and metadata for processing."""
+        if self.count == 1:  # loop only once as it's batch inference
+            raise StopIteration
+        self.count += 1
+        return self.paths, self.im0, [""] * self.bs
+
+    def __iter__(self):
+        """Iterates through PIL/numpy images, yielding paths, raw images, and metadata for processing."""
+        self.count = 0
+        return self
+
+
+class LoadTensor:
+    """
+    A class for loading and processing tensor data for object detection tasks.
+
+    This class handles the loading and pre-processing of image data from PyTorch tensors, preparing them for
+    further processing in object detection pipelines.
+
+    Attributes:
+        im0 (torch.Tensor): The input tensor containing the image(s) with shape (B, C, H, W).
+        bs (int): Batch size, inferred from the shape of `im0`.
+        mode (str): Current processing mode, set to 'image'.
+        paths (List[str]): List of image paths or auto-generated filenames.
+
+    Methods:
+        _single_check: Validates and formats an input tensor.
+
+    Examples:
+        >>> import torch
+        >>> tensor = torch.rand(1, 3, 640, 640)
+        >>> loader = LoadTensor(tensor)
+        >>> paths, images, info = next(iter(loader))
+        >>> print(f"Processed {len(images)} images")
+    """
+
+    def __init__(self, im0) -> None:
+        """Initialize LoadTensor object for processing torch.Tensor image data."""
+        self.im0 = self._single_check(im0)
+        self.bs = self.im0.shape[0]
+        self.mode = "image"
+        self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
+
+    @staticmethod
+    def _single_check(im, stride=32):
+        """Validates and formats a single image tensor, ensuring correct shape and normalization."""
+        s = (
+            f"WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) "
+            f"divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible."
+        )
+        if len(im.shape) != 4:
+            if len(im.shape) != 3:
+                raise ValueError(s)
+            LOGGER.warning(s)
+            im = im.unsqueeze(0)
+        if im.shape[2] % stride or im.shape[3] % stride:
+            raise ValueError(s)
+        if im.max() > 1.0 + torch.finfo(im.dtype).eps:  # torch.float32 eps is 1.2e-07
+            LOGGER.warning(
+                f"WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. "
+                f"Dividing input by 255."
+            )
+            im = im.float() / 255.0
+
+        return im
+
+    def __iter__(self):
+        """Yields an iterator object for iterating through tensor image data."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Yields the next batch of tensor images and metadata for processing."""
+        if self.count == 1:
+            raise StopIteration
+        self.count += 1
+        return self.paths, self.im0, [""] * self.bs
+
+    def __len__(self):
+        """Returns the batch size of the tensor input."""
+        return self.bs
+
+
+def autocast_list(source):
+    """Merges a list of sources into a list of numpy arrays or PIL images for Ultralytics prediction."""
+    files = []
+    for im in source:
+        if isinstance(im, (str, Path)):  # filename or uri
+            files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im))
+        elif isinstance(im, (Image.Image, np.ndarray)):  # PIL or np Image
+            files.append(im)
+        else:
+            raise TypeError(
+                f"type {type(im).__name__} is not a supported Ultralytics prediction source type. \n"
+                f"See https://docs.ultralytics.com/modes/predict for supported source types."
+            )
+
+    return files
+
+
+def get_best_youtube_url(url, method="pytube"):
+    """
+    Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
+
+    Args:
+        url (str): The URL of the YouTube video.
+        method (str): The method to use for extracting video info. Options are "pytube", "pafy", and "yt-dlp".
+            Defaults to "pytube".
+
+    Returns:
+        (str | None): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
+
+    Examples:
+        >>> url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+        >>> best_url = get_best_youtube_url(url)
+        >>> print(best_url)
+        https://rr4---sn-q4flrnek.googlevideo.com/videoplayback?expire=...
+
+    Notes:
+        - Requires additional libraries based on the chosen method: pytubefix, pafy, or yt-dlp.
+        - The function prioritizes streams with at least 1080p resolution when available.
+        - For the "yt-dlp" method, it looks for formats with video codec, no audio, and *.mp4 extension.
+    """
+    if method == "pytube":
+        # Switched from pytube to pytubefix to resolve https://github.com/pytube/pytube/issues/1954
+        check_requirements("pytubefix>=6.5.2")
+        from pytubefix import YouTube
+
+        streams = YouTube(url).streams.filter(file_extension="mp4", only_video=True)
+        streams = sorted(streams, key=lambda s: s.resolution or "0p", reverse=True)  # sort by resolution, None-safe
+        for stream in streams:
+            if stream.resolution and int(stream.resolution[:-1]) >= 1080:  # check if resolution is at least 1080p
+                return stream.url
+
+    elif method == "pafy":
+        check_requirements(("pafy", "youtube_dl==2020.12.2"))
+        import pafy  # noqa
+
+        return pafy.new(url).getbestvideo(preftype="mp4").url
+
+    elif method == "yt-dlp":
+        check_requirements("yt-dlp")
+        import yt_dlp
+
+        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
+            info_dict = ydl.extract_info(url, download=False)  # extract info
+        for f in reversed(info_dict.get("formats", [])):  # reversed because best is usually last
+            # Find a format with a video codec, no audio, *.mp4 extension, and at least 1920 px width or 1080 px height
+            good_size = (f.get("width") or 0) >= 1920 or (f.get("height") or 0) >= 1080
+            if good_size and f["vcodec"] != "none" and f["acodec"] == "none" and f["ext"] == "mp4":
+                return f.get("url")
+
+
+# Define constants
+LOADERS = (LoadStreams, LoadPilAndNumpy, LoadImagesAndVideos, LoadScreenshots)

+ 18 - 0
ultralytics/data/scripts/download_weights.sh

@@ -0,0 +1,18 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download latest models from https://github.com/ultralytics/assets/releases
+# Example usage: bash ultralytics/data/scripts/download_weights.sh
+# parent
+# └── weights
+#     ├── yolov8n.pt  ← downloads here
+#     ├── yolov8s.pt
+#     └── ...
+
+python - <<EOF
+from ultralytics.utils.downloads import attempt_download_asset
+
+assets = [f"yolov8{size}{suffix}.pt" for size in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose")]
+for x in assets:
+    attempt_download_asset(f"weights/{x}")
+
+EOF

+ 60 - 0
ultralytics/data/scripts/get_coco.sh

@@ -0,0 +1,60 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO 2017 dataset https://cocodataset.org
+# Example usage: bash data/scripts/get_coco.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco  ← downloads here
+
+# Arguments (optional) Usage: bash data/scripts/get_coco.sh --train --val --test --segments
+if [ "$#" -gt 0 ]; then
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    --test) test=true ;;
+    --segments) segments=true ;;
+    --sama) sama=true ;;
+    esac
+  done
+else
+  train=true
+  val=true
+  test=false
+  segments=false
+  sama=false
+fi
+
+# Download/unzip labels
+d='../datasets' # unzip directory
+url=https://github.com/ultralytics/assets/releases/download/v0.0.0/
+if [ "$segments" == "true" ]; then
+  f='coco2017labels-segments.zip' # 169 MB
+elif [ "$sama" == "true" ]; then
+  f='coco2017labels-segments-sama.zip' # 199 MB https://www.sama.com/sama-coco-dataset/
+else
+  f='coco2017labels.zip' # 46 MB
+fi
+echo 'Downloading' $url$f ' ...'
+curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+
+# Download/unzip images
+d='../datasets/coco/images' # unzip directory
+url=http://images.cocodataset.org/zips/
+if [ "$train" == "true" ]; then
+  f='train2017.zip' # 19G, 118k images
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+if [ "$val" == "true" ]; then
+  f='val2017.zip' # 1G, 5k images
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+if [ "$test" == "true" ]; then
+  f='test2017.zip' # 7G, 41k images (optional)
+  echo 'Downloading' $url$f '...'
+  curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+fi
+wait # finish background tasks

+ 17 - 0
ultralytics/data/scripts/get_coco128.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017)
+# Example usage: bash data/scripts/get_coco128.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco128  ← downloads here
+
+# Download/unzip images and labels
+d='../datasets' # unzip directory
+url=https://github.com/ultralytics/assets/releases/download/v0.0.0/
+f='coco128.zip' # or 'coco128-segments.zip', 68 MB
+echo 'Downloading' $url$f ' ...'
+curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
+
+wait # finish background tasks

+ 51 - 0
ultralytics/data/scripts/get_imagenet.sh

@@ -0,0 +1,51 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download ILSVRC2012 ImageNet dataset https://image-net.org
+# Example usage: bash data/scripts/get_imagenet.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── imagenet  ← downloads here
+
+# Arguments (optional) Usage: bash data/scripts/get_imagenet.sh --train --val
+if [ "$#" -gt 0 ]; then
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    esac
+  done
+else
+  train=true
+  val=true
+fi
+
+# Make dir
+d='../datasets/imagenet' # unzip directory
+mkdir -p $d && cd $d
+
+# Download/unzip train
+if [ "$train" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar # download 138G, 1281167 images
+  mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+  tar -xf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+  find . -name "*.tar" | while read NAME; do
+    mkdir -p "${NAME%.tar}"
+    tar -xf "${NAME}" -C "${NAME%.tar}"
+    rm -f "${NAME}"
+  done
+  cd ..
+fi
+
+# Download/unzip val
+if [ "$val" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar # download 6.3G, 50000 images
+  mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xf ILSVRC2012_img_val.tar
+  wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash # move into subdirs
+fi
+
+# Delete corrupted image (optional: PNG under JPEG name that may cause dataloaders to fail)
+# rm train/n04266014/n04266014_10835.JPEG
+
+# TFRecords (optional)
+# wget https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt

+ 298 - 0
ultralytics/data/split_dota.py

@@ -0,0 +1,298 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import itertools
+from glob import glob
+from math import ceil
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+from ultralytics.data.utils import exif_size, img2label_paths
+from ultralytics.utils.checks import check_requirements
+
+
+def bbox_iof(polygon1, bbox2, eps=1e-6):
+    """
+    Calculate Intersection over Foreground (IoF) between polygons and bounding boxes.
+
+    Args:
+        polygon1 (np.ndarray): Polygon coordinates, shape (n, 8).
+        bbox2 (np.ndarray): Bounding boxes, shape (m, 4).
+        eps (float, optional): Small value to prevent division by zero. Defaults to 1e-6.
+
+    Returns:
+        (np.ndarray): IoF scores, shape (n, 1) or (n, m) if bbox2 is (m, 4).
+
+    Note:
+        Polygon format: [x1, y1, x2, y2, x3, y3, x4, y4].
+        Bounding box format: [x_min, y_min, x_max, y_max].
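+
+    Examples:
+        >>> # Illustrative values only: a unit-square polygon fully inside a 2x2 box gives an IoF of 1.0
+        >>> import numpy as np
+        >>> from ultralytics.data.split_dota import bbox_iof
+        >>> poly = np.array([[0, 0, 1, 0, 1, 1, 0, 1]], dtype=np.float32)
+        >>> boxes = np.array([[0, 0, 2, 2]], dtype=np.float32)
+        >>> float(bbox_iof(poly, boxes)[0, 0])
+        1.0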
+    """
+    check_requirements("shapely")
+    from shapely.geometry import Polygon
+
+    polygon1 = polygon1.reshape(-1, 4, 2)
+    lt_point = np.min(polygon1, axis=-2)  # left-top
+    rb_point = np.max(polygon1, axis=-2)  # right-bottom
+    bbox1 = np.concatenate([lt_point, rb_point], axis=-1)
+
+    lt = np.maximum(bbox1[:, None, :2], bbox2[..., :2])
+    rb = np.minimum(bbox1[:, None, 2:], bbox2[..., 2:])
+    wh = np.clip(rb - lt, 0, np.inf)
+    h_overlaps = wh[..., 0] * wh[..., 1]
+
+    left, top, right, bottom = (bbox2[..., i] for i in range(4))
+    polygon2 = np.stack([left, top, right, top, right, bottom, left, bottom], axis=-1).reshape(-1, 4, 2)
+
+    sg_polys1 = [Polygon(p) for p in polygon1]
+    sg_polys2 = [Polygon(p) for p in polygon2]
+    overlaps = np.zeros(h_overlaps.shape)
+    for p in zip(*np.nonzero(h_overlaps)):
+        overlaps[p] = sg_polys1[p[0]].intersection(sg_polys2[p[-1]]).area
+    unions = np.array([p.area for p in sg_polys1], dtype=np.float32)
+    unions = unions[..., None]
+
+    unions = np.clip(unions, eps, np.inf)
+    outputs = overlaps / unions
+    if outputs.ndim == 1:
+        outputs = outputs[..., None]
+    return outputs
+
+
+def load_yolo_dota(data_root, split="train"):
+    """
+    Load DOTA dataset.
+
+    Args:
+        data_root (str): Data root.
+        split (str): The dataset split, either `train` or `val`.
+
+    Returns:
+        (List[dict]): A list of annotation dicts, each with keys `ori_size`, `label`, and `filepath`.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
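+
+    Examples:
+        >>> # Illustrative only; assumes a local DOTA-format dataset rooted at "DOTAv2" with the layout above
+        >>> from ultralytics.data.split_dota import load_yolo_dota
+        >>> annos = load_yolo_dota("DOTAv2", split="train")
+        >>> list(annos[0].keys())
+        ['ori_size', 'label', 'filepath']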
+    """
+    assert split in {"train", "val"}, f"Split must be 'train' or 'val', not {split}."
+    im_dir = Path(data_root) / "images" / split
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(Path(data_root) / "images" / split / "*"))
+    lb_files = img2label_paths(im_files)
+    annos = []
+    for im_file, lb_file in zip(im_files, lb_files):
+        w, h = exif_size(Image.open(im_file))
+        with open(lb_file) as f:
+            lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+            lb = np.array(lb, dtype=np.float32)
+        annos.append(dict(ori_size=(h, w), label=lb, filepath=im_file))
+    return annos
+
+
+def get_windows(im_size, crop_sizes=(1024,), gaps=(200,), im_rate_thr=0.6, eps=0.01):
+    """
+    Get the coordinates of windows.
+
+    Args:
+        im_size (tuple): Original image size, (h, w).
+        crop_sizes (List[int]): Crop sizes of the windows.
+        gaps (List[int]): Overlap between adjacent windows; the window step is crop_size - gap.
+        im_rate_thr (float): Threshold on the ratio of the image area inside a window to the window area.
+        eps (float): Epsilon value for math operations.
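+
+    Examples:
+        >>> # Minimal illustration: a 1024x1024 image is covered by a single default 1024-pixel window
+        >>> from ultralytics.data.split_dota import get_windows
+        >>> get_windows((1024, 1024)).tolist()
+        [[0, 0, 1024, 1024]]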
+    """
+    h, w = im_size
+    windows = []
+    for crop_size, gap in zip(crop_sizes, gaps):
+        assert crop_size > gap, f"invalid crop_size gap pair [{crop_size} {gap}]"
+        step = crop_size - gap
+
+        xn = 1 if w <= crop_size else ceil((w - crop_size) / step + 1)
+        xs = [step * i for i in range(xn)]
+        if len(xs) > 1 and xs[-1] + crop_size > w:
+            xs[-1] = w - crop_size
+
+        yn = 1 if h <= crop_size else ceil((h - crop_size) / step + 1)
+        ys = [step * i for i in range(yn)]
+        if len(ys) > 1 and ys[-1] + crop_size > h:
+            ys[-1] = h - crop_size
+
+        start = np.array(list(itertools.product(xs, ys)), dtype=np.int64)
+        stop = start + crop_size
+        windows.append(np.concatenate([start, stop], axis=1))
+    windows = np.concatenate(windows, axis=0)
+
+    im_in_wins = windows.copy()
+    im_in_wins[:, 0::2] = np.clip(im_in_wins[:, 0::2], 0, w)
+    im_in_wins[:, 1::2] = np.clip(im_in_wins[:, 1::2], 0, h)
+    im_areas = (im_in_wins[:, 2] - im_in_wins[:, 0]) * (im_in_wins[:, 3] - im_in_wins[:, 1])
+    win_areas = (windows[:, 2] - windows[:, 0]) * (windows[:, 3] - windows[:, 1])
+    im_rates = im_areas / win_areas
+    if not (im_rates > im_rate_thr).any():
+        max_rate = im_rates.max()
+        im_rates[abs(im_rates - max_rate) < eps] = 1
+    return windows[im_rates > im_rate_thr]
+
+
+def get_window_obj(anno, windows, iof_thr=0.7):
+    """Get objects for each window."""
+    h, w = anno["ori_size"]
+    label = anno["label"]
+    if len(label):
+        label[:, 1::2] *= w
+        label[:, 2::2] *= h
+        iofs = bbox_iof(label[:, 1:], windows)
+        # Labels are now in absolute pixels but still relative to the full image, not to each window
+        return [(label[iofs[:, i] >= iof_thr]) for i in range(len(windows))]  # window_anns
+    else:
+        return [np.zeros((0, 9), dtype=np.float32) for _ in range(len(windows))]  # window_anns
+
+
+def crop_and_save(anno, windows, window_objs, im_dir, lb_dir, allow_background_images=True):
+    """
+    Crop images and save new labels.
+
+    Args:
+        anno (dict): Annotation dict, including `filepath`, `label`, `ori_size` as its keys.
+        windows (list): A list of windows coordinates.
+        window_objs (list): A list of labels inside each window.
+        im_dir (str): The output directory path of images.
+        lb_dir (str): The output directory path of labels.
+        allow_background_images (bool): Whether to include background images without labels.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
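+
+    Examples:
+        >>> # Illustrative pipeline sketch; assumes a DOTA-format dataset at "DOTAv2" and existing output dirs
+        >>> from ultralytics.data.split_dota import crop_and_save, get_window_obj, get_windows, load_yolo_dota
+        >>> anno = load_yolo_dota("DOTAv2", split="train")[0]
+        >>> windows = get_windows(anno["ori_size"], crop_sizes=(1024,), gaps=(200,))
+        >>> window_objs = get_window_obj(anno, windows)
+        >>> crop_and_save(anno, windows, window_objs, "DOTAv2-split/images/train", "DOTAv2-split/labels/train")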
+    """
+    im = cv2.imread(anno["filepath"])
+    name = Path(anno["filepath"]).stem
+    for i, window in enumerate(windows):
+        x_start, y_start, x_stop, y_stop = window.tolist()
+        new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+        patch_im = im[y_start:y_stop, x_start:x_stop]
+        ph, pw = patch_im.shape[:2]
+
+        label = window_objs[i]
+        if len(label) or allow_background_images:
+            cv2.imwrite(str(Path(im_dir) / f"{new_name}.jpg"), patch_im)
+        if len(label):
+            label[:, 1::2] -= x_start
+            label[:, 2::2] -= y_start
+            label[:, 1::2] /= pw
+            label[:, 2::2] /= ph
+
+            with open(Path(lb_dir) / f"{new_name}.txt", "w") as f:
+                for lb in label:
+                    formatted_coords = [f"{coord:.6g}" for coord in lb[1:]]
+                    f.write(f"{int(lb[0])} {' '.join(formatted_coords)}\n")
+
+
+def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=(1024,), gaps=(200,)):
+    """
+    Split both images and labels.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - split
+                - labels
+                    - split
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - split
+                - labels
+                    - split
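+
+    Examples:
+        >>> # Illustrative only; assumes a DOTA-format dataset at "DOTAv2"
+        >>> from ultralytics.data.split_dota import split_images_and_labels
+        >>> split_images_and_labels(data_root="DOTAv2", save_dir="DOTAv2-split", split="train")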
+    """
+    im_dir = Path(save_dir) / "images" / split
+    im_dir.mkdir(parents=True, exist_ok=True)
+    lb_dir = Path(save_dir) / "labels" / split
+    lb_dir.mkdir(parents=True, exist_ok=True)
+
+    annos = load_yolo_dota(data_root, split=split)
+    for anno in tqdm(annos, total=len(annos), desc=split):
+        windows = get_windows(anno["ori_size"], crop_sizes, gaps)
+        window_objs = get_window_obj(anno, windows)
+        crop_and_save(anno, windows, window_objs, str(im_dir), str(lb_dir))
+
+
+def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
+    """
+    Split train and val set of DOTA.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - train
+                    - val
+                - labels
+                    - train
+                    - val
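+
+    Examples:
+        >>> # Illustrative only; assumes a DOTA-format dataset at "DOTAv2"; rates control multi-scale cropping
+        >>> from ultralytics.data.split_dota import split_trainval
+        >>> split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split", rates=(0.5, 1.0, 1.5))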
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    for split in ["train", "val"]:
+        split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)
+
+
+def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
+    """
+    Split test set of DOTA; labels are not included within this set.
+
+    Notes:
+        The directory structure assumed for the DOTA dataset:
+            - data_root
+                - images
+                    - test
+        and the output directory structure is:
+            - save_dir
+                - images
+                    - test
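+
+    Examples:
+        >>> # Illustrative only; assumes DOTA-format test images at "DOTAv2/images/test"
+        >>> from ultralytics.data.split_dota import split_test
+        >>> split_test(data_root="DOTAv2", save_dir="DOTAv2-split")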
+    """
+    crop_sizes, gaps = [], []
+    for r in rates:
+        crop_sizes.append(int(crop_size / r))
+        gaps.append(int(gap / r))
+    save_dir = Path(save_dir) / "images" / "test"
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    im_dir = Path(data_root) / "images" / "test"
+    assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
+    im_files = glob(str(im_dir / "*"))
+    for im_file in tqdm(im_files, total=len(im_files), desc="test"):
+        w, h = exif_size(Image.open(im_file))
+        windows = get_windows((h, w), crop_sizes=crop_sizes, gaps=gaps)
+        im = cv2.imread(im_file)
+        name = Path(im_file).stem
+        for window in windows:
+            x_start, y_start, x_stop, y_stop = window.tolist()
+            new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
+            patch_im = im[y_start:y_stop, x_start:x_stop]
+            cv2.imwrite(str(save_dir / f"{new_name}.jpg"), patch_im)
+
+
+if __name__ == "__main__":
+    split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split")
+    split_test(data_root="DOTAv2", save_dir="DOTAv2-split")

+ 721 - 0
ultralytics/data/utils.py

@@ -0,0 +1,721 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import hashlib
+import json
+import os
+import random
+import subprocess
+import time
+import zipfile
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from tarfile import is_tarfile
+
+import cv2
+import numpy as np
+from PIL import Image, ImageOps
+
+from ultralytics.nn.autobackend import check_class_names
+from ultralytics.utils import (
+    DATASETS_DIR,
+    LOGGER,
+    NUM_THREADS,
+    ROOT,
+    SETTINGS_FILE,
+    TQDM,
+    clean_url,
+    colorstr,
+    emojis,
+    is_dir_writeable,
+    yaml_load,
+    yaml_save,
+)
+from ultralytics.utils.checks import check_file, check_font, is_ascii
+from ultralytics.utils.downloads import download, safe_download, unzip_file
+from ultralytics.utils.ops import segments2boxes
+
+HELP_URL = "See https://docs.ultralytics.com/datasets for dataset formatting guidance."
+IMG_FORMATS = {"bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm", "heic"}  # image suffixes
+VID_FORMATS = {"asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv", "webm"}  # video suffixes
+PIN_MEMORY = str(os.getenv("PIN_MEMORY", True)).lower() == "true"  # global pin_memory for dataloaders
+FORMATS_HELP_MSG = f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
+
+
+def img2label_paths(img_paths):
+    """Define label paths as a function of image paths."""
+    sa, sb = f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}"  # /images/, /labels/ substrings
+    return [sb.join(x.rsplit(sa, 1)).rsplit(".", 1)[0] + ".txt" for x in img_paths]
+
+
+def get_hash(paths):
+    """Returns a single hash value of a list of paths (files or dirs)."""
+    size = sum(os.path.getsize(p) for p in paths if os.path.exists(p))  # sizes
+    h = hashlib.sha256(str(size).encode())  # hash sizes
+    h.update("".join(paths).encode())  # hash paths
+    return h.hexdigest()  # return hash
+
+
+def exif_size(img: Image.Image):
+    """Returns exif-corrected PIL size."""
+    s = img.size  # (width, height)
+    if img.format == "JPEG":  # only support JPEG images
+        try:
+            if exif := img.getexif():
+                rotation = exif.get(274, None)  # the EXIF key for the orientation tag is 274
+                if rotation in {6, 8}:  # rotation 270 or 90
+                    s = s[1], s[0]
+        except Exception:
+            pass
+    return s
+
+
+def verify_image(args):
+    """Verify one image."""
+    (im_file, cls), prefix = args
+    # Number (found, corrupt), message
+    nf, nc, msg = 0, 0, ""
+    try:
+        im = Image.open(im_file)
+        im.verify()  # PIL verify
+        shape = exif_size(im)  # image size
+        shape = (shape[1], shape[0])  # hw
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"Invalid image format {im.format}. {FORMATS_HELP_MSG}"
+        if im.format.lower() in {"jpg", "jpeg"}:
+            with open(im_file, "rb") as f:
+                f.seek(-2, 2)
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
+        nf = 1
+    except Exception as e:
+        nc = 1
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
+    return (im_file, cls), nf, nc, msg
+
+
+def verify_image_label(args):
+    """Verify one image-label pair."""
+    im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
+    # Number (missing, found, empty, corrupt), message, segments, keypoints
+    nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, "", [], None
+    try:
+        # Verify images
+        im = Image.open(im_file)
+        im.verify()  # PIL verify
+        shape = exif_size(im)  # image size
+        shape = (shape[1], shape[0])  # hw
+        assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+        assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}. {FORMATS_HELP_MSG}"
+        if im.format.lower() in {"jpg", "jpeg"}:
+            with open(im_file, "rb") as f:
+                f.seek(-2, 2)
+                if f.read() != b"\xff\xd9":  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
+
+        # Verify labels
+        if os.path.isfile(lb_file):
+            nf = 1  # label found
+            with open(lb_file) as f:
+                lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+                if any(len(x) > 6 for x in lb) and (not keypoint):  # is segment
+                    classes = np.array([x[0] for x in lb], dtype=np.float32)
+                    segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb]  # (cls, xy1...)
+                    lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1)  # (cls, xywh)
+                lb = np.array(lb, dtype=np.float32)
+            if nl := len(lb):
+                if keypoint:
+                    assert lb.shape[1] == (5 + nkpt * ndim), f"labels require {(5 + nkpt * ndim)} columns each"
+                    points = lb[:, 5:].reshape(-1, ndim)[:, :2]
+                else:
+                    assert lb.shape[1] == 5, f"labels require 5 columns, {lb.shape[1]} columns detected"
+                    points = lb[:, 1:]
+                assert points.max() <= 1, f"non-normalized or out of bounds coordinates {points[points > 1]}"
+                assert lb.min() >= 0, f"negative label values {lb[lb < 0]}"
+
+                # All labels
+                max_cls = lb[:, 0].max()  # max class id
+                assert max_cls <= num_cls, (
+                    f"Label class {int(max_cls)} exceeds dataset class count {num_cls}. "
+                    f"Possible class labels are 0-{num_cls - 1}"
+                )
+                _, i = np.unique(lb, axis=0, return_index=True)
+                if len(i) < nl:  # duplicate row check
+                    lb = lb[i]  # remove duplicates
+                    if segments:
+                        segments = [segments[x] for x in i]
+                    msg = f"{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed"
+            else:
+                ne = 1  # label empty
+                lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
+        else:
+            nm = 1  # label missing
+            lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
+        if keypoint:
+            keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
+            if ndim == 2:
+                kpt_mask = np.where((keypoints[..., 0] < 0) | (keypoints[..., 1] < 0), 0.0, 1.0).astype(np.float32)
+                keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1)  # (nl, nkpt, 3)
+        lb = lb[:, :5]
+        return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
+    except Exception as e:
+        nc = 1
+        msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
+        return [None, None, None, None, None, nm, nf, ne, nc, msg]
+
+
+def visualize_image_annotations(image_path, txt_path, label_map):
+    """
+    Visualizes YOLO annotations (bounding boxes and class labels) on an image.
+
+    This function reads an image and its corresponding annotation file in YOLO format, then
+    draws bounding boxes around detected objects and labels them with their respective class names.
+    The bounding box colors are assigned based on the class ID, and the text color is dynamically
+    adjusted for readability, depending on the background color's luminance.
+
+    Args:
+        image_path (str): The path to the image file to annotate; any format supported by PIL (e.g., .jpg, .png).
+        txt_path (str): The path to the annotation file in YOLO format, which should contain one line per object with:
+                        - class_id (int): The class index.
+                        - x_center (float): The X center of the bounding box (relative to image width).
+                        - y_center (float): The Y center of the bounding box (relative to image height).
+                        - width (float): The width of the bounding box (relative to image width).
+                        - height (float): The height of the bounding box (relative to image height).
+        label_map (dict): A dictionary that maps class IDs (integers) to class labels (strings).
+
+    Example:
+        >>> label_map = {0: "cat", 1: "dog", 2: "bird"}  # Should map every class ID present in the annotations
+        >>> visualize_image_annotations("path/to/image.jpg", "path/to/annotations.txt", label_map)
+    """
+    import matplotlib.pyplot as plt
+
+    from ultralytics.utils.plotting import colors
+
+    img = np.array(Image.open(image_path))
+    img_height, img_width = img.shape[:2]
+    annotations = []
+    with open(txt_path) as file:
+        for line in file:
+            class_id, x_center, y_center, width, height = map(float, line.split())
+            x = (x_center - width / 2) * img_width
+            y = (y_center - height / 2) * img_height
+            w = width * img_width
+            h = height * img_height
+            annotations.append((x, y, w, h, int(class_id)))
+    fig, ax = plt.subplots(1)  # Plot the image and annotations
+    for x, y, w, h, label in annotations:
+        color = tuple(c / 255 for c in colors(label, True))  # Get and normalize the RGB color
+        rect = plt.Rectangle((x, y), w, h, linewidth=2, edgecolor=color, facecolor="none")  # Create a rectangle
+        ax.add_patch(rect)
+        luminance = 0.2126 * color[0] + 0.7152 * color[1] + 0.0722 * color[2]  # Formula for luminance
+        ax.text(x, y - 5, label_map[label], color="white" if luminance < 0.5 else "black", backgroundcolor=color)
+    ax.imshow(img)
+    plt.show()
+
+
+def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
+    """
+    Convert a list of polygons to a binary mask of the specified image size.
+
+    Args:
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is a flattened array of (x, y) coordinates
+                                     [x1, y1, x2, y2, ...] of even length M, so the list stacks to shape [N, M].
+        color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
+        downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A binary mask of the specified image size with the polygons filled in.
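+
+    Examples:
+        >>> # Illustrative only: rasterize one square polygon onto a 10x10 mask
+        >>> import numpy as np
+        >>> from ultralytics.data.utils import polygon2mask
+        >>> polygon = [np.array([1, 1, 8, 1, 8, 8, 1, 8], dtype=np.float32)]
+        >>> polygon2mask((10, 10), polygon, color=1).shape
+        (10, 10)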
+    """
+    mask = np.zeros(imgsz, dtype=np.uint8)
+    polygons = np.asarray(polygons, dtype=np.int32)
+    polygons = polygons.reshape((polygons.shape[0], -1, 2))
+    cv2.fillPoly(mask, polygons, color=color)
+    nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
+    # Note: fillPoly first then resize is trying to keep the same loss calculation method when mask-ratio=1
+    return cv2.resize(mask, (nw, nh))
+
+
+def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
+    """
+    Convert a list of polygons to a set of binary masks of the specified image size.
+
+    Args:
+        imgsz (tuple): The size of the image as (height, width).
+        polygons (list[np.ndarray]): A list of polygons. Each polygon is a flattened array of (x, y) coordinates
+                                     [x1, y1, x2, y2, ...] of even length M, so the list stacks to shape [N, M].
+        color (int): The color value to fill in the polygons on the masks.
+        downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
+
+    Returns:
+        (np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
+    """
+    return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
+
+
+def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
+    """Return a (640, 640) overlap mask."""
+    masks = np.zeros(
+        (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
+        dtype=np.int32 if len(segments) > 255 else np.uint8,
+    )
+    areas = []
+    ms = []
+    for si in range(len(segments)):
+        mask = polygon2mask(imgsz, [segments[si].reshape(-1)], downsample_ratio=downsample_ratio, color=1)
+        ms.append(mask.astype(masks.dtype))
+        areas.append(mask.sum())
+    areas = np.asarray(areas)
+    index = np.argsort(-areas)
+    ms = np.array(ms)[index]
+    for i in range(len(segments)):
+        mask = ms[i] * (i + 1)
+        masks = masks + mask
+        masks = np.clip(masks, a_min=0, a_max=i + 1)
+    return masks, index
+
+
+def find_dataset_yaml(path: Path) -> Path:
+    """
+    Find and return the YAML file associated with a Detect, Segment or Pose dataset.
+
+    This function searches for a YAML file at the root level of the provided directory first, and if not found, it
+    performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
+    is raised if no YAML file is found or if multiple YAML files are found.
+
+    Args:
+        path (Path): The directory path to search for the YAML file.
+
+    Returns:
+        (Path): The path of the found YAML file.
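+
+    Examples:
+        >>> # Illustrative only; assumes an unzipped dataset directory containing a single "coco8.yaml"
+        >>> yaml_path = find_dataset_yaml(Path("../datasets/coco8"))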
+    """
+    files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml"))  # try root level first and then recursive
+    assert files, f"No YAML file found in '{path.resolve()}'"
+    if len(files) > 1:
+        files = [f for f in files if f.stem == path.stem]  # prefer *.yaml files that match
+    assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', but found {len(files)}.\n{files}"
+    return files[0]
+
+
+def check_det_dataset(dataset, autodownload=True):
+    """
+    Download, verify, and/or unzip a dataset if not found locally.
+
+    This function checks the availability of a specified dataset, and if not found, it has the option to download and
+    unzip the dataset. It then reads and parses the accompanying YAML data, ensuring key requirements are met and also
+    resolves paths related to the dataset.
+
+    Args:
+        dataset (str): Path to the dataset or dataset descriptor (like a YAML file).
+        autodownload (bool, optional): Whether to automatically download the dataset if not found. Defaults to True.
+
+    Returns:
+        (dict): Parsed dataset information and paths.
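+
+    Examples:
+        >>> # Illustrative only; "coco8.yaml" is a small bundled dataset descriptor that is auto-downloaded if missing
+        >>> data = check_det_dataset("coco8.yaml")
+        >>> data["nc"]
+        80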
+    """
+    file = check_file(dataset)
+
+    # Download (optional)
+    extract_dir = ""
+    if zipfile.is_zipfile(file) or is_tarfile(file):
+        new_dir = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
+        file = find_dataset_yaml(DATASETS_DIR / new_dir)
+        extract_dir, autodownload = file.parent, False
+
+    # Read YAML
+    data = yaml_load(file, append_filename=True)  # dictionary
+
+    # Checks
+    for k in "train", "val":
+        if k not in data:
+            if k != "val" or "validation" not in data:
+                raise SyntaxError(
+                    emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs.")
+                )
+            LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
+            data["val"] = data.pop("validation")  # replace 'validation' key with 'val' key
+    if "names" not in data and "nc" not in data:
+        raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
+    if "names" in data and "nc" in data and len(data["names"]) != data["nc"]:
+        raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
+    if "names" not in data:
+        data["names"] = [f"class_{i}" for i in range(data["nc"])]
+    else:
+        data["nc"] = len(data["names"])
+
+    data["names"] = check_class_names(data["names"])
+
+    # Resolve paths
+    path = Path(extract_dir or data.get("path") or Path(data.get("yaml_file", "")).parent)  # dataset root
+    if not path.is_absolute():
+        path = (DATASETS_DIR / path).resolve()
+
+    # Set paths
+    data["path"] = path  # download scripts
+    for k in "train", "val", "test", "minival":
+        if data.get(k):  # prepend path
+            if isinstance(data[k], str):
+                x = (path / data[k]).resolve()
+                if not x.exists() and data[k].startswith("../"):
+                    x = (path / data[k][3:]).resolve()
+                data[k] = str(x)
+            else:
+                data[k] = [str((path / x).resolve()) for x in data[k]]
+
+    # Parse YAML
+    val, s = (data.get(x) for x in ("val", "download"))
+    if val:
+        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
+        if not all(x.exists() for x in val):
+            name = clean_url(dataset)  # dataset name with URL auth stripped
+            m = f"\nDataset '{name}' images not found ⚠️, missing path '{[x for x in val if not x.exists()][0]}'"
+            if s and autodownload:
+                LOGGER.warning(m)
+            else:
+                m += f"\nNote dataset download directory is '{DATASETS_DIR}'. You can update this in '{SETTINGS_FILE}'"
+                raise FileNotFoundError(m)
+            t = time.time()
+            r = None  # success
+            if s.startswith("http") and s.endswith(".zip"):  # URL
+                safe_download(url=s, dir=DATASETS_DIR, delete=True)
+            elif s.startswith("bash "):  # bash script
+                LOGGER.info(f"Running {s} ...")
+                r = os.system(s)
+            else:  # python script
+                exec(s, {"yaml": data})
+            dt = f"({round(time.time() - t, 1)}s)"
+            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in {0, None} else f"failure {dt} ❌"
+            LOGGER.info(f"Dataset download {s}\n")
+    check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf")  # download fonts
+
+    return data  # dictionary
+
+
+def check_cls_dataset(dataset, split=""):
+    """
+    Checks a classification dataset such as Imagenet.
+
+    This function accepts a `dataset` name and attempts to retrieve the corresponding dataset information.
+    If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.
+
+    Args:
+        dataset (str | Path): The name of the dataset.
+        split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
+
+    Returns:
+        (dict): A dictionary containing the following keys:
+            - 'train' (Path): The directory path containing the training set of the dataset.
+            - 'val' (Path): The directory path containing the validation set of the dataset.
+            - 'test' (Path): The directory path containing the test set of the dataset.
+            - 'nc' (int): The number of classes in the dataset.
+            - 'names' (dict): A dictionary of class names in the dataset.
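+
+    Examples:
+        >>> # Illustrative only; assumes the small "imagenet10" classification dataset can be downloaded
+        >>> data = check_cls_dataset("imagenet10")
+        >>> data["nc"]
+        10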
+    """
+    # Download (optional if dataset=https://file.zip is passed directly)
+    if str(dataset).startswith(("http:/", "https:/")):
+        dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
+    elif Path(dataset).suffix in {".zip", ".tar", ".gz"}:
+        file = check_file(dataset)
+        dataset = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
+
+    dataset = Path(dataset)
+    data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
+    if not data_dir.is_dir():
+        LOGGER.warning(f"\nDataset not found ⚠️, missing path {data_dir}, attempting download...")
+        t = time.time()
+        if str(dataset) == "imagenet":
+            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
+        else:
+            url = f"https://github.com/ultralytics/assets/releases/download/v0.0.0/{dataset}.zip"
+            download(url, dir=data_dir.parent)
+        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
+        LOGGER.info(s)
+    train_set = data_dir / "train"
+    val_set = (
+        data_dir / "val"
+        if (data_dir / "val").exists()
+        else data_dir / "validation"
+        if (data_dir / "validation").exists()
+        else None
+    )  # data/val or data/validation
+    test_set = data_dir / "test" if (data_dir / "test").exists() else None  # data/test if it exists
+    if split == "val" and not val_set:
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
+    elif split == "test" and not test_set:
+        LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
+
+    nc = len([x for x in (data_dir / "train").glob("*") if x.is_dir()])  # number of classes
+    names = [x.name for x in (data_dir / "train").iterdir() if x.is_dir()]  # class names list
+    names = dict(enumerate(sorted(names)))
+
+    # Print to console
+    for k, v in {"train": train_set, "val": val_set, "test": test_set}.items():
+        prefix = f"{colorstr(f'{k}:')} {v}..."
+        if v is None:
+            LOGGER.info(prefix)
+        else:
+            files = [path for path in v.rglob("*.*") if path.suffix[1:].lower() in IMG_FORMATS]
+            nf = len(files)  # number of files
+            nd = len({file.parent for file in files})  # number of directories
+            if nf == 0:
+                if k == "train":
+                    raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
+                else:
+                    LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found")
+            elif nd != nc:
+                LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}")
+            else:
+                LOGGER.info(f"{prefix} found {nf} images in {nd} classes ✅ ")
+
+    return {"train": train_set, "val": val_set, "test": test_set, "nc": nc, "names": names}
+
+
+class HUBDatasetStats:
+    """
+    A class for generating HUB dataset JSON and `-hub` dataset directory.
+
+    Args:
+        path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco8.yaml'.
+        task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
+        autodownload (bool): Attempt to download dataset if not found locally. Default is False.
+
+    Example:
+        Download *.zip files from https://github.com/ultralytics/hub/tree/main/example_datasets
+            i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip for coco8.zip.
+        ```python
+        from ultralytics.data.utils import HUBDatasetStats
+
+        stats = HUBDatasetStats("path/to/coco8.zip", task="detect")  # detect dataset
+        stats = HUBDatasetStats("path/to/coco8-seg.zip", task="segment")  # segment dataset
+        stats = HUBDatasetStats("path/to/coco8-pose.zip", task="pose")  # pose dataset
+        stats = HUBDatasetStats("path/to/dota8.zip", task="obb")  # OBB dataset
+        stats = HUBDatasetStats("path/to/imagenet10.zip", task="classify")  # classification dataset
+
+        stats.get_json(save=True)
+        stats.process_images()
+        ```
+    """
+
+    def __init__(self, path="coco8.yaml", task="detect", autodownload=False):
+        """Initialize class."""
+        path = Path(path).resolve()
+        LOGGER.info(f"Starting HUB dataset checks for {path}....")
+
+        self.task = task  # detect, segment, pose, classify, obb
+        if self.task == "classify":
+            unzip_dir = unzip_file(path)
+            data = check_cls_dataset(unzip_dir)
+            data["path"] = unzip_dir
+        else:  # detect, segment, pose, obb
+            _, data_dir, yaml_path = self._unzip(Path(path))
+            try:
+                # Load YAML with checks
+                data = yaml_load(yaml_path)
+                data["path"] = ""  # strip path since YAML should be in dataset root for all HUB datasets
+                yaml_save(yaml_path, data)
+                data = check_det_dataset(yaml_path, autodownload)  # dict
+                data["path"] = data_dir  # YAML path should be set to '' (relative) or parent (absolute)
+            except Exception as e:
+                raise Exception("error/HUB/dataset_stats/init") from e
+
+        self.hub_dir = Path(f"{data['path']}-hub")
+        self.im_dir = self.hub_dir / "images"
+        self.stats = {"nc": len(data["names"]), "names": list(data["names"].values())}  # statistics dictionary
+        self.data = data
+
+    @staticmethod
+    def _unzip(path):
+        """Unzip data.zip."""
+        if not str(path).endswith(".zip"):  # path is data.yaml
+            return False, None, path
+        unzip_dir = unzip_file(path, path=path.parent)
+        assert unzip_dir.is_dir(), (
+            f"Error unzipping {path}, {unzip_dir} not found. path/to/abc.zip MUST unzip to path/to/abc/"
+        )
+        return True, str(unzip_dir), find_dataset_yaml(unzip_dir)  # zipped, data_dir, yaml_path
+
+    def _hub_ops(self, f):
+        """Saves a compressed image for HUB previews."""
+        compress_one_image(f, self.im_dir / Path(f).name)  # save to dataset-hub
+
+    def get_json(self, save=False, verbose=False):
+        """Return dataset JSON for Ultralytics HUB."""
+
+        def _round(labels):
+            """Update labels to integer class and 4 decimal place floats."""
+            if self.task == "detect":
+                coordinates = labels["bboxes"]
+            elif self.task in {"segment", "obb"}:  # Segment and OBB use segments. OBB segments are normalized xyxyxyxy
+                coordinates = [x.flatten() for x in labels["segments"]]
+            elif self.task == "pose":
+                n, nk, nd = labels["keypoints"].shape
+                coordinates = np.concatenate((labels["bboxes"], labels["keypoints"].reshape(n, nk * nd)), 1)
+            else:
+                raise ValueError(f"Undefined dataset task={self.task}.")
+            zipped = zip(labels["cls"], coordinates)
+            return [[int(c[0]), *(round(float(x), 4) for x in points)] for c, points in zipped]
+
+        for split in "train", "val", "test":
+            self.stats[split] = None  # predefine
+            path = self.data.get(split)
+
+            # Check split
+            if path is None:  # no split
+                continue
+            files = [f for f in Path(path).rglob("*.*") if f.suffix[1:].lower() in IMG_FORMATS]  # image files in split
+            if not files:  # no images
+                continue
+
+            # Get dataset statistics
+            if self.task == "classify":
+                from torchvision.datasets import ImageFolder  # scope for faster 'import ultralytics'
+
+                dataset = ImageFolder(self.data[split])
+
+                x = np.zeros(len(dataset.classes)).astype(int)
+                for im in dataset.imgs:
+                    x[im[1]] += 1
+
+                self.stats[split] = {
+                    "instance_stats": {"total": len(dataset), "per_class": x.tolist()},
+                    "image_stats": {"total": len(dataset), "unlabelled": 0, "per_class": x.tolist()},
+                    "labels": [{Path(k).name: v} for k, v in dataset.imgs],
+                }
+            else:
+                from ultralytics.data import YOLODataset
+
+                dataset = YOLODataset(img_path=self.data[split], data=self.data, task=self.task)
+                x = np.array(
+                    [
+                        np.bincount(label["cls"].astype(int).flatten(), minlength=self.data["nc"])
+                        for label in TQDM(dataset.labels, total=len(dataset), desc="Statistics")
+                    ]
+                )  # shape(128x80)
+                self.stats[split] = {
+                    "instance_stats": {"total": int(x.sum()), "per_class": x.sum(0).tolist()},
+                    "image_stats": {
+                        "total": len(dataset),
+                        "unlabelled": int(np.all(x == 0, 1).sum()),
+                        "per_class": (x > 0).sum(0).tolist(),
+                    },
+                    "labels": [{Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)],
+                }
+
+        # Save, print and return
+        if save:
+            self.hub_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/
+            stats_path = self.hub_dir / "stats.json"
+            LOGGER.info(f"Saving {stats_path.resolve()}...")
+            with open(stats_path, "w") as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        """Compress images for Ultralytics HUB."""
+        from ultralytics.data import YOLODataset  # ClassificationDataset
+
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes dataset-hub/images/
+        for split in "train", "val", "test":
+            if self.data.get(split) is None:
+                continue
+            dataset = YOLODataset(img_path=self.data[split], data=self.data)
+            with ThreadPool(NUM_THREADS) as pool:
+                for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f"{split} images"):
+                    pass
+        LOGGER.info(f"Done. All images saved to {self.im_dir}")
+        return self.im_dir
+
+
+def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
+    """
+    Compresses a single image file to a reduced size while preserving its aspect ratio and quality using either the
+    Python Imaging Library (PIL) or the OpenCV library. If the input image is smaller than the maximum dimension, it
+    will not be resized.
+
+    Args:
+        f (str): The path to the input image file.
+        f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
+        max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
+        quality (int, optional): The image compression quality as a percentage. Default is 50%.
+
+    Example:
+        ```python
+        from pathlib import Path
+        from ultralytics.data.utils import compress_one_image
+
+        for f in Path("path/to/dataset").rglob("*.jpg"):
+            compress_one_image(f)
+        ```
+    """
+    try:  # use PIL
+        im = Image.open(f)
+        r = max_dim / max(im.height, im.width)  # ratio
+        if r < 1.0:  # image too large
+            im = im.resize((int(im.width * r), int(im.height * r)))
+        im.save(f_new or f, "JPEG", quality=quality, optimize=True)  # save
+    except Exception as e:  # use OpenCV
+        LOGGER.info(f"WARNING ⚠️ HUB ops PIL failure {f}: {e}")
+        im = cv2.imread(f)
+        im_height, im_width = im.shape[:2]
+        r = max_dim / max(im_height, im_width)  # ratio
+        if r < 1.0:  # image too large
+            im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
+        cv2.imwrite(str(f_new or f), im)
+
+
+def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annotated_only=False):
+    """
+    Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
+
+    Args:
+        path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco8/images'.
+        weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
+        annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
+
+    Example:
+        ```python
+        from ultralytics.data.utils import autosplit
+
+        autosplit()
+        ```
+    """
+    path = Path(path)  # images dir
+    files = sorted(x for x in path.rglob("*.*") if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
+    n = len(files)  # number of files
+    random.seed(0)  # for reproducibility
+    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split
+
+    txt = ["autosplit_train.txt", "autosplit_val.txt", "autosplit_test.txt"]  # 3 txt files
+    for x in txt:
+        if (path.parent / x).exists():
+            (path.parent / x).unlink()  # remove existing
+
+    LOGGER.info(f"Autosplitting images from {path}" + ", using *.txt labeled images only" * annotated_only)
+    for i, img in TQDM(zip(indices, files), total=n):
+        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
+            with open(path.parent / txt[i], "a") as f:
+                f.write(f"./{img.relative_to(path.parent).as_posix()}" + "\n")  # add image to txt file
+
+
+def load_dataset_cache_file(path):
+    """Load an Ultralytics *.cache dictionary from path."""
+    import gc
+
+    gc.disable()  # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
+    cache = np.load(str(path), allow_pickle=True).item()  # load dict
+    gc.enable()
+    return cache
+
+
+def save_dataset_cache_file(prefix, path, x, version):
+    """Save an Ultralytics dataset *.cache dictionary x to path."""
+    x["version"] = version  # add cache version
+    if is_dir_writeable(path.parent):
+        if path.exists():
+            path.unlink()  # remove *.cache file if exists
+        np.save(str(path), x)  # save cache for next time
+        path.with_suffix(".cache.npy").rename(path)  # remove .npy suffix
+        LOGGER.info(f"{prefix}New cache created: {path}")
+    else:
+        LOGGER.warning(f"{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.")