# Copyright (c) OpenMMLab. All rights reserved.
import itertools
from typing import Iterator, List, Optional, Sized, Union

import numpy as np
import torch
from mmengine.dataset import BaseDataset
from mmengine.dist import get_dist_info, sync_random_seed
from torch.utils.data import Sampler

from mmdet.registry import DATA_SAMPLERS


@DATA_SAMPLERS.register_module()
class MultiSourceSampler(Sampler):
    r"""Multi-Source Infinite Sampler.

    According to the sampling ratio, sample data from different
    datasets to form batches.

    Args:
        dataset (Sized): The dataset.
        batch_size (int): Size of mini-batch.
        source_ratio (list[int | float]): The sampling ratio of different
            source datasets in a mini-batch.
        shuffle (bool): Whether to shuffle the dataset or not.
            Defaults to True.
        seed (int, optional): Random seed. If None, set a random seed.
            Defaults to None.

    Examples:
        >>> dataset_type = 'ConcatDataset'
        >>> sub_dataset_type = 'CocoDataset'
        >>> data_root = 'data/coco/'
        >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json'
        >>> unsup_ann = '../coco_semi_annos/' \
        >>>     'instances_train2017.1@10-unlabeled.json'
        >>> dataset = dict(type=dataset_type,
        >>>     datasets=[
        >>>         dict(
        >>>             type=sub_dataset_type,
        >>>             data_root=data_root,
        >>>             ann_file=sup_ann,
        >>>             data_prefix=dict(img='train2017/'),
        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
        >>>             pipeline=sup_pipeline),
        >>>         dict(
        >>>             type=sub_dataset_type,
        >>>             data_root=data_root,
        >>>             ann_file=unsup_ann,
        >>>             data_prefix=dict(img='train2017/'),
        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
        >>>             pipeline=unsup_pipeline),
        >>>     ])
        >>> train_dataloader = dict(
        >>>     batch_size=5,
        >>>     num_workers=5,
        >>>     persistent_workers=True,
        >>>     sampler=dict(type='MultiSourceSampler',
        >>>         batch_size=5, source_ratio=[1, 4]),
        >>>     batch_sampler=None,
        >>>     dataset=dataset)
    """

    def __init__(self,
                 dataset: Sized,
                 batch_size: int,
                 source_ratio: List[Union[int, float]],
                 shuffle: bool = True,
                 seed: Optional[int] = None) -> None:

        assert hasattr(dataset, 'cumulative_sizes'), \
            f'The dataset must be ConcatDataset, but got {dataset}'
        assert isinstance(batch_size, int) and batch_size > 0, \
            'batch_size must be a positive integer value, ' \
            f'but got batch_size={batch_size}'
        assert isinstance(source_ratio, list), \
            f'source_ratio must be a list, but got source_ratio={source_ratio}'
        assert len(source_ratio) == len(dataset.cumulative_sizes), \
            'The length of source_ratio must be equal to ' \
            f'the number of datasets, but got source_ratio={source_ratio}'

        rank, world_size = get_dist_info()
        self.rank = rank
        self.world_size = world_size

        self.dataset = dataset
        self.cumulative_sizes = [0] + dataset.cumulative_sizes
        self.batch_size = batch_size
        self.source_ratio = source_ratio

        # Number of samples drawn from each source per mini-batch, rounded
        # down; the remainder is assigned to the first source so the counts
        # always sum to batch_size.
        self.num_per_source = [
            int(batch_size * sr / sum(source_ratio)) for sr in source_ratio
        ]
        self.num_per_source[0] = batch_size - sum(self.num_per_source[1:])

        assert sum(self.num_per_source) == batch_size, \
            'The sum of num_per_source must be equal to ' \
            f'batch_size, but got {self.num_per_source}'

        self.seed = sync_random_seed() if seed is None else seed
        self.shuffle = shuffle
        # One infinite, rank-sliced index stream per source dataset.
        self.source2inds = {
            source: self._indices_of_rank(len(ds))
            for source, ds in enumerate(dataset.datasets)
        }
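
    # A worked example of the batch split above (a sketch, not runtime
    # logic): with batch_size=8 and source_ratio=[1, 2], the floored shares
    # are int(8 * 1 / 3) = 2 and int(8 * 2 / 3) = 5; the first entry is then
    # bumped to 8 - 5 = 3, giving num_per_source == [3, 5].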

    def _infinite_indices(self, sample_size: int) -> Iterator[int]:
        """Infinitely yield a sequence of indices."""
        g = torch.Generator()
        g.manual_seed(self.seed)
        while True:
            if self.shuffle:
                yield from torch.randperm(sample_size, generator=g).tolist()
            else:
                yield from torch.arange(sample_size).tolist()

    def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
        """Slice the infinite indices by rank."""
        yield from itertools.islice(
            self._infinite_indices(sample_size), self.rank, None,
            self.world_size)
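
    # For example (a sketch assuming world_size=2, sample_size=6 and
    # shuffle=False): the infinite stream is 0, 1, 2, 3, 4, 5, 0, 1, ...;
    # rank 0 receives 0, 2, 4, 0, ... and rank 1 receives 1, 3, 5, 1, ...,
    # so each rank sees a disjoint, evenly strided slice of the stream.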

    def __iter__(self) -> Iterator[int]:
        batch_buffer = []
        while True:
            for source, num in enumerate(self.num_per_source):
                batch_buffer_per_source = []
                for idx in self.source2inds[source]:
                    # Offset the per-source index into the ConcatDataset's
                    # global index space.
                    idx += self.cumulative_sizes[source]
                    batch_buffer_per_source.append(idx)
                    if len(batch_buffer_per_source) == num:
                        batch_buffer += batch_buffer_per_source
                        break
            yield from batch_buffer
            batch_buffer = []

    def __len__(self) -> int:
        return len(self.dataset)

    def set_epoch(self, epoch: int) -> None:
        """Not supported in `epoch-based runner`; the sampler is infinite
        and has no notion of epoch."""
        pass
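

# A minimal usage sketch for ``MultiSourceSampler`` (``DummyDataset`` is a
# hypothetical ``BaseDataset`` subclass; in real configs the sampler is
# built from a dict by the runner, as in the class docstring above):
#
# >>> from mmengine.dataset import ConcatDataset
# >>> concat = ConcatDataset([DummyDataset(num=10), DummyDataset(num=40)])
# >>> sampler = MultiSourceSampler(concat, batch_size=5, source_ratio=[1, 4])
# >>> batch = list(itertools.islice(iter(sampler), 5))  # one mini-batch
# >>> # 1 index comes from the first source and 4 from the second, all in
# >>> # the ConcatDataset's global index space.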


@DATA_SAMPLERS.register_module()
class GroupMultiSourceSampler(MultiSourceSampler):
    r"""Group Multi-Source Infinite Sampler.

    According to the sampling ratio, sample data from different
    datasets but the same group to form batches.

    Args:
        dataset (BaseDataset): The dataset.
        batch_size (int): Size of mini-batch.
        source_ratio (list[int | float]): The sampling ratio of different
            source datasets in a mini-batch.
        shuffle (bool): Whether to shuffle the dataset or not.
            Defaults to True.
        seed (int, optional): Random seed. If None, set a random seed.
            Defaults to None.
    """

    def __init__(self,
                 dataset: BaseDataset,
                 batch_size: int,
                 source_ratio: List[Union[int, float]],
                 shuffle: bool = True,
                 seed: Optional[int] = None) -> None:
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            source_ratio=source_ratio,
            shuffle=shuffle,
            seed=seed)

        self._get_source_group_info()
        # One infinite, rank-sliced index stream per (group, source) pair.
        self.group_source2inds = [{
            source:
            self._indices_of_rank(self.group2size_per_source[source][group])
            for source in range(len(dataset.datasets))
        } for group in range(len(self.group_ratio))]

    def _get_source_group_info(self) -> None:
        # Group 0 collects portrait images (width < height) and group 1
        # collects landscape images. Note that the structures below are
        # initialized for exactly two source datasets.
        self.group2size_per_source = [{0: 0, 1: 0}, {0: 0, 1: 0}]
        self.group2inds_per_source = [{0: [], 1: []}, {0: [], 1: []}]
        for source, dataset in enumerate(self.dataset.datasets):
            for idx in range(len(dataset)):
                data_info = dataset.get_data_info(idx)
                width, height = data_info['width'], data_info['height']
                group = 0 if width < height else 1
                self.group2size_per_source[source][group] += 1
                self.group2inds_per_source[source][group].append(idx)

        self.group_sizes = np.zeros(2, dtype=np.int64)
        for group2size in self.group2size_per_source:
            for group, size in group2size.items():
                self.group_sizes[group] += size
        self.group_ratio = self.group_sizes / sum(self.group_sizes)
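
    # A worked example of the grouping above (a sketch with made-up counts):
    # if source 0 holds 3 portrait and 7 landscape images and source 1 holds
    # 5 of each, then group_sizes == [8, 12] and group_ratio == [0.4, 0.6],
    # so roughly 40% of batches are drawn from the portrait group.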

    def __iter__(self) -> Iterator[int]:
        batch_buffer = []
        while True:
            # Pick one aspect-ratio group for the whole batch, weighted by
            # how common each group is across all sources.
            group = np.random.choice(
                list(range(len(self.group_ratio))), p=self.group_ratio)
            for source, num in enumerate(self.num_per_source):
                batch_buffer_per_source = []
                for idx in self.group_source2inds[group][source]:
                    # Map the within-group index back to the dataset-local
                    # index, then offset into the ConcatDataset's global
                    # index space.
                    idx = self.group2inds_per_source[source][group][
                        idx] + self.cumulative_sizes[source]
                    batch_buffer_per_source.append(idx)
                    if len(batch_buffer_per_source) == num:
                        batch_buffer += batch_buffer_per_source
                        break
            yield from batch_buffer
            batch_buffer = []
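

# A minimal usage sketch for ``GroupMultiSourceSampler`` (assumptions as in
# the sketch above; the hypothetical ``DummyDataset`` must also implement
# ``get_data_info`` returning 'width' and 'height' per item):
#
# >>> concat = ConcatDataset([DummyDataset(num=10), DummyDataset(num=40)])
# >>> sampler = GroupMultiSourceSampler(
# ...     concat, batch_size=5, source_ratio=[1, 4])
# >>> batch = list(itertools.islice(iter(sampler), 5))
# >>> # All 5 indices point at images from the same aspect-ratio group.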