multi_source_sampler.py

# Copyright (c) OpenMMLab. All rights reserved.
import itertools
from typing import Iterator, List, Optional, Sized, Union

import numpy as np
import torch
from mmengine.dataset import BaseDataset
from mmengine.dist import get_dist_info, sync_random_seed
from torch.utils.data import Sampler

from mmdet.registry import DATA_SAMPLERS


@DATA_SAMPLERS.register_module()
class MultiSourceSampler(Sampler):
    r"""Multi-Source Infinite Sampler.

    According to the sampling ratio, sample data from different
    datasets to form batches.

    Args:
        dataset (Sized): The dataset.
        batch_size (int): Size of mini-batch.
        source_ratio (list[int | float]): The sampling ratio of different
            source datasets in a mini-batch.
        shuffle (bool): Whether to shuffle the dataset or not.
            Defaults to True.
        seed (int, optional): Random seed. If None, set a random seed.
            Defaults to None.

    Examples:
        >>> dataset_type = 'ConcatDataset'
        >>> sub_dataset_type = 'CocoDataset'
        >>> data_root = 'data/coco/'
        >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json'
        >>> unsup_ann = '../coco_semi_annos/' \
        >>>     'instances_train2017.1@10-unlabeled.json'
        >>> dataset = dict(type=dataset_type,
        >>>     datasets=[
        >>>         dict(
        >>>             type=sub_dataset_type,
        >>>             data_root=data_root,
        >>>             ann_file=sup_ann,
        >>>             data_prefix=dict(img='train2017/'),
        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
        >>>             pipeline=sup_pipeline),
        >>>         dict(
        >>>             type=sub_dataset_type,
        >>>             data_root=data_root,
        >>>             ann_file=unsup_ann,
        >>>             data_prefix=dict(img='train2017/'),
        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
        >>>             pipeline=unsup_pipeline),
        >>>     ])
        >>> train_dataloader = dict(
        >>>     batch_size=5,
        >>>     num_workers=5,
        >>>     persistent_workers=True,
        >>>     sampler=dict(type='MultiSourceSampler',
        >>>                  batch_size=5, source_ratio=[1, 4]),
        >>>     batch_sampler=None,
        >>>     dataset=dataset)
    """
    def __init__(self,
                 dataset: Sized,
                 batch_size: int,
                 source_ratio: List[Union[int, float]],
                 shuffle: bool = True,
                 seed: Optional[int] = None) -> None:

        assert hasattr(dataset, 'cumulative_sizes'), \
            f'The dataset must be a ConcatDataset, but got {dataset}'
        assert isinstance(batch_size, int) and batch_size > 0, \
            'batch_size must be a positive integer value, ' \
            f'but got batch_size={batch_size}'
        assert isinstance(source_ratio, list), \
            f'source_ratio must be a list, but got source_ratio={source_ratio}'
        assert len(source_ratio) == len(dataset.cumulative_sizes), \
            'The length of source_ratio must be equal to ' \
            f'the number of datasets, but got source_ratio={source_ratio}'

        rank, world_size = get_dist_info()
        self.rank = rank
        self.world_size = world_size

        self.dataset = dataset
        self.cumulative_sizes = [0] + dataset.cumulative_sizes
        self.batch_size = batch_size
        self.source_ratio = source_ratio
        # Split the batch among sources proportionally to source_ratio;
        # source 0 absorbs the remainder left over by int() truncation.
        self.num_per_source = [
            int(batch_size * sr / sum(source_ratio)) for sr in source_ratio
        ]
        self.num_per_source[0] = batch_size - sum(self.num_per_source[1:])

        assert sum(self.num_per_source) == batch_size, \
            'The sum of num_per_source must be equal to ' \
            f'batch_size, but got {self.num_per_source}'
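        # Hedged worked example (values assumed, not taken from a config):
        # batch_size=5, source_ratio=[1, 4] gives num_per_source=[1, 4].
        # With batch_size=4, source_ratio=[1, 2], truncation first gives
        # [1, 2]; the top-up line above then sets source 0 to 4 - 2 = 2,
        # so the per-source counts always sum to the exact batch size.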
        self.seed = sync_random_seed() if seed is None else seed
        self.shuffle = shuffle
        # One infinite, rank-sharded index stream per source dataset.
        self.source2inds = {
            source: self._indices_of_rank(len(ds))
            for source, ds in enumerate(dataset.datasets)
        }
    def _infinite_indices(self, sample_size: int) -> Iterator[int]:
        """Infinitely yield a sequence of indices."""
        g = torch.Generator()
        g.manual_seed(self.seed)
        while True:
            if self.shuffle:
                yield from torch.randperm(sample_size, generator=g).tolist()
            else:
                yield from torch.arange(sample_size).tolist()

    def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
        """Slice the infinite indices by rank."""
        yield from itertools.islice(
            self._infinite_indices(sample_size), self.rank, None,
            self.world_size)
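    # Sketch of the rank sharding above (world sizes assumed for
    # illustration): with world_size=2, islice(stream, rank, None, 2)
    # hands rank 0 the indices at positions 0, 2, 4, ... of the shuffled
    # stream and rank 1 those at positions 1, 3, 5, ..., so the ranks
    # draw disjoint samples from each pass over a source dataset.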
    def __iter__(self) -> Iterator[int]:
        batch_buffer = []
        while True:
            for source, num in enumerate(self.num_per_source):
                batch_buffer_per_source = []
                for idx in self.source2inds[source]:
                    # Shift the dataset-local index into the
                    # ConcatDataset's global index space.
                    idx += self.cumulative_sizes[source]
                    batch_buffer_per_source.append(idx)
                    if len(batch_buffer_per_source) == num:
                        batch_buffer += batch_buffer_per_source
                        break
            yield from batch_buffer
            batch_buffer = []

    def __len__(self) -> int:
        return len(self.dataset)

    def set_epoch(self, epoch: int) -> None:
        """Not supported in epoch-based runners."""
        pass
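
# A minimal usage sketch for MultiSourceSampler (the names `labeled` and
# `unlabeled` are assumed placeholders, not part of this module):
#
#     concat = ConcatDataset(datasets=[labeled, unlabeled])
#     sampler = MultiSourceSampler(
#         dataset=concat, batch_size=5, source_ratio=[1, 4])
#     loader = DataLoader(concat, batch_size=5, sampler=sampler)
#
# Every mini-batch of 5 then holds 1 sample from `labeled` and 4 from
# `unlabeled`, matching the semi-supervised config in the class
# docstring above.
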
@DATA_SAMPLERS.register_module()
class GroupMultiSourceSampler(MultiSourceSampler):
    r"""Group Multi-Source Infinite Sampler.

    According to the sampling ratio, sample data from different
    datasets but the same group to form batches.

    Args:
        dataset (Sized): The dataset.
        batch_size (int): Size of mini-batch.
        source_ratio (list[int | float]): The sampling ratio of different
            source datasets in a mini-batch.
        shuffle (bool): Whether to shuffle the dataset or not.
            Defaults to True.
        seed (int, optional): Random seed. If None, set a random seed.
            Defaults to None.
    """

    def __init__(self,
                 dataset: BaseDataset,
                 batch_size: int,
                 source_ratio: List[Union[int, float]],
                 shuffle: bool = True,
                 seed: Optional[int] = None) -> None:
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            source_ratio=source_ratio,
            shuffle=shuffle,
            seed=seed)

        self._get_source_group_info()
        # One rank-sharded index stream per (group, source) pair.
        self.group_source2inds = [{
            source:
            self._indices_of_rank(self.group2size_per_source[source][group])
            for source in range(len(dataset.datasets))
        } for group in range(len(self.group_ratio))]
    def _get_source_group_info(self) -> None:
        # Two aspect-ratio groups per source: group 0 holds portrait
        # images (width < height), group 1 holds landscape ones.
        self.group2size_per_source = [{0: 0, 1: 0}, {0: 0, 1: 0}]
        self.group2inds_per_source = [{0: [], 1: []}, {0: [], 1: []}]
        for source, dataset in enumerate(self.dataset.datasets):
            for idx in range(len(dataset)):
                data_info = dataset.get_data_info(idx)
                width, height = data_info['width'], data_info['height']
                group = 0 if width < height else 1
                self.group2size_per_source[source][group] += 1
                self.group2inds_per_source[source][group].append(idx)

        self.group_sizes = np.zeros(2, dtype=np.int64)
        for group2size in self.group2size_per_source:
            for group, size in group2size.items():
                self.group_sizes[group] += size
        self.group_ratio = self.group_sizes / sum(self.group_sizes)
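    # Hedged worked example (the counts are assumed): if the sources
    # together hold 30 portrait and 70 landscape images, group_sizes is
    # [30, 70] and group_ratio is [0.3, 0.7], so __iter__ below emits a
    # portrait batch with probability 0.3 and a landscape one with 0.7.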

    def __iter__(self) -> Iterator[int]:
        batch_buffer = []
        while True:
            group = np.random.choice(
                list(range(len(self.group_ratio))), p=self.group_ratio)
            for source, num in enumerate(self.num_per_source):
                batch_buffer_per_source = []
                for idx in self.group_source2inds[group][source]:
                    # Map the within-group position back to a
                    # dataset-local index, then shift it into the
                    # ConcatDataset's global index space.
                    idx = self.group2inds_per_source[source][group][
                        idx] + self.cumulative_sizes[source]
                    batch_buffer_per_source.append(idx)
                    if len(batch_buffer_per_source) == num:
                        batch_buffer += batch_buffer_per_source
                        break
            yield from batch_buffer
            batch_buffer = []
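

if __name__ == '__main__':
    # Minimal self-check sketch, not part of the upstream module. The toy
    # _FakeConcat class is an assumption standing in for mmengine's
    # ConcatDataset: it provides only the `datasets` and
    # `cumulative_sizes` attributes that MultiSourceSampler touches.
    class _FakeConcat:

        def __init__(self, sizes):
            self.datasets = [list(range(s)) for s in sizes]
            self.cumulative_sizes = list(itertools.accumulate(sizes))

        def __len__(self):
            return self.cumulative_sizes[-1]

    toy = _FakeConcat([10, 40])
    sampler = MultiSourceSampler(
        dataset=toy, batch_size=5, source_ratio=[1, 4], seed=0)
    # The first batch should hold 1 index from source 0 (< 10) and
    # 4 indices from source 1 (in [10, 50)).
    print(list(itertools.islice(iter(sampler), 5)))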