# convert_train_benchmark_script.py
  1. # Copyright (c) OpenMMLab. All rights reserved.
  2. import argparse
  3. import os
  4. import os.path as osp
  5. def parse_args():
  6. parser = argparse.ArgumentParser(
  7. description='Convert benchmark model json to script')
  8. parser.add_argument(
  9. 'txt_path', type=str, help='txt path output by benchmark_filter')
  10. parser.add_argument(
  11. '--run', action='store_true', help='run script directly')
  12. parser.add_argument(
  13. '--out', type=str, help='path to save model benchmark script')
  14. args = parser.parse_args()
  15. return args
  16. def determine_gpus(cfg_name):
  17. gpus = 8
  18. gpus_pre_node = 8
  19. if cfg_name.find('16x') >= 0:
  20. gpus = 16
  21. elif cfg_name.find('4xb4') >= 0:
  22. gpus = 4
  23. gpus_pre_node = 4
  24. elif 'lad' in cfg_name:
  25. gpus = 2
  26. gpus_pre_node = 2
  27. return gpus, gpus_pre_node
  28. def main():
  29. args = parse_args()
  30. if args.out:
  31. out_suffix = args.out.split('.')[-1]
  32. assert args.out.endswith('.sh'), \
  33. f'Expected out file path suffix is .sh, but get .{out_suffix}'
  34. assert args.out or args.run, \
  35. ('Please specify at least one operation (save/run/ the '
  36. 'script) with the argument "--out" or "--run"')
  37. root_name = './tools'
  38. train_script_name = osp.join(root_name, 'slurm_train.sh')
  39. commands = []
  40. partition_name = 'PARTITION=$1 '
  41. commands.append(partition_name)
  42. commands.append('\n')
  43. work_dir = 'WORK_DIR=$2 '
  44. commands.append(work_dir)
  45. commands.append('\n')
  46. cpus_pre_task = 'CPUS_PER_TASK=${3:-4} '
  47. commands.append(cpus_pre_task)
  48. commands.append('\n')
  49. commands.append('\n')
  50. with open(args.txt_path, 'r') as f:
  51. model_cfgs = f.readlines()
  52. for i, cfg in enumerate(model_cfgs):
  53. cfg = cfg.strip()
  54. if len(cfg) == 0:
  55. continue
  56. # print cfg name
  57. echo_info = f'echo \'{cfg}\' &'
  58. commands.append(echo_info)
  59. commands.append('\n')
  60. fname, _ = osp.splitext(osp.basename(cfg))
  61. out_fname = '$WORK_DIR/' + fname
  62. gpus, gpus_pre_node = determine_gpus(cfg)
  63. command_info = f'GPUS={gpus} GPUS_PER_NODE={gpus_pre_node} ' \
  64. f'CPUS_PER_TASK=$CPUS_PRE_TASK {train_script_name} '
  65. command_info += '$PARTITION '
  66. command_info += f'{fname} '
  67. command_info += f'{cfg} '
  68. command_info += f'{out_fname} '
  69. command_info += '--cfg-options default_hooks.checkpoint.' \
  70. 'max_keep_ckpts=1 '
  71. command_info += '&'
  72. commands.append(command_info)
  73. if i < len(model_cfgs):
  74. commands.append('\n')
  75. command_str = ''.join(commands)
  76. if args.out:
  77. with open(args.out, 'w') as f:
  78. f.write(command_str)
  79. if args.run:
  80. os.system(command_str)
  81. if __name__ == '__main__':
  82. main()