# check_links.py
  1. # Modified from:
  2. # https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
  3. import argparse
  4. import logging
  5. import os
  6. import pathlib
  7. import re
  8. import sys
  9. from multiprocessing.dummy import Pool
  10. from typing import NamedTuple, Optional, Tuple
  11. import requests
  12. from mmengine.logging import MMLogger
  13. def parse_args():
  14. parser = argparse.ArgumentParser(
  15. description='Goes through all the inline-links '
  16. 'in markdown files and reports the breakages')
  17. parser.add_argument(
  18. '--num-threads',
  19. type=int,
  20. default=100,
  21. help='Number of processes to confirm the link')
  22. parser.add_argument('--https-proxy', type=str, help='https proxy')
  23. parser.add_argument(
  24. '--out',
  25. type=str,
  26. default='link_reports.txt',
  27. help='output path of reports')
  28. args = parser.parse_args()
  29. return args
  30. OK_STATUS_CODES = (
  31. 200,
  32. 401, # the resource exists but may require some sort of login.
  33. 403, # ^ same
  34. 405, # HEAD method not allowed.
  35. # the resource exists, but our default 'Accept-' header may not
  36. # match what the server can provide.
  37. 406,
  38. )
  39. class MatchTuple(NamedTuple):
  40. source: str
  41. name: str
  42. link: str
  43. def check_link(
  44. match_tuple: MatchTuple,
  45. http_session: requests.Session,
  46. logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
  47. reason: Optional[str] = None
  48. if match_tuple.link.startswith('http'):
  49. result_ok, reason = check_url(match_tuple, http_session)
  50. else:
  51. result_ok = check_path(match_tuple)
  52. if logger is None:
  53. print(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
  54. else:
  55. logger.info(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
  56. return match_tuple, result_ok, reason
  57. def check_url(match_tuple: MatchTuple,
  58. http_session: requests.Session) -> Tuple[bool, str]:
  59. """Check if a URL is reachable."""
  60. try:
  61. result = http_session.head(
  62. match_tuple.link, timeout=5, allow_redirects=True)
  63. return (
  64. result.ok or result.status_code in OK_STATUS_CODES,
  65. f'status code = {result.status_code}',
  66. )
  67. except (requests.ConnectionError, requests.Timeout):
  68. return False, 'connection error'
  69. def check_path(match_tuple: MatchTuple) -> bool:
  70. """Check if a file in this repository exists."""
  71. relative_path = match_tuple.link.split('#')[0]
  72. full_path = os.path.join(
  73. os.path.dirname(str(match_tuple.source)), relative_path)
  74. return os.path.exists(full_path)
  75. def main():
  76. args = parse_args()
  77. # setup logger
  78. logger = MMLogger.get_instance(name='mmdet', log_file=args.out)
  79. # setup https_proxy
  80. if args.https_proxy:
  81. os.environ['https_proxy'] = args.https_proxy
  82. # setup http_session
  83. http_session = requests.Session()
  84. for resource_prefix in ('http://', 'https://'):
  85. http_session.mount(
  86. resource_prefix,
  87. requests.adapters.HTTPAdapter(
  88. max_retries=5,
  89. pool_connections=20,
  90. pool_maxsize=args.num_threads),
  91. )
  92. logger.info('Finding all markdown files in the current directory...')
  93. project_root = (pathlib.Path(__file__).parent / '..').resolve()
  94. markdown_files = project_root.glob('**/*.md')
  95. all_matches = set()
  96. url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
  97. for markdown_file in markdown_files:
  98. with open(markdown_file) as handle:
  99. for line in handle.readlines():
  100. matches = url_regex.findall(line)
  101. for name, link in matches:
  102. if 'localhost' not in link:
  103. all_matches.add(
  104. MatchTuple(
  105. source=str(markdown_file),
  106. name=name,
  107. link=link))
  108. logger.info(f' {len(all_matches)} markdown files found')
  109. logger.info('Checking to make sure we can retrieve each link...')
  110. with Pool(processes=args.num_threads) as pool:
  111. results = pool.starmap(check_link, [(match, http_session, logger)
  112. for match in list(all_matches)])
  113. # collect unreachable results
  114. unreachable_results = [(match_tuple, reason)
  115. for match_tuple, success, reason in results
  116. if not success]
  117. if unreachable_results:
  118. logger.info('================================================')
  119. logger.info(f'Unreachable links ({len(unreachable_results)}):')
  120. for match_tuple, reason in unreachable_results:
  121. logger.info(' > Source: ' + match_tuple.source)
  122. logger.info(' Name: ' + match_tuple.name)
  123. logger.info(' Link: ' + match_tuple.link)
  124. if reason is not None:
  125. logger.info(' Reason: ' + reason)
  126. sys.exit(1)
  127. logger.info('No Unreachable link found.')
# Entry point: run the link check when executed as a script.
if __name__ == '__main__':
    main()