# fast_partial.py
  1. #! /usr/bin/env python
  2. # encoding: utf-8
  3. # Thomas Nagy, 2017-2018 (ita)
  4. """
  5. A system for fast partial rebuilds
  6. Creating a large amount of task objects up front can take some time.
By making a few assumptions, it is possible to avoid creating
task objects for targets that are already up-to-date.
  9. On a silly benchmark the gain observed for 1M tasks can be 5m->10s
  10. for a single file change.
  11. Usage::
  12. def options(opt):
  13. opt.load('fast_partial')
Assumptions:
  15. * Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled)
  16. * For full project builds: no --targets and no pruning from subfolders
  17. * The installation phase is ignored
  18. * `use=` dependencies are specified up front even across build groups
  19. * Task generator source files are not obtained from globs
  20. Implementation details:
  21. * The first layer obtains file timestamps to recalculate file hashes only
  22. when necessary (similar to md5_tstamp); the timestamps are then stored
  23. in a dedicated pickle file
  24. * A second layer associates each task generator to a file set to help
  25. detecting changes. Task generators are to create their tasks only when
  26. the related files have been modified. A specific db file is created
  27. to store such data (5m -> 1m10)
  28. * A third layer binds build context proxies onto task generators, replacing
  29. the default context. While loading data for the full build uses more memory
  30. (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
  31. * A fourth layer enables a 2-level cache on file signatures to
  32. reduce the size of the main pickle file (13s -> 10s)
  33. """
  34. import os
  35. from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
  36. from waflib.TaskGen import feature, after_method, taskgen_method
  37. import waflib.Node
  38. DONE = 0
  39. DIRTY = 1
  40. NEEDED = 2
  41. SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']
  42. TSTAMP_DB = '.wafpickle_tstamp_db_file'
  43. SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()
  44. class bld_proxy(object):
  45. def __init__(self, bld):
  46. object.__setattr__(self, 'bld', bld)
  47. object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
  48. self.node_class.__module__ = 'waflib.Node'
  49. self.node_class.ctx = self
  50. object.__setattr__(self, 'root', self.node_class('', None))
  51. for x in SAVED_ATTRS:
  52. if x != 'root':
  53. object.__setattr__(self, x, {})
  54. self.fix_nodes()
  55. def __setattr__(self, name, value):
  56. bld = object.__getattribute__(self, 'bld')
  57. setattr(bld, name, value)
  58. def __delattr__(self, name):
  59. bld = object.__getattribute__(self, 'bld')
  60. delattr(bld, name)
  61. def __getattribute__(self, name):
  62. try:
  63. return object.__getattribute__(self, name)
  64. except AttributeError:
  65. bld = object.__getattribute__(self, 'bld')
  66. return getattr(bld, name)
  67. def __call__(self, *k, **kw):
  68. return self.bld(*k, **kw)
  69. def fix_nodes(self):
  70. for x in ('srcnode', 'path', 'bldnode'):
  71. node = self.root.find_dir(getattr(self.bld, x).abspath())
  72. object.__setattr__(self, x, node)
  73. def set_key(self, store_key):
  74. object.__setattr__(self, 'store_key', store_key)
  75. def fix_tg_path(self, *tgs):
  76. # changing Node objects on task generators is possible
  77. # yet, all Node objects must belong to the same parent
  78. for tg in tgs:
  79. tg.path = self.root.make_node(tg.path.abspath())
  80. def restore(self):
  81. dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
  82. Logs.debug('rev_use: reading %s', dbfn)
  83. try:
  84. data = Utils.readf(dbfn, 'rb')
  85. except (EnvironmentError, EOFError):
  86. # handle missing file/empty file
  87. Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
  88. else:
  89. try:
  90. waflib.Node.pickle_lock.acquire()
  91. waflib.Node.Nod3 = self.node_class
  92. try:
  93. data = Build.cPickle.loads(data)
  94. except Exception as e:
  95. Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
  96. else:
  97. for x in SAVED_ATTRS:
  98. object.__setattr__(self, x, data.get(x, {}))
  99. finally:
  100. waflib.Node.pickle_lock.release()
  101. self.fix_nodes()
  102. def store(self):
  103. data = {}
  104. for x in Build.SAVED_ATTRS:
  105. data[x] = getattr(self, x)
  106. db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
  107. try:
  108. waflib.Node.pickle_lock.acquire()
  109. waflib.Node.Nod3 = self.node_class
  110. x = Build.cPickle.dumps(data, Build.PROTOCOL)
  111. finally:
  112. waflib.Node.pickle_lock.release()
  113. Logs.debug('rev_use: storing %s', db)
  114. Utils.writef(db + '.tmp', x, m='wb')
  115. try:
  116. st = os.stat(db)
  117. os.remove(db)
  118. if not Utils.is_win32:
  119. os.chown(db + '.tmp', st.st_uid, st.st_gid)
  120. except (AttributeError, OSError):
  121. pass
  122. os.rename(db + '.tmp', db)
  123. class bld(Build.BuildContext):
  124. def __init__(self, **kw):
  125. super(bld, self).__init__(**kw)
  126. self.hashes_md5_tstamp = {}
  127. def __call__(self, *k, **kw):
  128. # this is one way of doing it, one could use a task generator method too
  129. bld = kw['bld'] = bld_proxy(self)
  130. ret = TaskGen.task_gen(*k, **kw)
  131. self.task_gen_cache_names = {}
  132. self.add_to_group(ret, group=kw.get('group'))
  133. ret.bld = bld
  134. bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
  135. return ret
  136. def is_dirty(self):
  137. return True
  138. def store_tstamps(self):
  139. # Called after a build is finished
  140. # For each task generator, record all files involved in task objects
  141. # optimization: done only if there was something built
  142. do_store = False
  143. try:
  144. f_deps = self.f_deps
  145. except AttributeError:
  146. f_deps = self.f_deps = {}
  147. self.f_tstamps = {}
  148. allfiles = set()
  149. for g in self.groups:
  150. for tg in g:
  151. try:
  152. staleness = tg.staleness
  153. except AttributeError:
  154. staleness = DIRTY
  155. if staleness != DIRTY:
  156. # DONE case: there was nothing built
  157. # NEEDED case: the tg was brought in because of 'use' propagation
  158. # but nothing really changed for them, there may be incomplete
  159. # tasks (object files) and in this case it is best to let the next build
  160. # figure out if an input/output file changed
  161. continue
  162. do_cache = False
  163. for tsk in tg.tasks:
  164. if tsk.hasrun == Task.SUCCESS:
  165. do_cache = True
  166. pass
  167. elif tsk.hasrun == Task.SKIPPED:
  168. pass
  169. else:
  170. # one failed task, clear the cache for this tg
  171. try:
  172. del f_deps[(tg.path.abspath(), tg.idx)]
  173. except KeyError:
  174. pass
  175. else:
  176. # just store the new state because there is a change
  177. do_store = True
  178. # skip the rest because there is no valid cache possible
  179. break
  180. else:
  181. if not do_cache:
  182. # all skipped, but is there anything in cache?
  183. try:
  184. f_deps[(tg.path.abspath(), tg.idx)]
  185. except KeyError:
  186. # probably cleared because a wscript file changed
  187. # store it
  188. do_cache = True
  189. if do_cache:
  190. # there was a rebuild, store the data structure too
  191. tg.bld.store()
  192. # all tasks skipped but no cache
  193. # or a successful task build
  194. do_store = True
  195. st = set()
  196. for tsk in tg.tasks:
  197. st.update(tsk.inputs)
  198. st.update(self.node_deps.get(tsk.uid(), []))
  199. # TODO do last/when loading the tgs?
  200. lst = []
  201. for k in ('wscript', 'wscript_build'):
  202. n = tg.path.find_node(k)
  203. if n:
  204. n.get_bld_sig()
  205. lst.append(n.abspath())
  206. lst.extend(sorted(x.abspath() for x in st))
  207. allfiles.update(lst)
  208. f_deps[(tg.path.abspath(), tg.idx)] = lst
  209. for x in allfiles:
  210. # f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
  211. self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]
  212. if do_store:
  213. dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
  214. Logs.debug('rev_use: storing %s', dbfn)
  215. dbfn_tmp = dbfn + '.tmp'
  216. x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
  217. Utils.writef(dbfn_tmp, x, m='wb')
  218. os.rename(dbfn_tmp, dbfn)
  219. Logs.debug('rev_use: stored %s', dbfn)
  220. def store(self):
  221. self.store_tstamps()
  222. if self.producer.dirty:
  223. Build.BuildContext.store(self)
  224. def compute_needed_tgs(self):
  225. # assume the 'use' keys are not modified during the build phase
  226. dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
  227. Logs.debug('rev_use: Loading %s', dbfn)
  228. try:
  229. data = Utils.readf(dbfn, 'rb')
  230. except (EnvironmentError, EOFError):
  231. Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
  232. self.f_deps = {}
  233. self.f_tstamps = {}
  234. else:
  235. try:
  236. self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
  237. except Exception as e:
  238. Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
  239. self.f_deps = {}
  240. self.f_tstamps = {}
  241. else:
  242. Logs.debug('rev_use: Loaded %s', dbfn)
  243. # 1. obtain task generators that contain rebuilds
  244. # 2. obtain the 'use' graph and its dual
  245. stales = set()
  246. reverse_use_map = Utils.defaultdict(list)
  247. use_map = Utils.defaultdict(list)
  248. for g in self.groups:
  249. for tg in g:
  250. if tg.is_stale():
  251. stales.add(tg)
  252. try:
  253. lst = tg.use = Utils.to_list(tg.use)
  254. except AttributeError:
  255. pass
  256. else:
  257. for x in lst:
  258. try:
  259. xtg = self.get_tgen_by_name(x)
  260. except Errors.WafError:
  261. pass
  262. else:
  263. use_map[tg].append(xtg)
  264. reverse_use_map[xtg].append(tg)
  265. Logs.debug('rev_use: found %r stale tgs', len(stales))
  266. # 3. dfs to post downstream tg as stale
  267. visited = set()
  268. def mark_down(tg):
  269. if tg in visited:
  270. return
  271. visited.add(tg)
  272. Logs.debug('rev_use: marking down %r as stale', tg.name)
  273. tg.staleness = DIRTY
  274. for x in reverse_use_map[tg]:
  275. mark_down(x)
  276. for tg in stales:
  277. mark_down(tg)
  278. # 4. dfs to find ancestors tg to mark as needed
  279. self.needed_tgs = needed_tgs = set()
  280. def mark_needed(tg):
  281. if tg in needed_tgs:
  282. return
  283. needed_tgs.add(tg)
  284. if tg.staleness == DONE:
  285. Logs.debug('rev_use: marking up %r as needed', tg.name)
  286. tg.staleness = NEEDED
  287. for x in use_map[tg]:
  288. mark_needed(x)
  289. for xx in visited:
  290. mark_needed(xx)
  291. # so we have the whole tg trees to post in the set "needed"
  292. # load their build trees
  293. for tg in needed_tgs:
  294. tg.bld.restore()
  295. tg.bld.fix_tg_path(tg)
  296. # the stale ones should be fully build, while the needed ones
  297. # may skip a few tasks, see create_compiled_task and apply_link_after below
  298. Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))
  299. def post_group(self):
  300. # assumption: we can ignore the folder/subfolders cuts
  301. def tgpost(tg):
  302. try:
  303. f = tg.post
  304. except AttributeError:
  305. pass
  306. else:
  307. f()
  308. if not self.targets or self.targets == '*':
  309. for tg in self.groups[self.current_group]:
  310. # this can cut quite a lot of tg objects
  311. if tg in self.needed_tgs:
  312. tgpost(tg)
  313. else:
  314. # default implementation
  315. return Build.BuildContext.post_group()
  316. def get_build_iterator(self):
  317. if not self.targets or self.targets == '*':
  318. self.compute_needed_tgs()
  319. return Build.BuildContext.get_build_iterator(self)
  320. @taskgen_method
  321. def is_stale(self):
  322. # assume no globs
  323. self.staleness = DIRTY
  324. # 1. the case of always stale targets
  325. if getattr(self, 'always_stale', False):
  326. return True
  327. # 2. check if the db file exists
  328. db = os.path.join(self.bld.variant_dir, Context.DBFILE)
  329. try:
  330. dbstat = os.stat(db).st_mtime
  331. except OSError:
  332. Logs.debug('rev_use: must post %r because this is a clean build')
  333. return True
  334. # 3. check if the configuration changed
  335. if os.stat(self.bld.bldnode.find_node('c4che/build.config.py').abspath()).st_mtime > dbstat:
  336. Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
  337. return True
  338. # 3.a any tstamp data?
  339. try:
  340. f_deps = self.bld.f_deps
  341. except AttributeError:
  342. Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
  343. return True
  344. # 4. check if this is the first build (no cache)
  345. try:
  346. lst = f_deps[(self.path.abspath(), self.idx)]
  347. except KeyError:
  348. Logs.debug('rev_use: must post %r because there it has no cached data', self.name)
  349. return True
  350. try:
  351. cache = self.bld.cache_tstamp_rev_use
  352. except AttributeError:
  353. cache = self.bld.cache_tstamp_rev_use = {}
  354. # 5. check the timestamp of each dependency files listed is unchanged
  355. f_tstamps = self.bld.f_tstamps
  356. for x in lst:
  357. try:
  358. old_ts = f_tstamps[x]
  359. except KeyError:
  360. Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
  361. return True
  362. try:
  363. try:
  364. ts = cache[x]
  365. except KeyError:
  366. ts = cache[x] = os.stat(x).st_mtime
  367. except OSError:
  368. del f_deps[(self.path.abspath(), self.idx)]
  369. Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
  370. return True
  371. else:
  372. if ts != old_ts:
  373. Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
  374. return True
  375. self.staleness = DONE
  376. return False
  377. @taskgen_method
  378. def create_compiled_task(self, name, node):
  379. # skip the creation of object files
  380. # assumption: object-only targets are not skippable
  381. if self.staleness == NEEDED:
  382. # only libraries/programs can skip object files
  383. for x in SKIPPABLE:
  384. if x in self.features:
  385. return None
  386. out = '%s.%d.o' % (node.name, self.idx)
  387. task = self.create_task(name, node, node.parent.find_or_declare(out))
  388. try:
  389. self.compiled_tasks.append(task)
  390. except AttributeError:
  391. self.compiled_tasks = [task]
  392. return task
  393. @feature(*SKIPPABLE)
  394. @after_method('apply_link')
  395. def apply_link_after(self):
  396. # cprogram/cxxprogram might be unnecessary
  397. if self.staleness != NEEDED:
  398. return
  399. for tsk in self.tasks:
  400. tsk.hasrun = Task.SKIPPED
  401. def path_from(self, node):
  402. # handle nodes of distinct types
  403. if node.ctx is not self.ctx:
  404. node = self.ctx.root.make_node(node.abspath())
  405. return self.default_path_from(node)
  406. waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
  407. waflib.Node.Node.path_from = path_from
  408. def h_file(self):
  409. # similar to md5_tstamp.py, but with 2-layer cache
  410. # global_cache for the build context common for all task generators
  411. # local_cache for the build context proxy (one by task generator)
  412. #
  413. # the global cache is not persistent
  414. # the local cache is persistent and meant for partial builds
  415. #
  416. # assume all calls are made from a single thread
  417. #
  418. filename = self.abspath()
  419. st = os.stat(filename)
  420. global_cache = self.ctx.bld.hashes_md5_tstamp
  421. local_cache = self.ctx.hashes_md5_tstamp
  422. if filename in global_cache:
  423. # value already calculated in this build
  424. cval = global_cache[filename]
  425. # the value in global cache is assumed to be calculated once
  426. # reverifying it could cause task generators
  427. # to get distinct tstamp values, thus missing rebuilds
  428. local_cache[filename] = cval
  429. return cval[1]
  430. if filename in local_cache:
  431. cval = local_cache[filename]
  432. if cval[0] == st.st_mtime:
  433. # correct value from a previous build
  434. # put it in the global cache
  435. global_cache[filename] = cval
  436. return cval[1]
  437. ret = Utils.h_file(filename)
  438. local_cache[filename] = global_cache[filename] = (st.st_mtime, ret)
  439. return ret
  440. waflib.Node.Node.h_file = h_file