Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# (c) Stefan Countryman 2019 

2 

3""" 

4Classes for versioning files in a given directory. Currently implemented 

5with git. 

6""" 

7 

8import os 

9import shlex 

10import logging 

11import tempfile 

12import functools 

13from subprocess import Popen, PIPE 

14from collections import namedtuple 

15from llama.utils import GenerationError, bytes2str 

16from llama.classes import LOCAL_TMPDIR_NAME 

17 

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# The contents of GIT_IGNORE are written verbatim to the ``.gitignore`` of
# each eventdir when ``GitHandler.init`` runs: one ignore pattern per line,
# covering the local temporary directory (``LOCAL_TMPDIR_NAME``) and
# everything under ``.webcache/``.
GIT_IGNORE = f"""{LOCAL_TMPDIR_NAME}
.webcache/*
"""

23 

24 

class GitRepoUninitialized(ValueError):
    """
    Raised when a git operation is requested on a directory that has not
    been initialized as a git repository. Subclasses ``ValueError`` so that
    callers catching ``ValueError`` also catch this.
    """

30 

31 

class GitHandler(namedtuple('GitHandlerTuple', ('eventdir',))):
    """
    A class that performs ``git`` operations on an ``eventdir``.

    You can also call an instance as if it were a function to perform git
    commands conveniently; the interface is the same as ``subprocess.Popen``
    with ``cwd`` set to the ``GitHandler``'s ``eventdir`` (for convenience at
    the command line).

    Parameters
    ----------
    eventdir : str
        The path to the directory that the new ``GitHandler`` instance will
        manipulate.
    """

    def __call__(self, *args, stdout=PIPE, stderr=PIPE, stdin=PIPE, **kwargs):
        """Perform git commands conveniently; like calling ``subprocess.Popen``
        on ``['git']+args`` with ``cwd`` set to the ``GitHandler``'s
        ``eventdir`` (for convenience at the command line) and
        STDOUT/STDERR/STDIN set to ``PIPE``. Allows you to omit everything but
        the arguments you'd pass to ``git``.

        Parameters
        ----------
        *args
            Arguments to pass to ``git`` in the subprocess. Leave ``git`` out
            of this list as it is included automatically.
        **kwargs
            Extra keyword arguments to pass to ``subprocess.Popen``. ``cwd`` is
            set to the event directory (unless given explicitly) and
            ``stdout``, ``stderr``, and ``stdin`` are set to
            ``subprocess.PIPE`` by default.

        Returns
        -------
        proc : subprocess.Popen
            The subprocess object launched to make the git call.
        """
        kwargs.setdefault('cwd', self.eventdir)
        return Popen(['git', *args], stdout=stdout, stderr=stderr,
                     stdin=stdin, **kwargs)

    @staticmethod
    def _text(raw):
        """Decode captured subprocess output for use in log and error
        messages. Undecodable bytes are replaced rather than raising, so that
        building an error message can never mask the original failure."""
        if isinstance(raw, bytes):
            return raw.decode(errors='replace')
        return '' if raw is None else str(raw)

    def _require_repo(self):
        """Raise ``GitRepoUninitialized`` unless ``eventdir`` is a git
        repository (as determined by ``is_repo``)."""
        # ``is_repo`` is a method and must be *called*; testing the bound
        # method itself is always truthy and would disable this guard.
        if not self.is_repo():
            raise GitRepoUninitialized(
                "{} is not a git repository.".format(self.eventdir)
            )

    @property
    def eventid(self):
        """Parse an eventid from the eventdir by splitting off the basename."""
        return os.path.split(os.path.abspath(self.eventdir))[1]

    @property
    def current_hash(self):
        """Get the current git hash (``HEAD``) for this directory.

        Raises
        ------
        GenerationError
            If ``git rev-parse HEAD`` fails.
        """
        proc = self('rev-parse', 'HEAD')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not parse git version for eventdir "
                                   "{}.\nSTDOUT:\n{}\nSTDERR:\n{}"
                                   "\n").format(self.eventdir,
                                                self._text(res),
                                                self._text(err)))
        return res.decode().strip()

    def filename_for_download(self, filename, last_hash=None):
        """Get a filename that includes the ``eventid``, revision number, and
        version hash for ``filename`` (i.e. what version number this is in the
        version history; e.g. if three versions of *this* file exist in the
        version history, then this is version 3). If this ``filename`` does not
        appear in the git history, it will be marked 'v0' and the hash will be
        'UNVERSIONED'. The output format is ``eventid``, version, first 7
        digits of commit hash, and ``filename``, split by hyphens, so that the
        third version of ``skymap_info.json`` for event ``S1234a`` with git
        hash ``dedb33f`` would be called
        ``S1234a-v3-dedb33f-skymap_info.json``. Use this for file downloads or
        files sent to other services in order to facilitate data product
        tracking outside the highly-organized confines of a pipeline run
        directory."""
        ref = self.hashes(filename, last_hash=last_hash)
        # don't use the ``serial_version`` function; it will call ``git`` a
        # second time, which is a fairly expensive operation.
        return '{}-v{}-{}-{}'.format(self.eventid, len(ref),
                                     ref[0][:7] if ref else 'UNVERSIONED',
                                     filename)

    def serial_version(self, filename, last_hash=None):
        """The serial version of ``filename`` as stored in the version
        history. Note that this is merely a count of how many versions of the
        file exist in this history (up to and including ``last_hash``, if
        given); it is not an unambiguous label (in the same way that the hash
        value is). Use this for human interpretation. If the file does not
        exist in the history, this function returns ``0`` (unversioned), so it
        effectively starts at ``1``.

        Parameters
        ----------
        filename : str
            The relative path to the file from ``self.eventdir``.
        last_hash : str, optional
            Only count versions up to and including this (possibly partial)
            commit hash; see ``hashes``.
        """
        return len(self.hashes(filename, last_hash=last_hash))

    def copy_file(self, filename, outpath, commit_hash=None,
                  serial_version=None):
        """Write a copy of a file, optionally specifying a particular
        version of the file from this event's history, to the given outpath. If
        no version is specified with ``commit_hash`` or ``serial_version``, the
        version at ``HEAD`` will be copied.

        Parameters
        ----------
        filename : str
            The relative path to the file from ``self.eventdir`` (in most cases
            just the filename).
        outpath : str
            The path to the output file, or, if this path corresponds to an
            existing directory, the directory in which it should be saved (with
            the ``os.path.basename`` of ``filename``). **If the file exists, it
            will be overwritten without warning**.
        commit_hash : str, optional
            The commit hash, or partial commit hash containing the starting
            characters of the full hash (as long as enough characters are
            provided to disambiguate hashes), of the version of ``filename``
            that is to be copied to ``outpath``. **You can only specify one
            of** ``commit_hash`` **or** ``serial_version``.
        serial_version : int, optional
            The ``serial_version`` (i.e. the numbered version, starting at 1
            for the oldest) of ``filename`` to copy. This is potentially more
            ambiguous than using ``commit_hash``. **You can only specify one
            of** ``commit_hash`` **or** ``serial_version``.

        Returns
        -------
        outfile : str
            Path to the final output file.

        Raises
        ------
        GitRepoUninitialized
            If the event directory is not a git directory.
        ValueError
            If both ``commit_hash`` and ``serial_version`` are specified or if
            they do not correspond to available file versions.
        IOError
            If the ``outpath`` cannot be written to.
        FileNotFoundError
            If the file cannot be read from the requested commit.
        """
        self._require_repo()
        if (commit_hash is not None) and (serial_version is not None):
            raise ValueError("Must specify only one of ``commit_hash`` or "
                             "``serial_version``.")
        if commit_hash is not None:
            # resolve a (possibly partial) hash to the full hash
            matches = self.hashes(filename, last_hash=commit_hash)
            if not matches:
                raise ValueError(("No versions of {} exist in the history of "
                                  "{}.").format(filename, self.eventdir))
            commit_hash = matches[0]
        elif serial_version is not None:
            history = self.hashes(filename)  # newest first
            if not 1 <= serial_version <= len(history):
                raise ValueError(("``serial_version`` {} of {} does not "
                                  "exist; {} version(s) available in "
                                  "{}.").format(serial_version, filename,
                                                len(history), self.eventdir))
            # serial version 1 is the oldest commit, i.e. the last entry
            commit_hash = history[-serial_version]
        else:
            commit_hash = 'HEAD'
        outpath = os.path.abspath(outpath)
        if os.path.isdir(outpath):
            outpath = os.path.join(outpath, os.path.basename(filename))
        # Read the blob straight out of the object store. ``git checkout REV
        # -- PATH`` (even with a separate ``--work-tree``) also rewrites the
        # repository's index entry for PATH, which would leave the eventdir
        # looking dirty after copying out an old version.
        cmd = ['show', '{}:{}'.format(commit_hash, filename)]
        LOGGER.debug("Reading file with ``git %s`` from eventdir %s",
                     ' '.join(shlex.quote(w) for w in cmd), self.eventdir)
        proc = self(*cmd)
        res, _err = proc.communicate()
        if proc.returncode:
            raise FileNotFoundError("Checkout failed: ``{}``".format(cmd))
        LOGGER.debug("Writing %s at %s to final path %s", filename,
                     commit_hash, outpath)
        with open(outpath, 'wb') as outfile:
            outfile.write(res)
        return outpath

    def reset_hard(self, ref=None):
        """Hard reset the status of the branch to a given ref, losing all
        subsequent changes. If ``ref`` is not provided, reset to the last
        commit (``HEAD``).

        Raises
        ------
        GenerationError
            If ``git reset --hard`` fails.
        """
        ref = ref if ref is not None else 'HEAD'
        proc = self('reset', '--hard', ref)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not hard reset to {} for eventdir "
                                   "{}.\nSTDOUT:\n{}\nSTDERR:\n{}"
                                   "\n").format(ref, self.eventdir,
                                                self._text(res),
                                                self._text(err)))

    def init(self):
        """Initialize the ``eventdir`` as a git repository and (over)write its
        ``.gitignore`` with the contents of ``GIT_IGNORE``.

        Raises
        ------
        GenerationError
            If ``git init`` fails.
        """
        proc = self('init')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not initialize git repository for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n"
                                   "{}\n").format(self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))
        with open(os.path.join(self.eventdir, '.gitignore'), 'w') as out:
            out.write(GIT_IGNORE)

    def add(self, *files):
        """Run ``git add`` for all ``files``. Raises a ``GitRepoUninitialized``
        exception if not a git repository and a ``GenerationError`` if the
        ``git`` command fails."""
        self._require_repo()
        proc = self('add', *files)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git add`` files {} for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(files, self.eventdir,
                                              self._text(res),
                                              self._text(err)))

    def remove(self, *files):
        """Run ``git rm`` for all ``files`` and commit the removal. Raises a
        ``GitRepoUninitialized`` exception if not a git repository and a
        ``GenerationError`` if the ``git`` command fails."""
        self._require_repo()
        LOGGER.debug("``git rm`` %s in %s", files, self.eventdir)
        proc = self('rm', *files)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git rm`` files {} for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(files, self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        # labels match the streams: ``res`` is STDOUT, ``err`` is STDERR
        LOGGER.debug("git removed files %s, STDOUT:\n%s\nSTDERR:\n%s", files,
                     self._text(res), self._text(err))
        self.commit_changes("Removed {}".format(files))

    def commit_changes(self, message):
        """``git add`` all files in the ``eventdir`` and commit changes using
        ``message`` as the commit message. Raises a ``GitRepoUninitialized``
        exception if not a git repository. This will FAIL with a
        ``GenerationError`` if there are no new changes."""
        self._require_repo()
        self.add("-u")  # update. deleted files are also removed from git repo.
        self.add(".")   # add all files, modified or new, to the repo.
        LOGGER.debug("``git commit -m`` '%s' in %s", message, self.eventdir)
        proc = self('commit', '-m', message)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git commit`` with message {} "
                                   "for eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n"
                                   "{}\n").format(message, self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))

    def show_log(self, ref='HEAD'):
        """Return the output of ``git log -1`` (the commit message and
        metadata) for the given ``ref`` as a string. Raises a
        ``GitRepoUninitialized`` exception if not a git repository and a
        ``GenerationError`` if the ``git`` command fails."""
        self._require_repo()
        LOGGER.debug("``git log -1`` ref %s in %s", ref, self.eventdir)
        proc = self('log', '-1', ref)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Error showing log from {} in eventdir {}."
                                   "\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(ref, self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        return res.decode()

    def is_clean(self):
        """Return whether the ``eventdir`` is unchanged since the last commit
        (``True`` when ``git status --porcelain`` prints nothing). Raises a
        ``GitRepoUninitialized`` exception if not a git repository and a
        ``GenerationError`` if the ``git`` command fails."""
        self._require_repo()
        proc = self('status', '--porcelain')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not get git status in eventdir {}."
                                   "\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        return not res  # if response is empty, no changes

    def is_repo(self):
        """Check whether this event directory is a git repo by seeing if it
        contains a ``.git`` subdirectory. Returns a ``bool``; never raises
        ``GitRepoUninitialized`` itself. This is a method, so it must be
        called (``handler.is_repo()``)."""
        return os.path.isdir(os.path.join(self.eventdir, '.git'))

    def text_graph(self, *filenames, style='html'):
        """Return a text graph (``git log --graph``) of all files in the past
        history as a string.

        Parameters
        ----------
        *filenames : str, optional
            An arbitrary list of filenames that will be spliced onto the end of
            the argument list for ``git log``. Use this to narrow down the
            history shown. Use ``--`` to specify all files in the past history
            of the ``HEAD`` state.
        style : str, optional
            The format to put the output in. Options include 'html' (if this is
            going to go on a summary page).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        KeyError
            If ``style`` is not a known output style.
        ValueError
            If the ``git log`` command fails.
        """
        self._require_repo()
        fmt = {
            'html': (r'<a href="?hash=%H"><span class="lg2-bold-blue">'
                     r'%h</span></a> - '
                     r'<span class="lg2-bold-cyan">%aD</span> '
                     r'<span class="lg2-bold-green">(%ar)</span>%d%n'
                     r'          <span class="lg2-white">%s</span> '
                     r'<span class="lg2-dim-white">- %an</span>'),
        }
        # always separate paths from options with a leading ``--``
        if not filenames:
            filenames = ['--']
        if not filenames[0] == '--':
            filenames = ['--'] + list(filenames)
        proc = self(
            'log',
            '--graph',
            '--abbrev-commit',
            '--decorate',
            '--format='+fmt[style],
            *filenames,
        )
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get a text graph for filenames "
                              "{} in event directory {}.\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(filenames, self.eventdir,
                                                      self._text(res),
                                                      self._text(err)))
        return res.decode()

    def hashes(self, *filenames, pretty="", last_hash=None):
        """Get a list of full commit hashes for all commits related to the
        provided filenames, newest first. Returns an empty list if no
        filenames are provided or if the filename is not being tracked by git.

        Parameters
        ----------
        filenames : list
            Relative paths from the ``eventdir`` whose commits should be
            retrieved. Returns an empty list if no filenames are specified. To
            match all paths in the commit history, specify '--' as the only
            filename.
        pretty : str, optional
            The git format string specifying what to return for each commit. By
            default, only returns the git hash for each commit pertaining to
            the given ``filenames``. Must expand to a single line per commit.
        last_hash : str, optional
            If specified, only return hashes up to and including this hash;
            does not return hashes appearing topologically later than this one.
            This can be a partial hash containing only the starting characters
            of the full hash (e.g. the first 7 characters, as is typically seen
            elsewhere) as long as enough characters are provided to
            disambiguate the available hashes.

        Returns
        -------
        hashes : list
            A list of git checksums for the commits related to the specified
            filenames (or some other per-commit string whose contents are
            defined by ``pretty``).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If the command cannot be run with the given filenames in the given
            ``eventdir``.
        ValueError
            If the input ``last_hash`` is ambiguous (matches more than one hash)
            or if it matches no hashes.
        """
        self._require_repo()
        if not filenames:
            return []
        if filenames[0] != '--':
            filenames = ['--'] + list(filenames)
        pretty_fmt = '%H'
        if pretty:
            pretty_fmt += ' ' + pretty
        # No shell is involved, so quoting the format would put literal quote
        # characters in git's output; ``tformat:`` passes it through cleanly.
        cmd = ['log', '--pretty=tformat:{}'.format(pretty_fmt),
               '--topo-order', *filenames]
        proc = self(*cmd)
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get a list of hashes for filenames "
                              "{} in event directory {} with CLI arguments\n"
                              "{}\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(filenames, self.eventdir,
                                                      cmd, self._text(res),
                                                      self._text(err)))
        if not res:
            return []
        lines = [line.rstrip('\r')
                 for line in bytes2str(res).strip('\n').split('\n')]
        if pretty:
            # each line is "<full hash> <pretty text>"; the text may be empty
            parts = [line.partition(' ') for line in lines]
            hashes = [part[0] for part in parts]
            commits = [part[2] for part in parts]
        else:
            hashes = commits = lines
        if last_hash is not None:
            matches = [i for i, h in enumerate(hashes)
                       if h.startswith(last_hash)]
            if not matches:
                raise ValueError(("``last_hash`` {} matches none of the hashes "
                                  "({}) for {}").format(last_hash, hashes,
                                                        self))
            if len(matches) > 1:
                raise ValueError(("Ambiguous ``last_hash`` {} matches multiple "
                                  "hashes ({}) from full hash list ({}) for "
                                  "{}").format(last_hash, matches, hashes,
                                               self))
            # output is newest-first, so everything from the match onwards is
            # the match itself plus its predecessors
            commits = commits[matches[0]:]
        return commits

    def diff(self, *args):
        """Return the ``git diff`` for the given file paths (from their last
        commits) as a string. Raises a ``GitRepoUninitialized`` exception if not
        a git repository. This diff can be applied using ``git apply``.

        Parameters
        ----------
        *args : str, optional
            File paths relative to the root of the git directory whose diffs
            should be taken. If no args are provided, the result will always be
            an empty string.

        Returns
        -------
        diff : str
            The exact text returned by ``git diff -- ARG1 ARG2...`` for the
            provided arguments. An empty string is returned if none of the file
            contents of the given paths have changed since the last commit OR
            if no paths are specified (note that this differs from standard
            ``git diff`` behavior, where ALL diffs from the last commit are
            provided if no arguments are specified).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If the ``git diff`` command fails.
        """
        self._require_repo()
        if not args:
            return ""
        proc = self('diff', '--', *args)
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get ``git diffs`` for filenames "
                              "{} in event directory {}.\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(args, self.eventdir,
                                                      self._text(res),
                                                      self._text(err)))
        return bytes2str(res)

    def is_ancestor(self, possible_ancestor_hash, commit_hash):
        """Check whether ``possible_ancestor_hash`` is a topological ancestor
        of ``commit_hash``. Returns True if the hashes refer to the same
        commit. Raises a ``GitRepoUninitialized`` exception if not a git
        repository. Useful for figuring out if one commit came after another
        (from a data flow perspective).

        Returns
        -------
        is_ancestor : bool
            True if ``possible_ancestor_hash`` is an ancestor of
            ``commit_hash``, False otherwise. NOTE that a value of False does
            not imply that ``commit_hash`` is an ancestor of
            ``possible_ancestor_hash`` (since they can be from different
            branches altogether).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If ``git merge-base --is-ancestor`` exits with a status other than
            0 (is an ancestor) or 1 (is not an ancestor).
        """
        self._require_repo()
        proc = self(
            'merge-base',
            '--is-ancestor',
            possible_ancestor_hash,
            commit_hash
        )
        res, err = proc.communicate()
        if proc.returncode == 0:
            return True
        if proc.returncode == 1:
            return False
        raise ValueError(("Error while seeing if {} is a git ancestor of {} "
                          "using ``git merge-base --is-ancestor PARENT "
                          "CHILD`` in event directory {}.\nSTDOUT:\n{}\n"
                          "STDERR:\n{}\n").format(possible_ancestor_hash,
                                                  commit_hash, self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))

507 

508 

509class GitDirMixin(object): 

510 """ 

511 A mixin for ``EventTuple`` and ``FileHandlerTuple`` subclasses that allows 

512 you to manipulate their event directories through a ``git`` property 

513 returning a ``GitHandler`` pointing to that property. 

514 """ 

515 

516 @property 

517 def git(self): 

518 """Get a ``GitHandler`` for manipulating the ``eventdir`` as a git 

519 repository. Used for versioning events.""" 

520 return GitHandler(self.eventdir) 

521 

522 @staticmethod 

523 def decorate_checkout(func): 

524 """ 

525 Commit the state of the event before file generation attempt to the 

526 event's history and proceed with checkout. 

527 """ 

528 

529 @functools.wraps(func) 

530 def wrapper(self, *args, **kwargs): 

531 """ 

532 Commit the state of the event before file generation attempt to the 

533 event's history and proceed with checkout. 

534 """ 

535 self.git.init() 

536 if not self.git.is_clean(): 

537 self.git.commit_changes(f"Changes before generating {self}") 

538 tmp_self = func(self, *args, **kwargs) 

539 if not self.git.is_clean(): 

540 raise GenerationError("Changes occured to event directory " 

541 f"for {self} while files were being " 

542 "checked out; aborting file generation.") 

543 return tmp_self 

544 

545 return wrapper 

546 

547 @staticmethod 

548 def decorate_checkin(func): 

549 """ 

550 If generation and check in succeeded, commit changes to event history. 

551 """ 

552 

553 @functools.wraps(func) 

554 def wrapper(self, gen_result, *args, **kwargs): 

555 """ 

556 If generation and check in succeeded, commit changes to event 

557 history. ``gen_result`` here refers to the ``GenerationResult`` to 

558 be checked in. 

559 """ 

560 self.git.init() 

561 if not self.git.is_clean(): 

562 self.git.commit_changes(f"Changes before checking in {self}") 

563 # it's possible that a later version of the file beat us to 

564 # check-in. we should make sure that whatever's in the event 

565 # directory is not obsolete before trying to replace it. 

566 if self.exists() and not self.is_obsolete(): 

567 msg = (f"Current version of {self} is not obsolete; " 

568 "it's possible that a more recent update of the " 

569 "file finished before this attempt. Giving up " 

570 f"on checking in {gen_result}.") 

571 LOGGER.error(msg) 

572 raise GenerationError(msg) 

573 try: 

574 result = func(self, gen_result, *args, **kwargs) 

575 except: # noqa 

576 LOGGER.error("Rolling back to last commit.") 

577 proc = self.git.reset_hard() 

578 LOGGER.error(f"Rolled back.") 

579 raise 

580 commit_msg = ( 

581 ("Done regenerating {}. Files added from manifest:" 

582 "\n\n{}\n").format(self, '\n'.join(self.manifest)) 

583 ) 

584 self.git.commit_changes(commit_msg) 

585 return result 

586 

587 return wrapper