Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# (c) Stefan Countryman 2019
3"""
Classes for versioning files in a given directory. Currently implemented
with git.
6"""
8import os
9import shlex
10import logging
11import tempfile
12import functools
13from subprocess import Popen, PIPE
14from collections import namedtuple
15from llama.utils import GenerationError, bytes2str
16from llama.classes import LOCAL_TMPDIR_NAME
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# The contents of GIT_IGNORE go in the ``.gitignore`` for each eventdir: the
# local temporary directory and everything under ``.webcache``.
GIT_IGNORE = f"{LOCAL_TMPDIR_NAME}\n.webcache/*\n"
class GitRepoUninitialized(ValueError):
    """
    Raised when a git operation is requested on a directory that has not
    been initialized as a git repository.
    """
class GitHandler(namedtuple('GitHandlerTuple', ('eventdir',))):
    """
    A class that performs ``git`` operations on an ``eventdir``.

    You can also call an instance as if it were a function to perform git
    commands conveniently; the interface is the same as ``subprocess.Popen``
    with ``cwd`` set to the ``GitHandler``'s ``eventdir`` (for convenience at
    the command line).

    Parameters
    ----------
    eventdir : str
        The path to the directory that the new ``GitHandler`` instance will
        manipulate.
    """

    def __call__(self, *args, stdout=PIPE, stderr=PIPE, stdin=PIPE, **kwargs):
        """Perform git commands conveniently; like calling ``subprocess.Popen``
        on ``['git']+args`` with ``cwd`` set to the ``GitHandler``'s
        ``eventdir`` (for convenience at the command line) and
        STDOUT/STDERR/STDIN set to ``PIPE``. Allows you to omit everything but
        the arguments you'd pass to ``git``.

        Parameters
        ----------
        *args
            Arguments to pass to ``git`` in the subprocess. Leave ``git`` out
            of this list as it is included automatically.
        **kwargs
            Extra keyword arguments to pass to ``subprocess.Popen``. ``cwd`` is
            set to the event directory and ``stdout``, ``stderr``, and
            ``stdin`` are set to ``subprocess.PIPE`` by default.

        Returns
        -------
        proc : subprocess.Popen
            The subprocess object launched to make the git call.
        """
        # an explicitly passed ``cwd`` wins over the event directory
        kwargs.setdefault('cwd', self.eventdir)
        return Popen(['git'] + list(args), stdout=stdout, stderr=stderr,
                     stdin=stdin, **kwargs)

    @staticmethod
    def _text(raw):
        """Decode subprocess output for use in log and error messages. Never
        raises on undecodable bytes so that the real error is not masked."""
        if isinstance(raw, bytes):
            return raw.decode(errors='replace')
        return raw

    def _require_repo(self):
        """Raise ``GitRepoUninitialized`` unless ``eventdir`` is a git repo."""
        # ``is_repo`` is a method, so it must be *called*; testing the bound
        # method itself is always truthy and would never raise.
        if not self.is_repo():
            raise GitRepoUninitialized(
                "{} is not a git repository.".format(self.eventdir)
            )

    @property
    def eventid(self):
        """Parse an eventid from the eventdir by splitting off the basename."""
        return os.path.split(os.path.abspath(self.eventdir))[1]

    @property
    def current_hash(self):
        """Get the current git hash (``HEAD``) for this directory. Raises a
        ``GenerationError`` if ``git rev-parse HEAD`` fails."""
        proc = self('rev-parse', 'HEAD')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not parse git version for eventdir "
                                   "{}.\nSTDOUT:\n{}\nSTDERR:\n{}"
                                   "\n").format(self.eventdir,
                                                self._text(res),
                                                self._text(err)))
        return res.decode().strip()

    def filename_for_download(self, filename, last_hash=None):
        """Get a filename that includes the ``eventid``, revision number, and
        version hash for ``filename`` (i.e. what version number this is in the
        version history; e.g. if three versions of *this* file exist in the
        version history, then this is version 3). If this ``filename`` does not
        appear in the git history, it will be marked 'v0' and the hash will be
        'UNVERSIONED'. The output format is ``eventid``, version, first 7
        digits of commit hash, and ``filename``, split by hyphens, so that the
        third version of ``skymap_info.json`` for event ``S1234a`` with git
        hash ``dedb33f`` would be called
        ``S1234a-v3-dedb33f-skymap_info.json``. Use this for file downloads or
        files sent to other services in order to facilitate data product
        tracking outside the highly-organized confines of a pipeline run
        directory.

        Parameters
        ----------
        filename : str
            Path of the file relative to ``eventdir``.
        last_hash : str, optional
            If given, describe the version of ``filename`` at this (possibly
            abbreviated) commit hash instead of the latest one.

        Returns
        -------
        name : str
            The decorated filename.
        """
        ref = self.hashes(filename, last_hash=last_hash)
        # don't use ``serial_version`` here; it would call ``git`` a second
        # time, which is a fairly expensive operation.
        return '{}-v{}-{}-{}'.format(self.eventid, len(ref),
                                     ref[0][:7] if ref else 'UNVERSIONED',
                                     filename)

    def serial_version(self, filename, last_hash=None):
        """The serial version of ``filename`` as stored in the version history.
        Note that this is merely a count of how many versions of the file
        exist in this history; it is not an unambiguous label (in the same way
        that the hash value is). Use this for human interpretation. If the
        file does not exist in the history, this function returns ``0``
        (unversioned), so it effectively starts at ``1``.

        Parameters
        ----------
        filename : str
            Path of the file relative to ``eventdir``.
        last_hash : str, optional
            If given, only count versions up to and including this commit.

        Returns
        -------
        version : int
            The number of commits touching ``filename``.
        """
        return len(self.hashes(filename, last_hash=last_hash))

    def copy_file(self, filename, outpath, commit_hash=None,
                  serial_version=None):
        """Check out a copy of a file, optionally specifying a particular
        version of the file from this event's history, to the given outpath. If
        no version is specified with ``commit_hash`` or ``serial_version``, the
        latest version will be copied.

        Parameters
        ----------
        filename : str
            The relative path to the file from ``self.eventdir`` (in most cases
            just the filename).
        outpath : str
            The path to the output file, or, if this path corresponds to an
            existing directory, the directory in which it should be saved (with
            the ``os.path.basename`` of ``filename``). **If the file exists, it
            will be overwritten without warning**.
        commit_hash : str, optional
            The commit hash, or partial commit hash containing the starting
            characters of the full hash (as long as enough characters are
            provided to disambiguate hashes), of the version of ``filename``
            that is to be copied to ``outpath``. **You can only specify one
            of** ``commit_hash`` **or** ``serial_version``.
        serial_version : int, optional
            The ``serial_version`` (i.e. the numbered version, starting at 1)
            of ``filename`` to checkout. This is potentially more ambiguous
            than using ``commit_hash``. **You can only specify one of**
            ``commit_hash`` **or** ``serial_version``.

        Returns
        -------
        outfile : str
            Path to the final output file.

        Raises
        ------
        GitRepoUninitialized
            If the event directory is not a git directory.
        ValueError
            If both ``commit_hash`` and ``serial_version`` are specified or if
            they do not correspond to available file versions.
        IOError
            If the ``outpath`` cannot be written to.
        FileNotFoundError
            If the file checkout fails.
        """
        self._require_repo()
        if (commit_hash is not None) and (serial_version is not None):
            raise ValueError("Must specify only one of ``commit_hash`` or "
                             "``serial_version``.")
        if commit_hash is not None:
            # expand a (possibly partial) hash to the full one; ``hashes``
            # raises ValueError if it is unknown or ambiguous
            commit_hash = self.hashes(filename, last_hash=commit_hash)[0]
        elif serial_version is not None:
            history = self.hashes(filename)  # newest commit first
            # reject 0/negative values explicitly; negative indexing would
            # otherwise silently select the wrong version
            if not 1 <= serial_version <= len(history):
                raise ValueError(("``serial_version`` {} is not available "
                                  "for {}; {} version(s) exist."
                                  "").format(serial_version, filename,
                                             len(history)))
            # version 1 is the oldest commit, i.e. the last list entry
            commit_hash = history[-serial_version]
        else:
            commit_hash = 'HEAD'
        outpath = os.path.abspath(outpath)
        if os.path.isdir(outpath):
            outpath = os.path.join(outpath, os.path.basename(filename))
        with tempfile.TemporaryDirectory() as tmpdir:
            # NOTE(review): ``git checkout <commit> -- <path>`` also updates
            # the repository index for that path; confirm callers expect it.
            cmd = ['--work-tree={}'.format(tmpdir), 'checkout',
                   commit_hash, '--', filename]
            LOGGER.debug("Checkout to tmpdir ``%s`` from eventdir %s",
                         ' '.join(shlex.quote(w) for w in cmd), self.eventdir)
            proc = self(*cmd)
            _, err = proc.communicate()
            if proc.returncode:
                raise FileNotFoundError(("Checkout failed: ``{}``\nSTDERR:\n"
                                         "{}").format(cmd, self._text(err)))
            tmp = os.path.join(tmpdir, filename)
            # explicit check instead of ``assert`` (asserts vanish under -O)
            if not os.path.isfile(tmp):
                raise FileNotFoundError(("Checkout of {} did not produce a "
                                         "file: ``{}``").format(filename, cmd))
            LOGGER.debug("Copying %s from tmpdir %s to final path %s",
                         filename, tmpdir, outpath)
            with open(outpath, 'wb') as outfile:
                with open(tmp, 'rb') as infile:
                    outfile.write(infile.read())
        return outpath

    def reset_hard(self, ref=None):
        """Hard reset the status of the branch to a given ref, losing all
        subsequent changes. If ``ref`` is not provided, reset to the last
        commit. Raises a ``GenerationError`` if the reset fails."""
        ref = ref if ref is not None else 'HEAD'
        proc = self('reset', '--hard', ref)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not hard reset to {} for eventdir "
                                   "{}.\nSTDOUT:\n{}\nSTDERR:\n{}"
                                   "\n").format(ref, self.eventdir,
                                                self._text(res),
                                                self._text(err)))

    def init(self):
        """Initialize the ``eventdir`` as a git repository and (over)write its
        ``.gitignore`` with ``GIT_IGNORE``. Raises a ``GenerationError`` if
        ``git init`` fails."""
        proc = self('init')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not initialize git repository for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n"
                                   "{}\n").format(self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))
        with open(os.path.join(self.eventdir, '.gitignore'), 'w') as out:
            out.write(GIT_IGNORE)

    def add(self, *files):
        """Run ``git add`` for all ``files``. Raises a ``GitRepoUninitialized``
        exception if not a git repository and a ``GenerationError`` if the
        command fails."""
        self._require_repo()
        proc = self('add', *files)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git add`` files {} for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(files, self.eventdir,
                                              self._text(res),
                                              self._text(err)))

    def remove(self, *files):
        """Run ``git rm`` for all ``files`` and commit the removal. Raises a
        ``GitRepoUninitialized`` exception if not a git repository and a
        ``GenerationError`` if the command fails."""
        self._require_repo()
        LOGGER.debug("``git rm`` %s in %s", files, self.eventdir)
        proc = self('rm', *files)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git rm`` files {} for "
                                   "eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(files, self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        # ``res`` is STDOUT and ``err`` is STDERR; keep the labels matched
        LOGGER.debug("git removed files %s, STDOUT:\n%s\nSTDERR:\n%s", files,
                     self._text(res), self._text(err))
        self.commit_changes("Removed {}".format(files))

    def commit_changes(self, message):
        """``git add`` all files in the ``eventdir`` and commit changes using
        ``message`` as the commit message. Raises a ``GitRepoUninitialized``
        exception if not a git repository. This will FAIL with a
        ``GenerationError`` if there are no new changes."""
        self._require_repo()
        self.add("-u")  # update. deleted files are also removed from git repo.
        self.add(".")   # add all files, modified or new, to the repo.
        LOGGER.debug("``git commit -m`` '%s' in %s", message, self.eventdir)
        proc = self('commit', '-m', message)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not ``git commit`` with message {} "
                                   "for eventdir {}.\nSTDOUT:\n{}\nSTDERR:\n"
                                   "{}\n").format(message, self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))

    def show_log(self, ref='HEAD'):
        """Return the output of ``git log -1`` for the given ``ref`` as a
        string. Raises a ``GitRepoUninitialized`` exception if not a git
        repository and a ``GenerationError`` if the command fails."""
        self._require_repo()
        LOGGER.debug("``git log -1`` ref %s in %s", ref, self.eventdir)
        proc = self('log', '-1', ref)
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Error showing log from {} in eventdir {}."
                                   "\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(ref, self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        return res.decode()

    def is_clean(self):
        """Return ``True`` if no changes have been made to the ``eventdir``
        since the last commit (``git status --porcelain`` prints nothing).
        Raises a ``GitRepoUninitialized`` exception if not a git repository
        and a ``GenerationError`` if the status cannot be read."""
        self._require_repo()
        proc = self('status', '--porcelain')
        res, err = proc.communicate()
        if proc.returncode:
            raise GenerationError(("Could not get git status in eventdir {}."
                                   "\nSTDOUT:\n{}\nSTDERR:\n{}\n"
                                   "").format(self.eventdir,
                                              self._text(res),
                                              self._text(err)))
        return not res  # if response is empty, no changes

    def is_repo(self):
        """Check whether this event directory is a git repo by seeing if it
        contains a ``.git`` subdirectory. This is a regular method and must be
        called (``handler.is_repo()``); it never raises.

        Returns
        -------
        is_repo : bool
            ``True`` if ``eventdir/.git`` is a directory.
        """
        return os.path.isdir(os.path.join(self.eventdir, '.git'))

    def text_graph(self, *filenames, style='html'):
        """Return a text graph (``git log --graph``) of the past history.

        Parameters
        ----------
        *filenames : str, optional
            An arbitrary list of filenames that will be spliced onto the end of
            the argument list for ``git log``. Use this to narrow down the
            history shown. Use ``--`` to specify all files in the past history
            of the ``HEAD`` state.
        style : str, optional
            The format to put the output in. Options include 'html' (if this is
            going to go on a summary page). An unknown style raises
            ``KeyError``.

        Returns
        -------
        graph : str
            The decoded output of ``git log``.

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If ``git log`` fails.
        """
        self._require_repo()
        fmt = {
            'html': (r'<a href="?hash=%H"><span class="lg2-bold-blue">'
                     r'%h</span></a> - '
                     r'<span class="lg2-bold-cyan">%aD</span> '
                     r'<span class="lg2-bold-green">(%ar)</span>%d%n'
                     r' <span class="lg2-white">%s</span> '
                     r'<span class="lg2-dim-white">- %an</span>'),
        }
        # always separate paths from options with ``--``
        paths = list(filenames)
        if not paths or paths[0] != '--':
            paths.insert(0, '--')
        proc = self(
            'log',
            '--graph',
            '--abbrev-commit',
            '--decorate',
            '--format=' + fmt[style],
            *paths,
        )
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get a text graph for filenames "
                              "{} in event directory {}.\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(paths, self.eventdir,
                                                      self._text(res),
                                                      self._text(err)))
        return res.decode()

    def hashes(self, *filenames, pretty="", last_hash=None):
        """Get a list of full commit hashes for all commits related to the
        provided filenames, newest first. Returns an empty list if no
        filenames are provided or if the filename is not being tracked by git.

        Parameters
        ----------
        filenames : list
            Relative paths from the ``eventdir`` whose commits should be
            retrieved. Returns an empty list if no filenames are specified. To
            match all paths in the commit history, specify '--' as the only
            filename.
        pretty : str, optional
            The git format string specifying what to return for each commit. By
            default, only returns the git hash for each commit pertaining to
            the given ``filenames``. Should produce a single line per commit.
        last_hash : str, optional
            If specified, only return hashes up to and including this hash;
            does not return hashes appearing topologically later than this one.
            This can be a partial hash containing only the starting characters
            of the full hash (e.g. the first 7 characters, as is typically seen
            elsewhere) as long as enough characters are provided to
            disambiguate the available hashes.

        Returns
        -------
        hashes : list
            A list of git checksums for the commits related to the specified
            filenames (or some other per-commit string whose contents are
            defined by ``pretty``).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If the command cannot be run with the given filenames in the given
            ``eventdir``.
        ValueError
            If the input ``last_hash`` is ambiguous (matches more than one hash)
            or if it matches no hashes.
        """
        self._require_repo()
        if not filenames:
            return []
        paths = list(filenames)
        if paths[0] != '--':
            paths.insert(0, '--')
        # the hash always comes first so ``last_hash`` can be matched even
        # when a custom ``pretty`` format is requested
        pretty_fmt = '%H'
        if pretty:
            pretty_fmt += ' ' + pretty
        # no shell is involved, so the format must NOT be wrapped in quotes;
        # literal quotes would be echoed back around every output line
        cmd = ['log', '--pretty=tformat:' + pretty_fmt, '--topo-order'] + paths
        proc = self(*cmd)
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get a list of hashes for filenames "
                              "{} in event directory {} with CLI arguments\n"
                              "{}\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(paths, self.eventdir,
                                                      cmd, self._text(res),
                                                      self._text(err)))
        if not res:
            return []
        lines = bytes2str(res).strip("\r\n").split("\n")
        if pretty:
            # ``partition`` tolerates commits whose pretty text is empty
            split_lines = [line.partition(' ') for line in lines]
            hashes = [parts[0] for parts in split_lines]
            commits = [parts[2] for parts in split_lines]
        else:
            hashes = commits = lines
        if last_hash is not None:
            matches = [i for i, h in enumerate(hashes)
                       if h.startswith(last_hash)]
            if not matches:
                raise ValueError(("``last_hash`` {} matches none of the hashes "
                                  "({}) for {}").format(last_hash, hashes,
                                                        self))
            if len(matches) > 1:
                raise ValueError(("Ambiguous ``last_hash`` {} matches multiple "
                                  "hashes ({}) from full hash list ({}) for "
                                  "{}").format(last_hash, matches, hashes,
                                               self))
            # newest first, so drop everything before the matching commit
            commits = commits[matches[0]:]
        return commits

    def diff(self, *args):
        """Return the ``git diff`` for the given file paths (from their last
        commits) as a string. Raises a ``GitRepoUninitialized`` exception if not
        a git repository. This diff can be applied using ``git apply``.

        Parameters
        ----------
        *args : str, optional
            File paths relative to the root of the git directory whose diffs
            should be taken. If no args are provided, the result will always be
            an empty string.

        Returns
        -------
        diff : str
            The exact text returned by ``git diff -- ARG1 ARG2...`` for the
            provided arguments. An empty string is returned if none of the file
            contents of the given paths have changed since the last commit OR
            if no paths are specified (note that this differs from standard
            ``git diff`` behavior, where ALL diffs from the last commit are
            provided if no arguments are specified).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If ``git diff`` fails.
        """
        self._require_repo()
        if not args:
            return ""
        proc = self('diff', '--', *args)
        res, err = proc.communicate()
        if proc.returncode:
            raise ValueError(("Could not get ``git diffs`` for filenames "
                              "{} in event directory {}.\nSTDOUT:\n{}\n"
                              "STDERR:\n{}\n").format(args, self.eventdir,
                                                      self._text(res),
                                                      self._text(err)))
        return bytes2str(res)

    def is_ancestor(self, possible_ancestor_hash, commit_hash):
        """Check whether ``possible_ancestor_hash`` is a topological ancestor
        of ``commit_hash``. Returns True if the hashes refer to the same
        commit. Raises a ``GitRepoUninitialized`` exception if not a git
        repository. Useful for figuring out if one commit came after another
        (from a data flow perspective).

        Returns
        -------
        is_ancestor : bool
            True if ``possible_ancestor_hash`` is an ancestor of
            ``commit_hash``, False otherwise. NOTE that a value of False does
            not imply that ``commit_hash`` is an ancestor of
            ``possible_ancestor_hash`` (since they can be from different
            branches altogether).

        Raises
        ------
        GitRepoUninitialized
            If not a git repository.
        ValueError
            If ``git merge-base`` exits with a status other than 0 or 1.
        """
        self._require_repo()
        proc = self(
            'merge-base',
            '--is-ancestor',
            possible_ancestor_hash,
            commit_hash
        )
        res, err = proc.communicate()
        # exit status 0 means "is an ancestor", 1 means "is not"; anything
        # else is a genuine error
        if proc.returncode == 0:
            return True
        if proc.returncode == 1:
            return False
        raise ValueError(("Error while seeing if {} is a git ancestor of {} "
                          "using ``git merge-base --is-ancestor PARENT "
                          "CHILD`` in event directory {}.\nSTDOUT:\n{}\n"
                          "STDERR:\n{}\n").format(possible_ancestor_hash,
                                                  commit_hash, self.eventdir,
                                                  self._text(res),
                                                  self._text(err)))
class GitDirMixin(object):
    """
    Mixin for ``EventTuple`` and ``FileHandlerTuple`` subclasses exposing a
    ``git`` property, i.e. a ``GitHandler`` bound to the instance's
    ``eventdir``, plus decorators that version the event directory around
    file checkout and check-in.
    """

    @property
    def git(self):
        """A ``GitHandler`` for treating ``self.eventdir`` as a git
        repository. Used for versioning events."""
        return GitHandler(self.eventdir)

    @staticmethod
    def decorate_checkout(func):
        """
        Wrap a checkout method so that the event's state is committed to its
        history before the checkout proceeds.
        """

        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            """
            Initialize the repo, commit any pending changes, run the checkout,
            and fail if the event directory changed in the meantime.
            """
            self.git.init()
            dirty_before = not self.git.is_clean()
            if dirty_before:
                self.git.commit_changes(f"Changes before generating {self}")
            checked_out = func(self, *args, **kwargs)
            if self.git.is_clean():
                return checked_out
            # something touched the event directory during the checkout
            raise GenerationError("Changes occured to event directory "
                                  f"for {self} while files were being "
                                  "checked out; aborting file generation.")

        return wrapper

    @staticmethod
    def decorate_checkin(func):
        """
        Wrap a check-in method so that a successful check-in is committed to
        the event history and a failed one is rolled back.
        """

        @functools.wraps(func)
        def wrapper(self, gen_result, *args, **kwargs):
            """
            Commit pending changes, refuse to replace a non-obsolete file,
            run the check-in of ``gen_result`` (the ``GenerationResult`` to be
            checked in), and commit the outcome; hard reset on any failure.
            """
            self.git.init()
            if not self.git.is_clean():
                self.git.commit_changes(f"Changes before checking in {self}")
            # a later version of the file may have beaten us to check-in, so
            # make sure whatever is in the event directory is obsolete before
            # trying to replace it.
            if self.exists() and not self.is_obsolete():
                msg = (f"Current version of {self} is not obsolete; "
                       "it's possible that a more recent update of the "
                       "file finished before this attempt. Giving up "
                       f"on checking in {gen_result}.")
                LOGGER.error(msg)
                raise GenerationError(msg)
            try:
                result = func(self, gen_result, *args, **kwargs)
            except BaseException:
                # deliberately catches everything: roll back, then re-raise
                LOGGER.error("Rolling back to last commit.")
                self.git.reset_hard()
                LOGGER.error("Rolled back.")
                raise
            template = ("Done regenerating {}. Files added from manifest:"
                        "\n\n{}\n")
            self.git.commit_changes(
                template.format(self, '\n'.join(self.manifest))
            )
            return result

        return wrapper