Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# (c) Stefan Countryman, 2019 

2 

3""" 

4Tools for uploading manifests of data files to a DigitalOcean 

5Spaces/Amazon S3 object storage solution (for later user installation using 

6``llama install``). 

7""" 

8 

9import logging 

10from hashlib import sha256 

11from llama.com.s3 import DEFAULT_ENDPOINT, DEFAULT_BUCKET, upload_file 

12 

13LOGGER = logging.getLogger(__name__) 

14DEFAULT_GLOB = '**/*' 

15DEFAULT_ROOT = '.' 

16DEFAULT_KEY_ROOT = 'llama/objects/' 

17BLOCKSIZE = 2**20 # 1MB at a time 

18 

19 

20def upload_and_get_manifest(root: str = DEFAULT_ROOT, glob: str = DEFAULT_GLOB, 

21 key_prefix: str = DEFAULT_KEY_ROOT, 

22 key_uses_relpath: bool = False, 

23 bucket: str = DEFAULT_BUCKET, public: bool = True, 

24 endpoint_url: str = DEFAULT_ENDPOINT, 

25 dry_run: bool = False, **kwargs): 

26 """Upload files from the specified path to DigitalOcean Spaces/AWS S3 and 

27 return a manifest mapping the stored object URLs to local relative paths. 

28 The sha256 sum of the uploaded file will be the actual filename, allowing 

29 for versioning of files and avoiding redundant file uploads and downloads, 

30 with ``key_prefix`` prepended to aid in organization. Use this to offload 

31 large data files onto separate file storage and generate the MANIFEST 

32 constant (and related constants) for installation. 

33 

34 Parameters 

35 ---------- 

36 root : str, optional 

37 The path to the root directory that should be uploaded to cloud 

38 storage. All local paths in the returned manifest will be relative to 

39 this path as well. Can be relative or absolute. 

40 glob : str, optional 

41 The glob specifying which files to match from the provided ``root``. By 

42 default, recursively matches all files in all subdirectories. 

43 key_prefix : str, optional 

44 A prefix to prepend to the uploaded files' sha256 sums in order to 

45 create their object keys (i.e. remote filenames). Note that this is 

46 just a prefix, so if you want it to act/look like a containing 

47 directory for uploaded files, you will need to make sure it ends with 

48 ``/``. 

49 key_uses_relpath : bool, optional 

50 If ``True``, put the relative filepath from ``root`` of each file as a 

51 prefix in front of the sha256 sum when generating the key. In the 

52 filesystem analogy, this would put your remove files (on DigitalOcean, 

53 at least) at ``/<bucket>/<key_prefix>/<relative-path>/<sha256sum>``. 

54 Use this if you want it to be easier to find the file at a glance/want 

55 to organize things by filename on the object store (e.g. for one-off 

56 uploads); don't use this if you're planning on organizing things with 

57 the returned ``manifest``. 

58 bucket : str, optional 

59 The DigitalOcean Spaces/AWS S3 bucket to upload files to. For 

60 DigitalOcean this is just the name of the directory in your root Spaces 

61 directory. 

62 public : str, optional 

63 Whether to make files public. If you specify ``public=False``, the 

64 uploaded files will have ``None`` as their remote URLs in the returned 

65 manifest (which should not be surprising, since the returned manifest 

66 is intended for unauthenticated downloads). You want this to be 

67 ``True`` if you are uploading files for the purpose of public 

68 distribution. 

69 endpoint_url : str, optional 

70 The ``endpoint_url`` argument for ``llama.com.s3.get_client``. 

71 Specifies which S3 service you are using. 

72 dry_run : bool, optional 

73 If provided, don't upload the file. Instead, print the manifest that 

74 would be generated and quit. Use this to see where your files will be 

75 uploaded before actually doing it. 

76 **kwargs 

77 Keyword arguments to pass to ``llama.com.s3.get_client`` that set 

78 authentication parameters and choose the target space for uploads; 

79 see documentation for that function for details. 

80 

81 Returns 

82 ------- 

83 manifest : Dict[str, Tuple[str, str]] 

84 A dictionary whose keys are local paths of uploaded files relative to 

85 the ``root`` argument and whose values are tuples of the remote upload 

86 URL and sha256 sum of the file described by the key. Use this manifest 

87 to later download and install the correct versions of the uploaded 

88 files with the correct directory structure. Looks like ``{filename: 

89 (url, sha256sum)}``. 

90 

91 Examples 

92 -------- 

93 Try uploading some dummy files with known contents to a remote test 

94 directory to confirm that you have access rights. 

95 

96 >>> # coding: utf-8 

97 >>> import os 

98 >>> from llama.dev.upload import upload_and_get_manifest 

99 >>> from tempfile import TemporaryDirectory 

100 >>> from pathlib import Path 

101 >>> from requests import get 

102 >>> from hashlib import sha256 

103 >>> with TemporaryDirectory() as tmpdirpath: 

104 ... tmpdir = Path(tmpdirpath) 

105 ... with open(tmpdir/'foo', 'w') as foo: 

106 ... _ = foo.write('bar') 

107 ... with open(tmpdir/'baz', 'w') as baz: 

108 ... _ = baz.write('quux') 

109 ... manifest = upload_and_get_manifest(root=tmpdirpath, bucket='test', 

110 ... key_prefix='llama/dev/upload/', 

111 ... public=True) 

112 >>> sha256(get(manifest['foo'][0]).content).hexdigest() 

113 'fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9' 

114 >>> sha256(get(manifest['baz'][0]).content).hexdigest() 

115 '053057fda9a935f2d4fa8c7bc62a411a26926e00b491c07c1b2ec1909078a0a2' 

116 """ 

117 from pathlib import Path 

118 manifest = dict() 

119 for filepath in Path(root).glob(glob): 

120 if not filepath.is_file(): 

121 continue 

122 relpath = str(filepath.relative_to(root)) 

123 # files might be large, so read them in chunks in case memory is small 

124 with open(filepath, 'rb') as infile: 

125 sha = sha256() 

126 file_buffer = infile.read(BLOCKSIZE) 

127 while len(file_buffer) > 0: 

128 sha.update(file_buffer) 

129 file_buffer = infile.read(BLOCKSIZE) 

130 shasum = sha.hexdigest() 

131 if key_uses_relpath: 

132 key = key_prefix+relpath+'/'+shasum 

133 else: 

134 key = key_prefix+shasum 

135 LOGGER.debug("Uploading %s to bucket %s, key %s, public %s, kwargs %s", 

136 str(filepath), bucket, key, public, kwargs) 

137 if dry_run: 

138 url = f"{endpoint_url}/{bucket}/{key}" 

139 LOGGER.info("dry_run=True, skipping upload to %s", url) 

140 else: 

141 url = upload_file(str(filepath), key, bucket=bucket, public=public, 

142 **kwargs) 

143 manifest[relpath] = (url, shasum) 

144 return manifest