Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# (c) Stefan Countryman, 2019

"""
Tools for uploading manifests of data files to a DigitalOcean
Spaces/Amazon S3 object storage solution (for later user installation using
``llama install``).
"""
9import logging
10from hashlib import sha256
11from llama.com.s3 import DEFAULT_ENDPOINT, DEFAULT_BUCKET, upload_file
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Default glob pattern: recursively match everything below the root directory.
DEFAULT_GLOB = '**/*'
# Default root directory to upload from: the current working directory.
DEFAULT_ROOT = '.'
# Default prefix prepended to each object's key (ends in '/' so that it looks
# like a containing directory on the object store).
DEFAULT_KEY_ROOT = 'llama/objects/'
# Number of bytes read per chunk when hashing files (2**20 bytes = 1 MiB).
BLOCKSIZE = 2**20
def _sha256_of_file(path, blocksize: int = BLOCKSIZE) -> str:
    """Return the hex sha256 digest of the file at ``path``, read in chunks."""
    digest = sha256()
    with open(path, 'rb') as infile:
        # files might be large, so read them in chunks in case memory is small
        for chunk in iter(lambda: infile.read(blocksize), b''):
            digest.update(chunk)
    return digest.hexdigest()


def upload_and_get_manifest(root: str = DEFAULT_ROOT, glob: str = DEFAULT_GLOB,
                            key_prefix: str = DEFAULT_KEY_ROOT,
                            key_uses_relpath: bool = False,
                            bucket: str = DEFAULT_BUCKET, public: bool = True,
                            endpoint_url: str = DEFAULT_ENDPOINT,
                            dry_run: bool = False, **kwargs):
    """Upload files from the specified path to DigitalOcean Spaces/AWS S3 and
    return a manifest mapping the stored object URLs to local relative paths.
    The sha256 sum of the uploaded file will be the actual filename, allowing
    for versioning of files and avoiding redundant file uploads and downloads,
    with ``key_prefix`` prepended to aid in organization. Use this to offload
    large data files onto separate file storage and generate the MANIFEST
    constant (and related constants) for installation.

    Parameters
    ----------
    root : str, optional
        The path to the root directory that should be uploaded to cloud
        storage. All local paths in the returned manifest will be relative to
        this path as well. Can be relative or absolute.
    glob : str, optional
        The glob specifying which files to match from the provided ``root``. By
        default, recursively matches all files in all subdirectories.
    key_prefix : str, optional
        A prefix to prepend to the uploaded files' sha256 sums in order to
        create their object keys (i.e. remote filenames). Note that this is
        just a prefix, so if you want it to act/look like a containing
        directory for uploaded files, you will need to make sure it ends with
        ``/``.
    key_uses_relpath : bool, optional
        If ``True``, put the relative filepath from ``root`` of each file as a
        prefix in front of the sha256 sum when generating the key. In the
        filesystem analogy, this would put your remote files (on DigitalOcean,
        at least) at ``/<bucket>/<key_prefix>/<relative-path>/<sha256sum>``.
        The relative path is always written with ``/`` separators in the key,
        regardless of the local platform. Use this if you want it to be easier
        to find the file at a glance/want to organize things by filename on
        the object store (e.g. for one-off uploads); don't use this if you're
        planning on organizing things with the returned ``manifest``.
    bucket : str, optional
        The DigitalOcean Spaces/AWS S3 bucket to upload files to. For
        DigitalOcean this is just the name of the directory in your root Spaces
        directory.
    public : bool, optional
        Whether to make files public. If you specify ``public=False``, the
        uploaded files will have ``None`` as their remote URLs in the returned
        manifest (which should not be surprising, since the returned manifest
        is intended for unauthenticated downloads). You want this to be
        ``True`` if you are uploading files for the purpose of public
        distribution.
    endpoint_url : str, optional
        The ``endpoint_url`` argument for ``llama.com.s3.get_client``.
        Specifies which S3 service you are using.
    dry_run : bool, optional
        If ``True``, don't upload any files. Instead, log where each file
        would be uploaded and return the manifest that would be generated.
        Use this to see where your files will be uploaded before actually
        doing it.
    **kwargs
        Keyword arguments to pass to ``llama.com.s3.get_client`` that set
        authentication parameters and choose the target space for uploads;
        see documentation for that function for details.

    Returns
    -------
    manifest : Dict[str, Tuple[str, str]]
        A dictionary whose keys are local paths of uploaded files relative to
        the ``root`` argument and whose values are tuples of the remote upload
        URL and sha256 sum of the file described by the key. Use this manifest
        to later download and install the correct versions of the uploaded
        files with the correct directory structure. Looks like ``{filename:
        (url, sha256sum)}``.

    Examples
    --------
    Try uploading some dummy files with known contents to a remote test
    directory to confirm that you have access rights.

    >>> # coding: utf-8
    >>> import os
    >>> from llama.dev.upload import upload_and_get_manifest
    >>> from tempfile import TemporaryDirectory
    >>> from pathlib import Path
    >>> from requests import get
    >>> from hashlib import sha256
    >>> with TemporaryDirectory() as tmpdirpath:
    ...     tmpdir = Path(tmpdirpath)
    ...     with open(tmpdir/'foo', 'w') as foo:
    ...         _ = foo.write('bar')
    ...     with open(tmpdir/'baz', 'w') as baz:
    ...         _ = baz.write('quux')
    ...     manifest = upload_and_get_manifest(root=tmpdirpath, bucket='test',
    ...                                        key_prefix='llama/dev/upload/',
    ...                                        public=True)
    >>> sha256(get(manifest['foo'][0]).content).hexdigest()
    'fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9'
    >>> sha256(get(manifest['baz'][0]).content).hexdigest()
    '053057fda9a935f2d4fa8c7bc62a411a26926e00b491c07c1b2ec1909078a0a2'
    """
    from pathlib import Path
    manifest = dict()
    for filepath in Path(root).glob(glob):
        # the glob can also match directories; only regular files are uploaded
        if not filepath.is_file():
            continue
        relative = filepath.relative_to(root)
        relpath = str(relative)
        shasum = _sha256_of_file(filepath)
        if key_uses_relpath:
            # object keys use '/' as their separator, so never leak the local
            # platform's path separator (e.g. '\\' on Windows) into the key
            key = key_prefix + relative.as_posix() + '/' + shasum
        else:
            key = key_prefix + shasum
        # kwargs carry authentication parameters, so only log their names
        LOGGER.debug("Uploading %s to bucket %s, key %s, public %s, kwargs %s",
                     str(filepath), bucket, key, public, sorted(kwargs))
        if dry_run:
            target = f"{endpoint_url}/{bucket}/{key}"
            LOGGER.info("dry_run=True, skipping upload to %s", target)
            # mirror the documented contract: non-public files get no URL
            url = target if public else None
        else:
            # NOTE(review): assumes upload_file forwards extra keyword
            # arguments to get_client, as documented for **kwargs above;
            # without this the endpoint_url argument is silently ignored.
            url = upload_file(str(filepath), key, bucket=bucket, public=public,
                              endpoint_url=endpoint_url, **kwargs)
        manifest[relpath] = (url, shasum)
    return manifest