import os
import datetime
import logging
import typing

from bson import ObjectId
from gcloud.storage.client import Client
import gcloud.storage.blob
import gcloud.exceptions as gcloud_exc
from flask import current_app, g
from werkzeug.local import LocalProxy

from pillar.api import utils
from .abstract import Bucket, Blob, FileType

log = logging.getLogger(__name__)


def get_client() -> Client:
    """Stores the GCS client on the global Flask object.

    The GCS client is not user-specific anyway.
    """

    _gcs = getattr(g, '_gcs_client', None)
    if _gcs is None:
        _gcs = g._gcs_client = Client()
    return _gcs


# This hides the specifics of how/where we store the GCS client,
# and allows the rest of the code to use 'gcs' as a simple variable
# that does the right thing.
gcs: Client = LocalProxy(get_client)
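
# A minimal usage sketch (hypothetical; requires an active Flask app context):
#
#     with app.app_context():
#         bucket = gcs.get_bucket('some-project-bucket')
#
# The proxy resolves to the client cached on `g`, so repeated accesses within
# one application context reuse the same Client instance.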


class GoogleCloudStorageBucket(Bucket):
    """Cloud Storage bucket interface. We create a bucket for every project.
    In the bucket we create the following first-level subdirectories:
    - '_' (contains hashed assets, and stays at the top of the default listing)
    - 'svn' (SVN checkout mirror)
    - 'shared' (any additional folder or static file that is accessed via a
      node of the 'storage' node_type)

    :type name: string
    :param name: Name of the bucket.

    :type subdir: string
    :param subdir: The local entry point to browse the bucket.
    """

    backend_name = 'gcs'

    def __init__(self, name: str, subdir='_') -> None:
        super().__init__(name=name)

        self._log = logging.getLogger(f'{__name__}.GoogleCloudStorageBucket')

        try:
            self._gcs_bucket = gcs.get_bucket(name)
        except gcloud_exc.NotFound:
            self._gcs_bucket = gcs.bucket(name)
            # Hardcode the bucket location to EU
            self._gcs_bucket.location = 'EU'
            # Optionally enable CORS from * (currently only used for vrview)
            # self._gcs_bucket.cors = [
            #     {
            #         "origin": ["*"],
            #         "responseHeader": ["Content-Type"],
            #         "method": ["GET", "HEAD", "DELETE"],
            #         "maxAgeSeconds": 3600
            #     }
            # ]
            self._gcs_bucket.create()
            log.info('Created GCS bucket for project %s', name)

        self.subdir = subdir

    def blob(self, blob_name: str) -> 'GoogleCloudStorageBlob':
        return GoogleCloudStorageBlob(name=blob_name, bucket=self)

    def get_blob(self, internal_fname: str) -> typing.Optional['GoogleCloudStorageBlob']:
        blob = self.blob(internal_fname)
        if not blob.gblob.exists():
            return None
        return blob
    def _gcs_get(self, path: str, *, chunk_size=None) -> gcloud.storage.Blob:
        """Return the GCS blob for the given path inside the bucket's subdir.

        Note that this constructs the blob object locally, without checking
        whether it actually exists on Google Cloud Storage.

        :param path: The path to the file, relative to the bucket's subdir.
        """
        path = os.path.join(self.subdir, path)
        blob = self._gcs_bucket.blob(path, chunk_size=chunk_size)
        return blob
    def _gcs_post(self, full_path, *, path=None) -> typing.Optional[gcloud.storage.Blob]:
        """Create a new blob and upload data to it."""
        path = path if path else os.path.join(self.subdir, os.path.basename(full_path))
        gblob = self._gcs_bucket.blob(path)
        if gblob.exists():
            self._log.error(f'Trying to upload to {path}, but that blob already exists. '
                            f'Not uploading.')
            return None

        gblob.upload_from_filename(full_path)
        return gblob
        # return self.blob_to_dict(blob)  # Has issues with threading
    def delete_blob(self, path: str) -> bool:
        """Deletes the blob (when removing an asset or replacing a preview)."""

        # We want to get the actual blob to delete
        gblob = self._gcs_get(path)
        try:
            gblob.delete()
            return True
        except gcloud_exc.NotFound:
            return False
    def copy_blob(self, blob: Blob, to_bucket: Bucket):
        """Copies the given blob from this bucket to the other bucket.

        Returns the new blob.
        """

        assert isinstance(blob, GoogleCloudStorageBlob)
        assert isinstance(to_bucket, GoogleCloudStorageBucket)

        self._log.info('Copying %s to bucket %s', blob, to_bucket)

        return self._gcs_bucket.copy_blob(blob.gblob, to_bucket._gcs_bucket)
    def rename_blob(self, blob: 'GoogleCloudStorageBlob', new_name: str) \
            -> 'GoogleCloudStorageBlob':
        """Rename the blob, returning the new Blob."""

        assert isinstance(blob, GoogleCloudStorageBlob)

        new_name = os.path.join(self.subdir, new_name)

        self._log.info('Renaming %s to %r', blob, new_name)
        new_gblob = self._gcs_bucket.rename_blob(blob.gblob, new_name)
        return GoogleCloudStorageBlob(new_gblob.name, self, gblob=new_gblob)
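
# Example usage, as a hypothetical sketch (the bucket name and blob path are
# illustrative only; project buckets are named after the project's ObjectId):
#
#     bucket = GoogleCloudStorageBucket('5672beecc0261b2005ed1a33')
#     blob = bucket.get_blob('some-hashed-name.png')
#     if blob is not None:
#         print(blob.get_url(is_public=False))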


class GoogleCloudStorageBlob(Blob):
    """GCS blob interface."""

    def __init__(self, name: str, bucket: GoogleCloudStorageBucket,
                 *, gblob: gcloud.storage.blob.Blob = None) -> None:
        super().__init__(name, bucket)

        self._log = logging.getLogger(f'{__name__}.GoogleCloudStorageBlob')
        # Chunk size for resumable transfers must be a multiple of 256 KiB;
        # this is 512 KiB.
        self.gblob = gblob or bucket._gcs_get(name, chunk_size=256 * 1024 * 2)

    def create_from_file(self, file_obj: FileType, *,
                         content_type: str,
                         file_size: int = -1) -> None:
        from gcloud.streaming import transfer

        self._log.debug('Streaming file to GCS bucket %r, size=%i', self, file_size)

        # Files larger than this many bytes will be streamed directly from disk,
        # smaller ones will be read into memory and then uploaded.
        transfer.RESUMABLE_UPLOAD_THRESHOLD = 102400
        self.gblob.upload_from_file(file_obj,
                                    size=file_size,
                                    content_type=content_type)

        # Reload the blob to get the file size according to Google.
        self.gblob.reload()
        self._size_in_bytes = self.gblob.size
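
    # A hypothetical call, assuming `uploaded` is a file-like object such as a
    # werkzeug FileStorage stream (names here are illustrative only):
    #     blob.create_from_file(uploaded, content_type='image/png', file_size=1024)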

    def update_filename(self, filename: str):
        """Set the ContentDisposition metadata so that when a file is downloaded
        it has a human-readable name.
        """

        if '"' in filename:
            raise ValueError(f'Filename is not allowed to have double quote in it: {filename!r}')

        self.gblob.content_disposition = f'attachment; filename="{filename}"'
        self.gblob.patch()
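
    # For example, after update_filename('Suzanne.blend'), downloads of this
    # blob are served with the header:
    #     Content-Disposition: attachment; filename="Suzanne.blend"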

    def get_url(self, *, is_public: bool) -> str:
        if is_public:
            return self.gblob.public_url

        expiration = utils.utcnow() + datetime.timedelta(days=1)
        return self.gblob.generate_signed_url(expiration)
    def make_public(self):
        self.gblob.make_public()

    def exists(self) -> bool:
        # Reload to get the actual file properties from Google.
        try:
            self.gblob.reload()
        except gcloud_exc.NotFound:
            return False
        return self.gblob.exists()


def update_file_name(node):
    """Assign to the GCS blob the same name as the asset node. This way, when
    downloading an asset, we get a human-readable name.
    """

    # Process only files that are not currently being processed
    if node['properties'].get('status', '') == 'processing':
        return

    def _format_name(name, override_ext, size=None, map_type=''):
        root, _ = os.path.splitext(name)
        size = '-{}'.format(size) if size else ''
        map_type = '-{}'.format(map_type) if map_type else ''
        return '{}{}{}{}'.format(root, size, map_type, override_ext)
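
    # For example (hypothetical values):
    #     _format_name('wood.png', '.jpg', size='m', map_type='col')
    # returns 'wood-m-col.jpg'.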

    def _update_name(file_id, file_props):
        files_collection = current_app.data.driver.db['files']
        file_doc = files_collection.find_one({'_id': ObjectId(file_id)})

        if file_doc is None or file_doc.get('backend') != 'gcs':
            return

        # For textures -- the map type should be part of the name.
        map_type = file_props.get('map_type', '')

        storage = GoogleCloudStorageBucket(str(node['project']))
        blob = storage.get_blob(file_doc['file_path'])
        if blob is None:
            log.warning('Unable to find blob for file %s in project %s',
                        file_doc['file_path'], file_doc['project'])
            return

        # Pick file extension from original filename
        _, ext = os.path.splitext(file_doc['filename'])
        name = _format_name(node['name'], ext, map_type=map_type)
        blob.update_filename(name)

        # Assign the same name to variations
        for v in file_doc.get('variations', []):
            _, override_ext = os.path.splitext(v['file_path'])
            name = _format_name(node['name'], override_ext, v['size'], map_type=map_type)
            blob = storage.get_blob(v['file_path'])
            if blob is None:
                log.info('Unable to find blob for file %s in project %s. This can happen if the '
                         'video encoding is still processing.', v['file_path'], node['project'])
                continue
            blob.update_filename(name)

    # Currently we search for 'file' and 'files' keys in the object properties.
    # This could become a bit more flexible and rely on a true reference of the
    # file object type from the schema.
    if 'file' in node['properties']:
        _update_name(node['properties']['file'], {})

    if 'files' in node['properties']:
        for file_props in node['properties']['files']:
            _update_name(file_props['file'], file_props)
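
# A sketch of how this hook might be called (hypothetical node document; the
# real dict comes from Eve/MongoDB):
#
#     update_file_name({
#         'name': 'Suzanne',
#         'project': ObjectId('5672beecc0261b2005ed1a33'),
#         'properties': {'status': 'published',
#                        'file': '5672beecc0261b2005ed1a34'},
#     })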