import datetime
import logging
import os
from multiprocessing import Process
from hashlib import md5

import bson.tz_util
import eve.utils
import pymongo
from bson import ObjectId
from eve.methods.patch import patch_internal
from eve.methods.put import put_internal
from flask import Blueprint, safe_join
from flask import jsonify
from flask import request
from flask import abort
from flask import send_from_directory
from flask import url_for, helpers

from application import utils
from application.utils import remove_private_keys
from application.utils.cdn import hash_file_path
from application.utils.encoding import Encoder
from application.utils.gcs import GoogleCloudStorageBucket
from application.utils.imaging import ffmpeg_encode
from application.utils.imaging import generate_local_thumbnails
from application.utils.imaging import get_video_data
from application.utils.storage import push_to_storage

log = logging.getLogger(__name__)

file_storage = Blueprint('file_storage', __name__,
                         template_folder='templates',
                         static_folder='../../static/storage')


@file_storage.route('/gcs/<bucket_name>/<subdir>/')
@file_storage.route('/gcs/<bucket_name>/<subdir>/<path:file_path>')
def browse_gcs(bucket_name, subdir, file_path=None):
    """Browse the content of a Google Cloud Storage bucket."""

    # Initialize the storage client.
    storage = GoogleCloudStorageBucket(bucket_name, subdir=subdir)
    if file_path:
        # If a file_path was provided, try to fetch the file itself.
        file_object = storage.Get(file_path)
        if file_object:
            # The file exists; return its properties as a dictionary.
            return jsonify(file_object)
        else:
            # Otherwise treat the path as a directory. We always return a
            # listing, even if the directory does not exist (the listing
            # will simply be empty); this could be changed to abort(404)
            # later.
            listing = storage.List(file_path)
            return jsonify(listing)
    else:
        listing = storage.List('')
        return jsonify(listing)
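

# Example requests (hypothetical bucket/paths; the blueprint is mounted
# under /storage in setup_app() below):
#   GET /storage/gcs/my-bucket/textures/img/tex.png  -> blob properties JSON
#   GET /storage/gcs/my-bucket/textures/             -> directory listing JSON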


# @file_storage.route('/build_thumbnails/<path:file_path>')
def build_thumbnails(file_path=None, file_id=None):
    """Given a file path or file ObjectId pointing to an image file, fetch it
    and generate a set of predefined variations (using
    generate_local_thumbnails). Return a list of dictionaries containing the
    various image and variation properties.
    """

    from application import app

    files_collection = app.data.driver.db['files']
    if file_path:
        # Find the file document by its path.
        file_ = files_collection.find({'file_path': file_path})
        file_ = file_[0]

    if file_id:
        file_ = files_collection.find_one({'_id': ObjectId(file_id)})
        file_path = file_['name']

    file_full_path = safe_join(safe_join(app.config['SHARED_DIR'], file_path[:2]),
                               file_path)
    # Does the original file exist?
    if not os.path.isfile(file_full_path):
        return "", 404
    else:
        thumbnails = generate_local_thumbnails(file_full_path,
                                               return_image_stats=True)
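
    # Each value from generate_local_thumbnails() is assumed to carry at
    # least file_path, width, height, content_type, length and md5, plus an
    # optional 'exists' flag (inferred from how the loop below uses it).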
    file_variations = []
    for size, thumbnail in thumbnails.iteritems():
        if thumbnail.get('exists'):
            # A thumbnail of this size was already made; skip it.
            continue
        basename = os.path.basename(thumbnail['file_path'])
        root, ext = os.path.splitext(basename)
        file_variation = dict(
            size=size,
            format=ext[1:],
            width=thumbnail['width'],
            height=thumbnail['height'],
            content_type=thumbnail['content_type'],
            length=thumbnail['length'],
            md5=thumbnail['md5'],
            file_path=basename,
        )
        # XXX Inject is_public for size 't' (it should be part of the
        # upload). Currently we set it here, and then again on the fly
        # during blob creation, by simply parsing the file extension.
        # This is bad.
        if size == 't':
            file_variation['is_public'] = True

        file_variations.append(file_variation)

    return file_variations


@file_storage.route('/file', methods=['POST'])
@file_storage.route('/file/<path:file_name>', methods=['GET', 'POST'])
def index(file_name=None):
    from application import app

    # GET file -> read it
    if request.method == 'GET':
        return send_from_directory(app.config['STORAGE_DIR'], file_name)

    # POST file -> save it

    # Sanitize the filename; source: http://stackoverflow.com/questions/7406102/
    file_name = request.form['name']
    keepcharacters = {' ', '.', '_'}
    file_name = ''.join(
        c for c in file_name if c.isalnum() or c in keepcharacters).strip()
    file_name = file_name.lstrip('.')
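    # For example, a POSTed name of '../../etc/passwd' becomes
    # '....etcpasswd' after the character filter, and 'etcpasswd' once the
    # leading dots are stripped.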
# Determine & create storage directory
|
|
folder_name = file_name[:2]
|
|
file_folder_path = helpers.safe_join(app.config['STORAGE_DIR'], folder_name)
|
|
if not os.path.exists(file_folder_path):
|
|
log.info('Creating folder path %r', file_folder_path)
|
|
os.mkdir(file_folder_path)
|
|
|
|
# Save uploaded file
|
|
file_path = helpers.safe_join(file_folder_path, file_name)
|
|
log.info('Saving file %r', file_path)
|
|
request.files['data'].save(file_path)
|
|
|
|
# TODO: possibly nicer to just return a redirect to the file's URL.
|
|
return jsonify({'url': url_for('file_storage.index', file_name=file_name)})
|
|
|
|
|
|
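

# A hypothetical upload from the command line, assuming the blueprint is
# mounted under /storage as in setup_app() below:
#   curl -F name=photo.jpg -F data=@photo.jpg http://server/storage/file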


def process_file(file_id, src_file):
    """Process the file.

    :param file_id: '_id' key of the file
    :param src_file: POSTed data of the file, lacks private properties.
    """

    from application import app

    src_file = utils.remove_private_keys(src_file)

    filename = src_file['name']
    file_abs_path = safe_join(safe_join(app.config['SHARED_DIR'], filename[:2]), filename)

    if not os.path.exists(file_abs_path):
        log.warning("POSTed file document %r refers to non-existent file on file system %s!",
                    file_id, file_abs_path)
        abort(422, "POSTed file document refers to non-existent file on file system!")

    src_file['length'] = os.stat(file_abs_path).st_size
    content_type = src_file['content_type'].split('/')
    src_file['format'] = content_type[1]
    mime_type = content_type[0]
    src_file['file_path'] = filename

    if mime_type == 'image':
        from PIL import Image
        im = Image.open(file_abs_path)
        src_file['width'], src_file['height'] = im.size
        # Generate previews
        src_file['variations'] = build_thumbnails(file_id=file_id)
    elif mime_type == 'video':
        # Generate variations
        src_video_data = get_video_data(file_abs_path)
        variations = ['mp4', 'webm']
        if src_video_data['duration']:
            src_file['duration'] = src_video_data['duration']

        # Pick the output vertical resolution: sources smaller than 1080
        # lines are encoded at 720p, everything else at 1080p.
        if src_video_data['res_y'] < 1080:
            res_y = 720
        else:
            res_y = 1080

        # Add the variations property to the file.
        src_file['variations'] = []
        # Create the variations. The filename root is computed once, from
        # the original filename; recomputing it inside the loop would stack
        # the '-<res>p' suffixes onto each successive variation.
        root, _ = os.path.splitext(filename)
        for v in variations:
            variation_filename = "{0}-{1}p.{2}".format(root, res_y, v)
            video_duration = None
            if src_video_data['duration']:
                video_duration = src_video_data['duration']

            file_variation = dict(
                size="{0}p".format(res_y),
                duration=video_duration,
                format=v,
                width=src_video_data['res_x'],
                height=src_video_data['res_y'],
                content_type="video/{0}".format(v),
                length=0,  # Available after encoding.
                md5="",  # Available after encoding.
                file_path=variation_filename,
            )
            # Append the file variation.
            src_file['variations'].append(file_variation)
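
        # Hand the actual encoding off to the configured backend:
        # 'zencoder' pushes the source file to remote storage and creates a
        # remote encoding job, while 'local' runs ffmpeg_encode() for every
        # variation in this process.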
        def encode(src_path, src_file, res_y):
            if app.config['ENCODING_BACKEND'] == 'zencoder':
                # Move the source file into place on the remote storage
                # (which can be accessed by Zencoder).
                push_to_storage(str(src_file['project']), src_path)
                j = Encoder.job_create(src_file)
                try:
                    if j:
                        # Add the processing status to the file object.
                        src_file['processing'] = dict(
                            status='pending',
                            job_id="{0}".format(j['process_id']),
                            backend=j['backend'])
                        put_internal('files',
                                     src_file, **{'_id': ObjectId(file_id)})
                except KeyError:
                    pass
            elif app.config['ENCODING_BACKEND'] == 'local':
                for v in src_file['variations']:
                    path = ffmpeg_encode(src_path, v['format'], res_y)
                    # Update the size data after encoding.
                    v['length'] = os.stat(path).st_size

                put_internal('files', src_file, **{'_id': ObjectId(file_id)})
                # When all encodes are done, push the containing directory
                # to storage.
                sync_path = os.path.split(src_path)[0]
                push_to_storage(str(src_file['project']), sync_path)
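
        # Run the encode in a separate process so this request can return
        # without blocking on the encoder.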
        p = Process(target=encode, args=(file_abs_path, src_file, res_y))
        p.start()
    else:
        log.info("POSTed file was of type %r, which isn't thumbnailed/encoded.", mime_type)

    if mime_type != 'video':
        # Sync the whole subdirectory in the background.
        sync_path = os.path.split(file_abs_path)[0]
        p = Process(target=push_to_storage, args=(
            str(src_file['project']), sync_path))
        p.start()

    # Update the original file with additional info, e.g. image resolution.
    put_internal('files', src_file, _id=ObjectId(file_id))


def delete_file(file_item):
    def process_file_delete(file_item):
        """Given a file item, delete the actual file from the storage backend.

        This function could probably be made recursive.
        """
        if file_item['backend'] == 'gcs':
            storage = GoogleCloudStorageBucket(str(file_item['project']))
            storage.Delete(file_item['file_path'])
            # Delete any file variation found in the file_item document.
            if 'variations' in file_item:
                for v in file_item['variations']:
                    storage.Delete(v['file_path'])
            return True
        elif file_item['backend'] == 'pillar':
            pass
        elif file_item['backend'] == 'cdnsun':
            pass
        else:
            pass

    from application import app

    files_collection = app.data.driver.db['files']
    # Collect children (variations) of the original file.
    children = files_collection.find({'parent': file_item['_id']})
    for child in children:
        process_file_delete(child)
    # Finally remove the original file.
    process_file_delete(file_item)


def generate_link(backend, file_path, project_id=None, is_public=False):
    """Hook to check the backend of a file resource, and to build an
    appropriate link that the client can use to retrieve the actual file.
    """
    from application import app

    if backend == 'gcs':
        storage = GoogleCloudStorageBucket(project_id)
        blob = storage.Get(file_path)
        if blob and not is_public:
            link = blob['signed_url']
        elif blob and is_public:
            link = blob['public_url']
        else:
            link = None
    elif backend == 'pillar':
        link = url_for('file_storage.index', file_name=file_path, _external=True,
                       _scheme=app.config['SCHEME'])
    elif backend == 'cdnsun':
        link = hash_file_path(file_path, None)
    elif backend == 'unittest':
        link = md5(file_path).hexdigest()
    else:
        link = None
    return link
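

# For instance, the 'unittest' backend produces a deterministic link:
# generate_link('unittest', 'some/path.jpg') returns the MD5 hex digest of
# the path, which tests can predict without hitting any storage backend.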


def before_returning_file(response):
    ensure_valid_link(response)


def before_returning_files(response):
    for item in response['_items']:
        ensure_valid_link(item)


def ensure_valid_link(response):
    """Ensures the file item has valid file links, using generate_link(...)."""

    # log.debug('Inspecting link for file %s', response['_id'])

    # Check link expiry.
    now = datetime.datetime.now(tz=bson.tz_util.utc)
    if 'link_expires' in response:
        link_expires = response['link_expires']
        if now < link_expires:
            # Not expired yet, so don't bother regenerating anything.
            log.debug('Link expires at %s, which is in the future, so not generating new link',
                      link_expires)
            return

        log.debug('Link expired at %s, which is in the past; generating new link', link_expires)
    else:
        log.debug('No expiry date for link; generating new link')

    _generate_all_links(response, now)


def _generate_all_links(response, now):
    """Generate a new link for the file and all its variations.

    :param response: the file document that should be updated.
    :param now: datetime that reflects 'now', for consistent expiry generation.
    """

    from application import app

    # TODO: add the project id to all file documents.
    project_id = str(response['project']) if 'project' in response else None
    backend = response['backend']
    response['link'] = generate_link(backend, response['file_path'], project_id)
    if 'variations' in response:
        for variation in response['variations']:
            variation['link'] = generate_link(backend, variation['file_path'], project_id)
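
    # FILE_LINK_VALIDITY is assumed to map each backend name to a link
    # lifetime in seconds, e.g. {'gcs': 3600, 'pillar': 3600 * 24}
    # (hypothetical values).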
    # Construct the new expiry datetime.
    validity_secs = app.config['FILE_LINK_VALIDITY'][backend]
    response['link_expires'] = now + datetime.timedelta(seconds=validity_secs)

    patch_info = remove_private_keys(response)
    (patch_resp, _, _, _) = patch_internal('files', patch_info, _id=ObjectId(response['_id']))
    if patch_resp.get('_status') == 'ERR':
        log.warning('Unable to save new links for file %s: %r', response['_id'], patch_resp)
        # TODO: raise a snag.
        response['_updated'] = now
    else:
        response['_updated'] = patch_resp['_updated']


def post_POST_files(request, payload):
    """After a file object has been created, perform the necessary processing
    and further update it.
    """

    if 200 <= payload.status_code < 300:
        import json

        posted_properties = json.loads(request.data)
        private_properties = json.loads(payload.data)
        file_id = private_properties['_id']

        process_file(file_id, posted_properties)


def before_deleting_file(item):
    delete_file(item)


def on_pre_get_files(_, lookup):
    from application import app

    # Override the HTTP header; we always want to fetch the document from MongoDB.
    parsed_req = eve.utils.parse_request('files')
    parsed_req.if_modified_since = None

    # Only process files whose link has already expired.
    now = datetime.datetime.now(tz=bson.tz_util.utc)
    lookup_expired = lookup.copy()
    lookup_expired['link_expires'] = {'$lte': now}
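
    # Note: documents without a 'link_expires' key are not matched here;
    # such files get their first link from ensure_valid_link() when they
    # are served.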

    cursor = app.data.find('files', parsed_req, lookup_expired)
    for file_doc in cursor:
        log.debug('Updating expired links for file %r.', file_doc['_id'])
        _generate_all_links(file_doc, now)


def refresh_links_for_project(project_uuid, chunk_size, expiry_seconds):
    from application import app

    if chunk_size:
        log.info('Refreshing the first %i links for project %s', chunk_size, project_uuid)
    else:
        log.info('Refreshing all links for project %s', project_uuid)

    # Retrieve expired links.
    files_collection = app.data.driver.db['files']

    now = datetime.datetime.now(tz=bson.tz_util.utc)
    expire_before = now + datetime.timedelta(seconds=expiry_seconds)
    log.info('Limiting to links that expire before %s', expire_before)

    to_refresh = files_collection.find(
        {'project': ObjectId(project_uuid),
         'link_expires': {'$lt': expire_before},
         }).sort([('link_expires', pymongo.ASCENDING)]).limit(chunk_size)
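    # Note: pymongo treats limit(0) as "no limit", which is what makes the
    # "refresh all links" branch above work when chunk_size is falsy.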

    if to_refresh.count() == 0:
        log.info('No links to refresh.')
        return

    refreshed = 0
    for file_doc in to_refresh:
        log.debug('Refreshing links for file %s', file_doc['_id'])
        _generate_all_links(file_doc, now)
        refreshed += 1

    log.info('Refreshed %i links', refreshed)
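

# setup_app() wires up the Eve event hooks: refreshing expired links before
# GETs, processing uploads after POSTs, generating links whenever file
# documents are served, and deleting the underlying files on DELETE.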
def setup_app(app, url_prefix):
    app.on_pre_GET_files += on_pre_get_files
    app.on_post_POST_files += post_POST_files

    app.on_fetched_item_files += before_returning_file
    app.on_fetched_resource_files += before_returning_files

    app.on_delete_item_files += before_deleting_file

    app.register_blueprint(file_storage, url_prefix='/storage')