diff --git a/pillar/cli/maintenance.py b/pillar/cli/maintenance.py
index 9f0c764a..f4acd3ea 100644
--- a/pillar/cli/maintenance.py
+++ b/pillar/cli/maintenance.py
@@ -1,13 +1,26 @@
+import copy
 import logging
+import typing
 
 import bson.tz_util
-import copy
 from bson import ObjectId
 from bson.errors import InvalidId
 from flask_script import Manager
 
 from pillar import current_app
 
+# Collections to skip when finding file references (during orphan file detection).
+# This collection can be added to from PillarExtension.setup_app().
+ORPHAN_FINDER_SKIP_COLLECTIONS = {
+    # Skipping the files collection under the assumption that we have no files
+    # referencing other files.
+    'files',
+
+    # Authentication tokens never refer to files, and it's a big collection so
+    # good to skip.
+    'tokens',
+}
+
 log = logging.getLogger(__name__)
 
 manager_maintenance = Manager(
@@ -518,3 +531,118 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
         return 3
 
     handle_project(proj)
+
+
+def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
+    """Finds all non-referenced files for the given project.
+
+    A file is considered referenced when its ObjectId occurs anywhere in a
+    non-deleted document that is either the project itself or has its
+    'project' key set to this project ID. Collections listed in
+    ORPHAN_FINDER_SKIP_COLLECTIONS are not inspected.
+
+    Returns the set of orphan file IDs (empty when the project has no files).
+    """
+
+    log.debug('Finding orphan files for project %s', project_id)
+
+    # Get all file IDs that belong to this project.
+    files_coll = current_app.db('files')
+    cursor = files_coll.find({'project': project_id}, projection={'_id': 1})
+    file_ids = {doc['_id'] for doc in cursor}
+    if not file_ids:
+        log.debug('Project %s has no files', project_id)
+        return set()
+
+    total_file_count = len(file_ids)
+    log.debug('Project %s has %d files in total', project_id, total_file_count)
+
+    def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
+        if isinstance(something, bson.ObjectId):
+            yield something
+        elif isinstance(something, (list, set, tuple)):
+            for item in something:
+                yield from find_object_ids(item)
+        elif isinstance(something, dict):
+            for item in something.values():
+                yield from find_object_ids(item)
+
+    # Find all references by iterating through the project itself and every document that has a
+    # 'project' key set to this ObjectId.
+    db = current_app.db()
+    for coll_name in sorted(db.collection_names(include_system_collections=False)):
+        if coll_name in ORPHAN_FINDER_SKIP_COLLECTIONS:
+            continue
+
+        doc_filter = {'_deleted': {'$ne': True}}
+        if coll_name == 'projects':
+            doc_filter['_id'] = project_id
+        else:
+            doc_filter['project'] = project_id
+
+        log.debug(' - inspecting collection %r with filter %r', coll_name, doc_filter)
+        coll = db[coll_name]
+        for doc in coll.find(doc_filter):
+            for obj_id in find_object_ids(doc):
+                # Found an Object ID that is in use, so discard it from our set of file IDs.
+                file_ids.discard(obj_id)
+
+    orphan_count = len(file_ids)
+    log.info('Project %s has %d files of which %d are orphaned (%d%%)',
+             project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)
+
+    return file_ids
+
+
+@manager_maintenance.command
+@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
+                            help='Project URL, use "all" to check all projects')
+def find_orphan_files(proj_url):
+    """Finds unused files in the given project.
+
+    This is a heavy operation that inspects *everything* in MongoDB. Use with care.
+
+    Logs the total size of the orphan files, and the overall file counts when
+    proj_url is 'all'. Returns 1 when the project URL cannot be found.
+    """
+    from jinja2.filters import do_filesizeformat
+
+    projects_coll = current_app.db('projects')
+    files_coll = current_app.db('files')
+
+    if proj_url == 'all':
+        log.warning('Iterating over ALL projects, may take a while')
+        orphans = set()
+        # Select non-deleted projects the same way _find_orphan_files() filters
+        # documents, so that projects without a '_deleted' key are included too.
+        for project in projects_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1}):
+            proj_orphans = _find_orphan_files(project['_id'])
+            orphans.update(proj_orphans)
+    else:
+        project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
+        if not project:
+            log.error('Project url=%r not found', proj_url)
+            return 1
+
+        orphans = _find_orphan_files(project['_id'])
+
+    aggr = files_coll.aggregate([
+        {'$match': {'_id': {'$in': list(orphans)}}},
+        {'$group': {
+            '_id': None,
+            'size': {'$sum': '$length_aggregate_in_bytes'},
+        }}
+    ])
+
+    # The aggregation yields no documents at all when there are no orphans.
+    results = list(aggr)
+    total_size = results[0]['size'] if results else 0
+    log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
+    if proj_url == 'all':
+        orphan_count = len(orphans)
+        total_count = files_coll.count()
+        log.info('Total nr of orphan files: %d', orphan_count)
+        log.info('Total nr of files : %d', total_count)
+        # An empty files collection must not cause a division by zero.
+        percentage = 100 * orphan_count / total_count if total_count else 0
+        log.info('Orphan percentage : %d%%', percentage)
diff --git a/tests/test_orphan_files.py b/tests/test_orphan_files.py
new file mode 100644
index 00000000..9ecfbb19
--- /dev/null
+++ b/tests/test_orphan_files.py
@@ -0,0 +1,96 @@
+import collections
+import datetime
+
+from bson import ObjectId, tz_util
+from pymongo.results import UpdateResult
+
+from pillar.tests import AbstractPillarTest
+
+
+class OrphanFilesTest(AbstractPillarTest):
+    def test_find_orphan_files(self):
+        self.enter_app_context()
+
+        public1, _ = self.create_project_with_admin(
+            24 * 'a', project_overrides={'_id': ObjectId(), 'is_private': False})
+        public2, _ = self.create_project_with_admin(
+            24 * 'b', project_overrides={'_id': ObjectId(), 'is_private': False})
+        private1, _ = self.create_project_with_admin(
+            24 * 'c', project_overrides={'_id': ObjectId(), 'is_private': True})
+        private2, _ = self.create_project_with_admin(
+            24 * 'd', project_overrides={'_id': ObjectId(), 'is_private': None})
+        self.assertEqual(4, self.app.db('projects').count())
+
+        # Create files, some orphan and some used.
+        project_ids = (public1, public2, private1, private2)
+        file_ids = collections.defaultdict(list)
+        for pidx, pid in enumerate(project_ids):
+            for filenum in range(5):
+                generated_file_id = ObjectId(f'{pidx}{filenum}' + 22 * 'a')
+                file_id, _ = self.ensure_file_exists({
+                    '_id': generated_file_id,
+                    'project': pid,
+                    'name': f'Test file p{pid} num {filenum}'
+                })
+                file_ids[pid].append(file_id)
+
+        proj_coll = self.app.db('projects')
+        for pid in project_ids:
+            fids = file_ids[pid]
+
+            # Use fids[4] as project image
+            res: UpdateResult = proj_coll.update_one({'_id': pid},
+                                                     {'$set': {'picture': fids[4]}})
+            self.assertEqual(1, res.matched_count)
+            self.assertEqual(1, res.modified_count)
+
+            # Asset linking directly to fids[0]
+            self.create_node({
+                '_id': ObjectId(),
+                'project': pid,
+                'picture': ObjectId('572761f39837730efe8e1210'),
+                'description': '',
+                'node_type': 'asset',
+                'user': ObjectId(24 * 'a'),
+                'properties': {
+                    'status': 'published',
+                    'content_type': 'image',
+                    'file': fids[0],
+                },
+                'name': 'Image direct link',
+                '_updated': datetime.datetime(2016, 5, 2, 14, 19, 58, 0, tzinfo=tz_util.utc),
+                '_created': datetime.datetime(2016, 5, 2, 14, 19, 37, 0, tzinfo=tz_util.utc),
+                '_etag': '6b8589b42c880e3626f43f3e82a5c5b946742687'
+            })
+            # Some other node type that has some random field pointing to fids[1].
+            self.create_node({
+                '_id': ObjectId(),
+                'project': pid,
+                'picture': ObjectId('572761f39837730efe8e1210'),
+                'description': '',
+                'node_type': 'totally-unknown',
+                'user': ObjectId(24 * 'a'),
+                'properties': {
+                    'status': 'published',
+                    'content_type': 'image',
+                    'file': fids[0],
+                    'random': {'field': [fids[1]]}
+                },
+                'name': 'Image random field',
+                '_updated': datetime.datetime(2016, 5, 2, 14, 19, 58, 0, tzinfo=tz_util.utc),
+                '_created': datetime.datetime(2016, 5, 2, 14, 19, 37, 0, tzinfo=tz_util.utc),
+                '_etag': '6b8589b42c880e3626f43f3e82a5c5b946742687'
+            })
+            # Completely unknown collection with document that points to fids[2]
+            unknown_coll = self.app.db('unknown')
+            unknown_coll.insert_one({
+                'project': pid,
+                'random': {'field': [fids[2]]}
+            })
+            # fids[3] is an orphan.
+
+        from pillar.cli.maintenance import _find_orphan_files
+
+        for pid in project_ids:
+            orphans = _find_orphan_files(pid)
+            self.assertEqual({file_ids[pid][3]}, orphans)