Added orphan file finder. Works per project, or pass 'all' to check all projects.

This is quite a heavy thing to run, since it goes over all files of a project, and then goes over every document in (almost) every collection which has a property 'project' that's set to the project ID. It then goes over every document to find all ObjectIDs and removes those from the set of file ObjectIDs for that project. The remaining ObjectIDs are considered orphans. This is a very thorough search, but it doesn't require any knowledge of the document and collection structure, so it should be future-proof.
2017-09-12 16:15:08 +02:00 · 2017-09-12 16:15:08 +02:00 · b1d69b2304
commit b1d69b2304
parent 9ac870e0a5
2 changed files with 211 additions and 1 deletions
--- a/pillar/cli/maintenance.py
+++ b/pillar/cli/maintenance.py
@ -1,13 +1,26 @@
 import copy
 import logging
 import typing
 import bson.tz_util
 import copy
 from bson import ObjectId
 from bson.errors import InvalidId
 from flask_script import Manager
 from pillar import current_app
 # Names of the MongoDB collections to skip when finding file references
 # (during orphan file detection).
 # Names can be added to this set from PillarExtension.setup_app().
 ORPHAN_FINDER_SKIP_COLLECTIONS = {
    # Skipping the files collection under the assumption that we have no files
    # referencing other files.
    'files',
    # Authentication tokens never refer to files, and it's a big collection, so
    # it is good to skip.
    'tokens',
 }
 log = logging.getLogger(__name__)
 manager_maintenance = Manager(
@ -518,3 +531,104 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
        return 3
    handle_project(proj)
 def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
     """Find all non-referenced files for the given project.

     First collects the IDs of all documents in the 'files' collection whose
     'project' equals the given ID. Then inspects the project document itself
     and, in every collection not listed in ORPHAN_FINDER_SKIP_COLLECTIONS,
     every non-deleted document whose 'project' key equals the given ID. Any
     ObjectId found anywhere inside those documents counts as a reference and
     is removed from the candidates.

     :param project_id: ID of the project to inspect.
     :returns: the set of file IDs that are not referenced anywhere; an empty
         set when the project has no files at all.
     """

     log.debug('Finding orphan files for project %s', project_id)

     # Get all file IDs that belong to this project.
     files_coll = current_app.db('files')
     cursor = files_coll.find({'project': project_id}, projection={'_id': 1})
     file_ids = {doc['_id'] for doc in cursor}
     if not file_ids:
         log.debug('Project %s has no files', project_id)
         return set()

     total_file_count = len(file_ids)
     log.debug('Project %s has %d files in total', project_id, total_file_count)

     def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
         """Recursively yield every ObjectId nested inside lists, sets, tuples and dicts."""
         if isinstance(something, bson.ObjectId):
             yield something
         elif isinstance(something, (list, set, tuple)):
             for item in something:
                 yield from find_object_ids(item)
         elif isinstance(something, dict):
             # Only the values are inspected; keys are not searched.
             for item in something.values():
                 yield from find_object_ids(item)

     # Find all references by iterating through the project itself and every document that has a
     # 'project' key set to this ObjectId.
     db = current_app.db()
     for coll_name in sorted(db.collection_names(include_system_collections=False)):
         if not file_ids:
             # Every file turned out to be referenced, so scanning the
             # remaining collections cannot change the outcome.
             break
         if coll_name in ORPHAN_FINDER_SKIP_COLLECTIONS:
             continue

         doc_filter = {'_deleted': {'$ne': True}}
         if coll_name == 'projects':
             doc_filter['_id'] = project_id
         else:
             doc_filter['project'] = project_id

         log.debug('   - inspecting collection %r with filter %r', coll_name, doc_filter)
         coll = db[coll_name]
         for doc in coll.find(doc_filter):
             # Any Object ID that is in use cannot be an orphan file, so drop it
             # from our set of candidate file IDs.
             file_ids.difference_update(find_object_ids(doc))
             if not file_ids:
                 break

     orphan_count = len(file_ids)
     # total_file_count is never zero here thanks to the early return above.
     log.info('Project %s has %d files of which %d are orphaned (%d%%)',
              project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)

     return file_ids
@manager_maintenance.command
@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
                            help='Project URL, use "all" to check all projects')
def find_orphan_files(proj_url):
    """Finds unused files in the given project.

    This is a heavy operation that inspects *everything* in MongoDB. Use with care.

    :param proj_url: URL of the project to check, or 'all' to check every
        project that is not marked as deleted.
    :returns: 1 when the project cannot be found, None otherwise.
    """
    from jinja2.filters import do_filesizeformat

    projects_coll = current_app.db('projects')
    files_coll = current_app.db('files')

    check_all = proj_url == 'all'
    if check_all:
        log.warning('Iterating over ALL projects, may take a while')
        orphans = set()
        # Use '$ne: True' rather than 'False' so that projects without any
        # '_deleted' field are included too; this matches the filter that
        # _find_orphan_files() applies to the documents it inspects.
        not_deleted = {'_deleted': {'$ne': True}}
        for project in projects_coll.find(not_deleted, projection={'_id': 1}):
            orphans.update(_find_orphan_files(project['_id']))
    else:
        project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
        if not project:
            log.error('Project url=%r not found', proj_url)
            return 1

        orphans = _find_orphan_files(project['_id'])

    aggr = files_coll.aggregate([
        {'$match': {'_id': {'$in': list(orphans)}}},
        {'$group': {
            '_id': None,
            'size': {'$sum': '$length_aggregate_in_bytes'},
        }}
    ])
    # When nothing matches (e.g. there are no orphans) the $group stage emits
    # no document at all, so sum over the result instead of indexing into it.
    total_size = sum(group['size'] for group in aggr)
    log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))

    if check_all:
        orphan_count = len(orphans)
        total_count = files_coll.count()
        # Guard against an empty files collection to avoid dividing by zero.
        percentage = 100 * orphan_count / total_count if total_count else 0
        log.info('Total nr of orphan files: %d', orphan_count)
        log.info('Total nr of files       : %d', total_count)
        log.info('Orphan percentage       : %d%%', percentage)
--- a/tests/test_orphan_files.py
+++ b/tests/test_orphan_files.py
@ -0,0 +1,96 @@
 import collections
 import datetime
 from bson import ObjectId, tz_util
 from pymongo.results import UpdateResult
 from pillar.tests import AbstractPillarTest
 class OrphanFilesTest(AbstractPillarTest):
     """Tests for the orphan file detection in pillar.cli.maintenance."""

     def test_find_orphan_files(self):
         """Each project gets five files; only the unreferenced one is reported.

         Per project the files are used as follows:
         index 0 via a node's 'properties.file', index 1 via a nested field of a
         node with an unknown type, index 2 via a document in an unknown
         collection, index 4 as the project picture. Index 3 is never referenced.
         """
         self.enter_app_context()

         # Two public and two private projects, each with its own admin user.
         project_ids = []
         for hex_digit, is_private in (('a', False), ('b', False), ('c', True), ('d', None)):
             created_id, _ = self.create_project_with_admin(
                 24 * hex_digit,
                 project_overrides={'_id': ObjectId(), 'is_private': is_private})
             project_ids.append(created_id)
         self.assertEqual(4, self.app.db('projects').count())

         # Create files, some orphan and some used.
         file_ids = {}
         for pidx, pid in enumerate(project_ids):
             ids_for_project = []
             for filenum in range(5):
                 file_doc = {
                     '_id': ObjectId(f'{pidx}{filenum}' + 22 * 'a'),
                     'project': pid,
                     'name': f'Test file p{pid} num {filenum}'
                 }
                 file_id, _ = self.ensure_file_exists(file_doc)
                 ids_for_project.append(file_id)
             file_ids[pid] = ids_for_project

         def node_doc(project_id, node_type, name, properties):
             """Build a node document; only the given fields vary between nodes."""
             timestamp_updated = datetime.datetime(2016, 5, 2, 14, 19, 58, 0, tzinfo=tz_util.utc)
             timestamp_created = datetime.datetime(2016, 5, 2, 14, 19, 37, 0, tzinfo=tz_util.utc)
             return {
                 '_id': ObjectId(),
                 'project': project_id,
                 'picture': ObjectId('572761f39837730efe8e1210'),
                 'description': '',
                 'node_type': node_type,
                 'user': ObjectId(24 * 'a'),
                 'properties': properties,
                 'name': name,
                 '_updated': timestamp_updated,
                 '_created': timestamp_created,
                 '_etag': '6b8589b42c880e3626f43f3e82a5c5b946742687'
             }

         proj_coll = self.app.db('projects')
         for pid in project_ids:
             first, second, third, _unused, last = file_ids[pid]

             # The last file serves as the project image.
             res: UpdateResult = proj_coll.update_one(
                 {'_id': pid}, {'$set': {'picture': last}})
             self.assertEqual(1, res.matched_count)
             self.assertEqual(1, res.modified_count)

             # Asset linking directly to the first file.
             self.create_node(node_doc(pid, 'asset', 'Image direct link', {
                 'status': 'published',
                 'content_type': 'image',
                 'file': first,
             }))

             # Some other node type that has some random field pointing to the second file.
             self.create_node(node_doc(pid, 'totally-unknown', 'Image random field', {
                 'status': 'published',
                 'content_type': 'image',
                 'file': first,
                 'random': {'field': [second]}
             }))

             # Completely unknown collection with a document that points to the third file.
             self.app.db('unknown').insert_one({
                 'project': pid,
                 'random': {'field': [third]}
             })

             # The fourth file (index 3) is referenced nowhere: it is the orphan.

         from pillar.cli.maintenance import _find_orphan_files

         for pid in project_ids:
             expected_orphans = {file_ids[pid][3]}
             self.assertEqual(expected_orphans, _find_orphan_files(pid))