Orphan finder: drop the per-project finding

Finding orphans in a single pass over all projects is much faster than searching project by project, at the expense of a bit more RAM.
2017-09-13 17:27:32 +02:00
parent be6746f7ab
commit 3be47056a0
2 changed files with 18 additions and 47 deletions
--- a/pillar/cli/maintenance.py
+++ b/pillar/cli/maintenance.py
@@ -534,25 +534,24 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
    handle_project(proj)


-def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
+def _find_orphan_files() -> typing.Set[bson.ObjectId]:
    """Finds all non-referenced files for the given project.

    Returns an iterable of all orphan file IDs.
    """

-    log.debug('Finding orphan files for project %s', project_id)
+    log.debug('Finding orphan files')

    # Get all file IDs that belong to this project.
    files_coll = current_app.db('files')
-    file_filter = {'project': project_id, '_deleted': {'$ne': True}}
-    cursor = files_coll.find(file_filter, projection={'_id': 1})
+    cursor = files_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1})
    file_ids = {doc['_id'] for doc in cursor}
    if not file_ids:
-        log.debug('Project %s has no files', project_id)
+        log.debug('No files found')
        return set()

    total_file_count = len(file_ids)
-    log.debug('Project %s has %d files in total', project_id, total_file_count)
+    log.debug('Found %d files in total', total_file_count)

    def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
        if isinstance(something, bson.ObjectId):
@@ -572,11 +571,6 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
            continue

        doc_filter = {'_deleted': {'$ne': True}}
-        if coll_name == 'projects':
-            doc_filter['_id'] = project_id
-        else:
-            doc_filter['project'] = project_id
-
        log.debug('   - inspecting collection %r with filter %r', coll_name, doc_filter)
        coll = db[coll_name]
        for doc in coll.find(doc_filter):
@@ -585,16 +579,14 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
                file_ids.discard(obj_id)

    orphan_count = len(file_ids)
-    log.info('Project %s has %d files or which %d are orphaned (%d%%)',
-             project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)
+    log.info('Found %d files or which %d are orphaned (%d%%)',
+             total_file_count, orphan_count, 100 * orphan_count / total_file_count)

    return file_ids


@manager_maintenance.command
-@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
-                            help='Project URL, use "all" to check all projects')
-def find_orphan_files(proj_url):
+def find_orphan_files():
    """Finds unused files in the given project.

    This is a heavy operation that inspects *everything* in MongoDB. Use with care.
@@ -608,32 +600,12 @@ def find_orphan_files(proj_url):
        return 1

    start_timestamp = datetime.datetime.now()
-
-    projects_coll = current_app.db('projects')
-    files_coll = current_app.db('files')
-
-    if proj_url == 'all':
-        log.warning('Iterating over ALL projects, may take a while')
-        orphans = set()
-        try:
-            for project in projects_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1}):
-                proj_orphans = _find_orphan_files(project['_id'])
-                orphans.update(proj_orphans)
-        except KeyboardInterrupt:
-            log.warning('Keyboard interrupt received, stopping now '
-                        'and showing intermediary results.')
-    else:
-        project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
-        if not project:
-            log.error('Project url=%r not found', proj_url)
-            return 1
-
-        orphans = _find_orphan_files(project['_id'])
-
+    orphans = _find_orphan_files()
    if not orphans:
        log.info('No orphan files found, congratulations.')
        return 0

+    files_coll = current_app.db('files')
    aggr = files_coll.aggregate([
        {'$match': {'_id': {'$in': list(orphans)}}},
        {'$group': {
@@ -644,7 +616,6 @@ def find_orphan_files(proj_url):

    total_size = list(aggr)[0]['size']
    log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
-    if proj_url == 'all':
    orphan_count = len(orphans)
    total_count = files_coll.count()
    log.info('Total nr of orphan files: %d', orphan_count)
--- a/tests/test_orphan_files.py
+++ b/tests/test_orphan_files.py
@@ -91,6 +91,6 @@ class OrphanFilesTest(AbstractPillarTest):

        from pillar.cli.maintenance import _find_orphan_files

-        for pid in project_ids:
-            orphans = _find_orphan_files(pid)
-            self.assertEqual({file_ids[pid][3]}, orphans)
+        expect_orphans = {file_ids[pid][3] for pid in project_ids}
+        found_orphans = _find_orphan_files()
+        self.assertEqual(expect_orphans, found_orphans)