Orphan finder: drop the per-project finding

Finding orphans across all projects in a single pass is much faster than searching per project, at the expense of a bit more RAM.
This commit is contained in:
Sybren A. Stüvel 2017-09-13 17:27:32 +02:00
parent be6746f7ab
commit 3be47056a0
2 changed files with 18 additions and 47 deletions

View File

@ -534,25 +534,24 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
handle_project(proj)
def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
def _find_orphan_files() -> typing.Set[bson.ObjectId]:
"""Finds all non-referenced files for the given project.
Returns an iterable of all orphan file IDs.
"""
log.debug('Finding orphan files for project %s', project_id)
log.debug('Finding orphan files')
# Get all file IDs that belong to this project.
files_coll = current_app.db('files')
file_filter = {'project': project_id, '_deleted': {'$ne': True}}
cursor = files_coll.find(file_filter, projection={'_id': 1})
cursor = files_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1})
file_ids = {doc['_id'] for doc in cursor}
if not file_ids:
log.debug('Project %s has no files', project_id)
log.debug('No files found')
return set()
total_file_count = len(file_ids)
log.debug('Project %s has %d files in total', project_id, total_file_count)
log.debug('Found %d files in total', total_file_count)
def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
if isinstance(something, bson.ObjectId):
@ -572,11 +571,6 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
continue
doc_filter = {'_deleted': {'$ne': True}}
if coll_name == 'projects':
doc_filter['_id'] = project_id
else:
doc_filter['project'] = project_id
log.debug(' - inspecting collection %r with filter %r', coll_name, doc_filter)
coll = db[coll_name]
for doc in coll.find(doc_filter):
@ -585,16 +579,14 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
file_ids.discard(obj_id)
orphan_count = len(file_ids)
log.info('Project %s has %d files or which %d are orphaned (%d%%)',
project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)
log.info('Found %d files or which %d are orphaned (%d%%)',
total_file_count, orphan_count, 100 * orphan_count / total_file_count)
return file_ids
@manager_maintenance.command
@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
help='Project URL, use "all" to check all projects')
def find_orphan_files(proj_url):
def find_orphan_files():
"""Finds unused files in the given project.
This is a heavy operation that inspects *everything* in MongoDB. Use with care.
@ -608,32 +600,12 @@ def find_orphan_files(proj_url):
return 1
start_timestamp = datetime.datetime.now()
projects_coll = current_app.db('projects')
files_coll = current_app.db('files')
if proj_url == 'all':
log.warning('Iterating over ALL projects, may take a while')
orphans = set()
try:
for project in projects_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1}):
proj_orphans = _find_orphan_files(project['_id'])
orphans.update(proj_orphans)
except KeyboardInterrupt:
log.warning('Keyboard interrupt received, stopping now '
'and showing intermediary results.')
else:
project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
if not project:
log.error('Project url=%r not found', proj_url)
return 1
orphans = _find_orphan_files(project['_id'])
orphans = _find_orphan_files()
if not orphans:
log.info('No orphan files found, congratulations.')
return 0
files_coll = current_app.db('files')
aggr = files_coll.aggregate([
{'$match': {'_id': {'$in': list(orphans)}}},
{'$group': {
@ -644,7 +616,6 @@ def find_orphan_files(proj_url):
total_size = list(aggr)[0]['size']
log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
if proj_url == 'all':
orphan_count = len(orphans)
total_count = files_coll.count()
log.info('Total nr of orphan files: %d', orphan_count)

View File

@ -91,6 +91,6 @@ class OrphanFilesTest(AbstractPillarTest):
from pillar.cli.maintenance import _find_orphan_files
for pid in project_ids:
orphans = _find_orphan_files(pid)
self.assertEqual({file_ids[pid][3]}, orphans)
expect_orphans = {file_ids[pid][3] for pid in project_ids}
found_orphans = _find_orphan_files()
self.assertEqual(expect_orphans, found_orphans)