Orphan finder: drop the per-project finding
Finding orphans across all projects in a single pass is much faster than searching project by project, at the expense of a bit more RAM.
This commit is contained in:
parent
be6746f7ab
commit
3be47056a0
@ -534,25 +534,24 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
|
|||||||
handle_project(proj)
|
handle_project(proj)
|
||||||
|
|
||||||
|
|
||||||
def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
|
def _find_orphan_files() -> typing.Set[bson.ObjectId]:
|
||||||
"""Finds all non-referenced files for the given project.
|
"""Finds all non-referenced files for the given project.
|
||||||
|
|
||||||
Returns an iterable of all orphan file IDs.
|
Returns an iterable of all orphan file IDs.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
log.debug('Finding orphan files for project %s', project_id)
|
log.debug('Finding orphan files')
|
||||||
|
|
||||||
# Get all file IDs that belong to this project.
|
# Get all file IDs that belong to this project.
|
||||||
files_coll = current_app.db('files')
|
files_coll = current_app.db('files')
|
||||||
file_filter = {'project': project_id, '_deleted': {'$ne': True}}
|
cursor = files_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1})
|
||||||
cursor = files_coll.find(file_filter, projection={'_id': 1})
|
|
||||||
file_ids = {doc['_id'] for doc in cursor}
|
file_ids = {doc['_id'] for doc in cursor}
|
||||||
if not file_ids:
|
if not file_ids:
|
||||||
log.debug('Project %s has no files', project_id)
|
log.debug('No files found')
|
||||||
return set()
|
return set()
|
||||||
|
|
||||||
total_file_count = len(file_ids)
|
total_file_count = len(file_ids)
|
||||||
log.debug('Project %s has %d files in total', project_id, total_file_count)
|
log.debug('Found %d files in total', total_file_count)
|
||||||
|
|
||||||
def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
|
def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
|
||||||
if isinstance(something, bson.ObjectId):
|
if isinstance(something, bson.ObjectId):
|
||||||
@ -572,11 +571,6 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
doc_filter = {'_deleted': {'$ne': True}}
|
doc_filter = {'_deleted': {'$ne': True}}
|
||||||
if coll_name == 'projects':
|
|
||||||
doc_filter['_id'] = project_id
|
|
||||||
else:
|
|
||||||
doc_filter['project'] = project_id
|
|
||||||
|
|
||||||
log.debug(' - inspecting collection %r with filter %r', coll_name, doc_filter)
|
log.debug(' - inspecting collection %r with filter %r', coll_name, doc_filter)
|
||||||
coll = db[coll_name]
|
coll = db[coll_name]
|
||||||
for doc in coll.find(doc_filter):
|
for doc in coll.find(doc_filter):
|
||||||
@ -585,16 +579,14 @@ def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
|
|||||||
file_ids.discard(obj_id)
|
file_ids.discard(obj_id)
|
||||||
|
|
||||||
orphan_count = len(file_ids)
|
orphan_count = len(file_ids)
|
||||||
log.info('Project %s has %d files or which %d are orphaned (%d%%)',
|
log.info('Found %d files or which %d are orphaned (%d%%)',
|
||||||
project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)
|
total_file_count, orphan_count, 100 * orphan_count / total_file_count)
|
||||||
|
|
||||||
return file_ids
|
return file_ids
|
||||||
|
|
||||||
|
|
||||||
@manager_maintenance.command
|
@manager_maintenance.command
|
||||||
@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
|
def find_orphan_files():
|
||||||
help='Project URL, use "all" to check all projects')
|
|
||||||
def find_orphan_files(proj_url):
|
|
||||||
"""Finds unused files in the given project.
|
"""Finds unused files in the given project.
|
||||||
|
|
||||||
This is a heavy operation that inspects *everything* in MongoDB. Use with care.
|
This is a heavy operation that inspects *everything* in MongoDB. Use with care.
|
||||||
@ -608,32 +600,12 @@ def find_orphan_files(proj_url):
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
start_timestamp = datetime.datetime.now()
|
start_timestamp = datetime.datetime.now()
|
||||||
|
orphans = _find_orphan_files()
|
||||||
projects_coll = current_app.db('projects')
|
|
||||||
files_coll = current_app.db('files')
|
|
||||||
|
|
||||||
if proj_url == 'all':
|
|
||||||
log.warning('Iterating over ALL projects, may take a while')
|
|
||||||
orphans = set()
|
|
||||||
try:
|
|
||||||
for project in projects_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1}):
|
|
||||||
proj_orphans = _find_orphan_files(project['_id'])
|
|
||||||
orphans.update(proj_orphans)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
log.warning('Keyboard interrupt received, stopping now '
|
|
||||||
'and showing intermediary results.')
|
|
||||||
else:
|
|
||||||
project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
|
|
||||||
if not project:
|
|
||||||
log.error('Project url=%r not found', proj_url)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
orphans = _find_orphan_files(project['_id'])
|
|
||||||
|
|
||||||
if not orphans:
|
if not orphans:
|
||||||
log.info('No orphan files found, congratulations.')
|
log.info('No orphan files found, congratulations.')
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
files_coll = current_app.db('files')
|
||||||
aggr = files_coll.aggregate([
|
aggr = files_coll.aggregate([
|
||||||
{'$match': {'_id': {'$in': list(orphans)}}},
|
{'$match': {'_id': {'$in': list(orphans)}}},
|
||||||
{'$group': {
|
{'$group': {
|
||||||
@ -644,12 +616,11 @@ def find_orphan_files(proj_url):
|
|||||||
|
|
||||||
total_size = list(aggr)[0]['size']
|
total_size = list(aggr)[0]['size']
|
||||||
log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
|
log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
|
||||||
if proj_url == 'all':
|
orphan_count = len(orphans)
|
||||||
orphan_count = len(orphans)
|
total_count = files_coll.count()
|
||||||
total_count = files_coll.count()
|
log.info('Total nr of orphan files: %d', orphan_count)
|
||||||
log.info('Total nr of orphan files: %d', orphan_count)
|
log.info('Total nr of files : %d', total_count)
|
||||||
log.info('Total nr of files : %d', total_count)
|
log.info('Orphan percentage : %d%%', 100 * orphan_count / total_count)
|
||||||
log.info('Orphan percentage : %d%%', 100 * orphan_count / total_count)
|
|
||||||
|
|
||||||
end_timestamp = datetime.datetime.now()
|
end_timestamp = datetime.datetime.now()
|
||||||
duration = end_timestamp - start_timestamp
|
duration = end_timestamp - start_timestamp
|
||||||
|
@ -91,6 +91,6 @@ class OrphanFilesTest(AbstractPillarTest):
|
|||||||
|
|
||||||
from pillar.cli.maintenance import _find_orphan_files
|
from pillar.cli.maintenance import _find_orphan_files
|
||||||
|
|
||||||
for pid in project_ids:
|
expect_orphans = {file_ids[pid][3] for pid in project_ids}
|
||||||
orphans = _find_orphan_files(pid)
|
found_orphans = _find_orphan_files()
|
||||||
self.assertEqual({file_ids[pid][3]}, orphans)
|
self.assertEqual(expect_orphans, found_orphans)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user