Added orphan file finder. Works per project or pass 'all' for all projects.
This is quite a heavy thing to run: it collects all files of a project, then goes over every document in (almost) every collection that has a 'project' property set to the project ID. Every ObjectID found in those documents is removed from the set of file ObjectIDs for that project; the remaining ObjectIDs are considered orphans. This is a very thorough search, but it doesn't require any knowledge of the document and collection structure, so it should be future-proof.
This commit is contained in:
parent
9ac870e0a5
commit
b1d69b2304
@ -1,13 +1,26 @@
|
|||||||
|
import copy
|
||||||
import logging
|
import logging
|
||||||
|
import typing
|
||||||
|
|
||||||
import bson.tz_util
|
import bson.tz_util
|
||||||
import copy
|
|
||||||
from bson import ObjectId
|
from bson import ObjectId
|
||||||
from bson.errors import InvalidId
|
from bson.errors import InvalidId
|
||||||
from flask_script import Manager
|
from flask_script import Manager
|
||||||
|
|
||||||
from pillar import current_app
|
from pillar import current_app
|
||||||
|
|
||||||
|
# Names of the collections that are not scanned for file references while
# detecting orphan files. This is a plain mutable set so that extensions can
# add their own collection names from PillarExtension.setup_app().
ORPHAN_FINDER_SKIP_COLLECTIONS = {
    # Authentication tokens never refer to files; the collection is big, so
    # skipping it saves a lot of work.
    'tokens',

    # Files are assumed never to reference other files, so the files
    # collection itself is skipped as well.
    'files',
}
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
manager_maintenance = Manager(
|
manager_maintenance = Manager(
|
||||||
@ -518,3 +531,104 @@ def upgrade_attachment_schema(proj_url=None, all_projects=False):
|
|||||||
return 3
|
return 3
|
||||||
|
|
||||||
handle_project(proj)
|
handle_project(proj)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_orphan_files(project_id: bson.ObjectId) -> typing.Set[bson.ObjectId]:
    """Finds all non-referenced files for the given project.

    Starts with the IDs of all files that have 'project' set to the given
    project ID. Then every non-deleted document with that same 'project'
    value (plus the project document itself) in every collection not listed
    in ORPHAN_FINDER_SKIP_COLLECTIONS is searched for ObjectIds; each
    ObjectId found is removed from the set of file IDs.

    :param project_id: ID of the project to inspect.
    :return: the set of orphan file IDs; empty when the project has no files.
    """

    log.debug('Finding orphan files for project %s', project_id)

    # Get all file IDs that belong to this project.
    files_coll = current_app.db('files')
    cursor = files_coll.find({'project': project_id}, projection={'_id': 1})
    file_ids = {doc['_id'] for doc in cursor}
    if not file_ids:
        log.debug('Project %s has no files', project_id)
        return set()

    total_file_count = len(file_ids)
    log.debug('Project %s has %d files in total', project_id, total_file_count)

    def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
        """Recursively yields every ObjectId in nested lists/sets/tuples/dicts.

        Only dict values are inspected, not dict keys. Values of any other
        type yield nothing.
        """
        if isinstance(something, bson.ObjectId):
            yield something
        elif isinstance(something, (list, set, tuple)):
            for item in something:
                yield from find_object_ids(item)
        elif isinstance(something, dict):
            for item in something.values():
                yield from find_object_ids(item)

    # Find all references by iterating through the project itself and every document that has a
    # 'project' key set to this ObjectId.
    db = current_app.db()
    for coll_name in sorted(db.collection_names(include_system_collections=False)):
        if coll_name in ORPHAN_FINDER_SKIP_COLLECTIONS:
            continue

        if not file_ids:
            # Every file is already known to be referenced; inspecting more
            # collections cannot change the outcome.
            break

        doc_filter = {'_deleted': {'$ne': True}}
        if coll_name == 'projects':
            doc_filter['_id'] = project_id
        else:
            doc_filter['project'] = project_id

        log.debug('   - inspecting collection %r with filter %r', coll_name, doc_filter)
        coll = db[coll_name]
        for doc in coll.find(doc_filter):
            # Any Object ID found in the document is in use, so remove it from our set of
            # file IDs. IDs that are not file IDs are silently ignored.
            file_ids.difference_update(find_object_ids(doc))

    orphan_count = len(file_ids)
    # total_file_count is never zero here; the no-files case returned early.
    log.info('Project %s has %d files of which %d are orphaned (%d%%)',
             project_id, total_file_count, orphan_count, 100 * orphan_count / total_file_count)

    return file_ids
|
||||||
|
|
||||||
|
|
||||||
|
@manager_maintenance.command
@manager_maintenance.option('-p', '--project', dest='proj_url', nargs='?',
                            help='Project URL, use "all" to check all projects')
def find_orphan_files(proj_url):
    """Finds unused files in the given project.

    This is a heavy operation that inspects *everything* in MongoDB. Use with care.

    :param proj_url: URL of the project to check, or 'all' to check every
        non-deleted project.
    :return: 1 when the project URL cannot be found, None otherwise.
    """
    from jinja2.filters import do_filesizeformat

    projects_coll = current_app.db('projects')
    files_coll = current_app.db('files')

    if proj_url == 'all':
        log.warning('Iterating over ALL projects, may take a while')
        orphans = set()
        # Use '$ne: True' rather than 'False' so that projects without any
        # '_deleted' field are included too; this is the same filter that
        # _find_orphan_files() applies to the documents it inspects.
        not_deleted = {'_deleted': {'$ne': True}}
        for project in projects_coll.find(not_deleted, projection={'_id': 1}):
            proj_orphans = _find_orphan_files(project['_id'])
            orphans.update(proj_orphans)
    else:
        project = projects_coll.find_one({'url': proj_url}, projection={'_id': 1})
        if not project:
            log.error('Project url=%r not found', proj_url)
            return 1

        orphans = _find_orphan_files(project['_id'])

    aggr = files_coll.aggregate([
        {'$match': {'_id': {'$in': list(orphans)}}},
        {'$group': {
            '_id': None,
            'size': {'$sum': '$length_aggregate_in_bytes'},
        }}
    ])

    # When there are no orphans the $match stage selects nothing and $group
    # produces no result document at all, so fall back to a size of zero
    # instead of indexing into an empty result.
    total_size = next((doc['size'] for doc in aggr), 0)
    log.info('Total orphan file size: %s', do_filesizeformat(total_size, binary=True))
    if proj_url == 'all':
        orphan_count = len(orphans)
        total_count = files_coll.count()
        # Guard against an empty files collection, which would otherwise
        # cause a division by zero.
        percentage = 100 * orphan_count / total_count if total_count else 0
        log.info('Total nr of orphan files: %d', orphan_count)
        log.info('Total nr of files : %d', total_count)
        log.info('Orphan percentage : %d%%', percentage)
|
||||||
|
96
tests/test_orphan_files.py
Normal file
96
tests/test_orphan_files.py
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
import collections
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
from bson import ObjectId, tz_util
|
||||||
|
from pymongo.results import UpdateResult
|
||||||
|
|
||||||
|
from pillar.tests import AbstractPillarTest
|
||||||
|
|
||||||
|
|
||||||
|
class OrphanFilesTest(AbstractPillarTest):
    """Tests for the orphan file finder in pillar.cli.maintenance."""

    def test_find_orphan_files(self):
        """Of five files per project, only the unreferenced one is an orphan.

        Four of the files are referenced from the project document, from
        nodes, and from a document in an arbitrary unknown collection.
        """
        self.enter_app_context()

        # Two public and two private projects, each with its own admin user.
        project_specs = (('a', False), ('b', False), ('c', True), ('d', None))
        project_ids = tuple(
            self.create_project_with_admin(
                24 * user_char,
                project_overrides={'_id': ObjectId(), 'is_private': is_private})[0]
            for user_char, is_private in project_specs
        )
        self.assertEqual(4, self.app.db('projects').count())

        # Create files, some orphan and some used.
        file_ids = {}
        for pidx, pid in enumerate(project_ids):
            file_ids[pid] = [
                self.ensure_file_exists({
                    '_id': ObjectId(f'{pidx}{filenum}' + 22 * 'a'),
                    'project': pid,
                    'name': f'Test file p{pid} num {filenum}'
                })[0]
                for filenum in range(5)
            ]

        updated = datetime.datetime(2016, 5, 2, 14, 19, 58, 0, tzinfo=tz_util.utc)
        created = datetime.datetime(2016, 5, 2, 14, 19, 37, 0, tzinfo=tz_util.utc)

        def node_doc(pid, node_type, name, extra_properties):
            """Builds a node document; only type, name and properties vary."""
            properties = {
                'status': 'published',
                'content_type': 'image',
            }
            properties.update(extra_properties)
            return {
                '_id': ObjectId(),
                'project': pid,
                'picture': ObjectId('572761f39837730efe8e1210'),
                'description': '',
                'node_type': node_type,
                'user': ObjectId(24 * 'a'),
                'properties': properties,
                'name': name,
                '_updated': updated,
                '_created': created,
                '_etag': '6b8589b42c880e3626f43f3e82a5c5b946742687'
            }

        proj_coll = self.app.db('projects')
        for pid in project_ids:
            fids = file_ids[pid]

            # Use fids[4] as project image
            res: UpdateResult = proj_coll.update_one(
                {'_id': pid}, {'$set': {'picture': fids[4]}})
            self.assertEqual(1, res.matched_count)
            self.assertEqual(1, res.modified_count)

            # Asset linking directly to fids[0]
            self.create_node(node_doc(
                pid, 'asset', 'Image direct link',
                {'file': fids[0]}))

            # Some other node type that has some random field pointing to fids[1].
            self.create_node(node_doc(
                pid, 'totally-unknown', 'Image random field',
                {'file': fids[0], 'random': {'field': [fids[1]]}}))

            # Completely unknown collection with document that points to fids[2]
            self.app.db('unknown').insert_one({
                'project': pid,
                'random': {'field': [fids[2]]}
            })

            # fids[3] is an orphan.

        from pillar.cli.maintenance import _find_orphan_files

        for pid in project_ids:
            expected_orphans = {file_ids[pid][3]}
            self.assertEqual(expected_orphans, _find_orphan_files(pid))
|
Loading…
x
Reference in New Issue
Block a user