Added two more maintenance cmds for finding & fixing projectless files
This is about fixing file documents that do not have a `project` key at all. Those were deleted by the `delete_projectless_files` management command and restored manually. These commands can fix those file documents properly, by checking which project they're referenced in, and setting their `project` property. Finding the references (`manage.py maintenance find_projects_for_files`) is a heavy operation as it inspects all nodes and all projects. This can be done offline on a cloned database, and the result stored in a JSON file. This JSON file can then be processed on the production server (`manage.py maintenance fix_projects_for_files /path/to/file.json --go`) to perform the fix.
This commit is contained in:
@@ -1,7 +1,9 @@
|
|||||||
|
import collections
|
||||||
import copy
|
import copy
|
||||||
import datetime
|
import datetime
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath, Path
|
||||||
import re
|
import re
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
@@ -14,7 +16,6 @@ import pymongo
|
|||||||
from pillar import current_app
|
from pillar import current_app
|
||||||
import pillar.api.utils
|
import pillar.api.utils
|
||||||
|
|
||||||
|
|
||||||
# Collections to skip when finding file references (during orphan file detection).
|
# Collections to skip when finding file references (during orphan file detection).
|
||||||
# This collection can be added to from PillarExtension.setup_app().
|
# This collection can be added to from PillarExtension.setup_app().
|
||||||
ORPHAN_FINDER_SKIP_COLLECTIONS = {
|
ORPHAN_FINDER_SKIP_COLLECTIONS = {
|
||||||
@@ -884,25 +885,12 @@ def _db_projects(proj_url: str, all_projects: bool, project_id='', *, go: bool)
|
|||||||
log.info('Command took %s', duration)
|
log.info('Command took %s', duration)
|
||||||
|
|
||||||
|
|
||||||
def _find_orphan_files() -> typing.Set[bson.ObjectId]:
|
|
||||||
"""Finds all non-referenced files for the given project.
|
|
||||||
|
|
||||||
Returns an iterable of all orphan file IDs.
|
|
||||||
"""
|
|
||||||
log.debug('Finding orphan files')
|
|
||||||
|
|
||||||
# Get all file IDs that belong to this project.
|
|
||||||
files_coll = current_app.db('files')
|
|
||||||
cursor = files_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1})
|
|
||||||
file_ids = {doc['_id'] for doc in cursor}
|
|
||||||
if not file_ids:
|
|
||||||
log.debug('No files found')
|
|
||||||
return set()
|
|
||||||
|
|
||||||
total_file_count = len(file_ids)
|
|
||||||
log.debug('Found %d files in total', total_file_count)
|
|
||||||
|
|
||||||
def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
|
def find_object_ids(something: typing.Any) -> typing.Iterable[bson.ObjectId]:
|
||||||
|
"""Generator, yields all ObjectIDs referenced by the given object.
|
||||||
|
|
||||||
|
Assumes 'something' comes from a MongoDB. This function wasn't made for
|
||||||
|
generic Python objects.
|
||||||
|
"""
|
||||||
if isinstance(something, bson.ObjectId):
|
if isinstance(something, bson.ObjectId):
|
||||||
yield something
|
yield something
|
||||||
elif isinstance(something, str) and len(something) == 24:
|
elif isinstance(something, str) and len(something) == 24:
|
||||||
@@ -915,9 +903,30 @@ def _find_orphan_files() -> typing.Set[bson.ObjectId]:
|
|||||||
for item in something:
|
for item in something:
|
||||||
yield from find_object_ids(item)
|
yield from find_object_ids(item)
|
||||||
elif isinstance(something, dict):
|
elif isinstance(something, dict):
|
||||||
|
for item in something.keys():
|
||||||
|
yield from find_object_ids(item)
|
||||||
for item in something.values():
|
for item in something.values():
|
||||||
yield from find_object_ids(item)
|
yield from find_object_ids(item)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_orphan_files() -> typing.Set[bson.ObjectId]:
|
||||||
|
"""Finds all non-referenced files.
|
||||||
|
|
||||||
|
Returns an iterable of all orphan file IDs.
|
||||||
|
"""
|
||||||
|
log.debug('Finding orphan files')
|
||||||
|
|
||||||
|
# Get all file IDs and make a set; we'll remove any referenced object ID later.
|
||||||
|
files_coll = current_app.db('files')
|
||||||
|
cursor = files_coll.find({'_deleted': {'$ne': True}}, projection={'_id': 1})
|
||||||
|
file_ids = {doc['_id'] for doc in cursor}
|
||||||
|
if not file_ids:
|
||||||
|
log.debug('No files found')
|
||||||
|
return set()
|
||||||
|
|
||||||
|
total_file_count = len(file_ids)
|
||||||
|
log.debug('Found %d files in total', total_file_count)
|
||||||
|
|
||||||
# Find all references by iterating through the project itself and every document that has a
|
# Find all references by iterating through the project itself and every document that has a
|
||||||
# 'project' key set to this ObjectId.
|
# 'project' key set to this ObjectId.
|
||||||
db = current_app.db()
|
db = current_app.db()
|
||||||
@@ -947,7 +956,6 @@ def find_orphan_files():
|
|||||||
This is a heavy operation that inspects *everything* in MongoDB. Use with care.
|
This is a heavy operation that inspects *everything* in MongoDB. Use with care.
|
||||||
"""
|
"""
|
||||||
from jinja2.filters import do_filesizeformat
|
from jinja2.filters import do_filesizeformat
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'orphan-files.txt'
|
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'orphan-files.txt'
|
||||||
if output_fpath.exists():
|
if output_fpath.exists():
|
||||||
@@ -993,7 +1001,6 @@ def delete_orphan_files():
|
|||||||
Use 'find_orphan_files' first to generate orphan-files.txt.
|
Use 'find_orphan_files' first to generate orphan-files.txt.
|
||||||
"""
|
"""
|
||||||
import pymongo.results
|
import pymongo.results
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'orphan-files.txt'
|
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'orphan-files.txt'
|
||||||
with output_fpath.open('r', encoding='ascii') as infile:
|
with output_fpath.open('r', encoding='ascii') as infile:
|
||||||
@@ -1032,7 +1039,6 @@ def find_video_files_without_duration():
|
|||||||
|
|
||||||
This is a heavy operation. Use with care.
|
This is a heavy operation. Use with care.
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'video_files_without_duration.txt'
|
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'video_files_without_duration.txt'
|
||||||
if output_fpath.exists():
|
if output_fpath.exists():
|
||||||
@@ -1071,7 +1077,6 @@ def find_video_nodes_without_duration():
|
|||||||
|
|
||||||
This is a heavy operation. Use with care.
|
This is a heavy operation. Use with care.
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'video_nodes_without_duration.txt'
|
output_fpath = Path(current_app.config['STORAGE_DIR']) / 'video_nodes_without_duration.txt'
|
||||||
if output_fpath.exists():
|
if output_fpath.exists():
|
||||||
@@ -1184,7 +1189,11 @@ def reconcile_node_video_duration(nodes_to_update=None, all_nodes=False, go=Fals
|
|||||||
@manager_maintenance.option('-g', '--go', dest='go', action='store_true', default=False,
|
@manager_maintenance.option('-g', '--go', dest='go', action='store_true', default=False,
|
||||||
help='Actually perform the changes (otherwise just show as dry-run).')
|
help='Actually perform the changes (otherwise just show as dry-run).')
|
||||||
def delete_projectless_files(go=False):
|
def delete_projectless_files(go=False):
|
||||||
"""Soft-deletes files of projects that have been deleted."""
|
"""Soft-deletes file documents of projects that have been deleted.
|
||||||
|
|
||||||
|
WARNING: this also soft-deletes file documents that do not have a project
|
||||||
|
property at all.
|
||||||
|
"""
|
||||||
|
|
||||||
start_timestamp = datetime.datetime.now()
|
start_timestamp = datetime.datetime.now()
|
||||||
|
|
||||||
@@ -1236,3 +1245,132 @@ def delete_projectless_files(go=False):
|
|||||||
else:
|
else:
|
||||||
verb = 'Finding'
|
verb = 'Finding'
|
||||||
log.info('%s orphans took %s', verb, duration)
|
log.info('%s orphans took %s', verb, duration)
|
||||||
|
|
||||||
|
|
||||||
|
@manager_maintenance.command
def find_projects_for_files():
    """For file documents without project, tries to find in which project files are used.

    This is a heavy operation that inspects *everything* in MongoDB. Use with care.

    Every project document and every node document is scanned with
    find_object_ids(). A project-less, non-deleted file that is referenced by
    a project document is mapped to that project; one that is referenced by a
    node is mapped to the node's project, provided that project exists. When a
    file is referenced more than once, the last reference found wins.

    The resulting {file ID: project ID} mapping is written as JSON to
    'files-without-project.json' in the configured STORAGE_DIR, so that it can
    be processed by the 'fix_projects_for_files' command.

    Returns 1 when the output file already exists (nothing is done in that
    case), and 0 otherwise.
    """

    output_fpath = Path(current_app.config['STORAGE_DIR']) / 'files-without-project.json'
    if output_fpath.exists():
        # Refuse to overwrite earlier results; check before doing the heavy work.
        log.error('Output filename %s already exists, remove it first.', output_fpath)
        return 1

    start_timestamp = datetime.datetime.now()

    log.info('Finding files to fix...')
    files_coll = current_app.db('files')
    query = {'project': {'$exists': False},
             '_deleted': {'$ne': True}}

    # Maps file ID to the project it was found in; None means 'not found (yet)'.
    # Only the IDs are used, so there is no need to fetch entire file documents.
    files_to_fix: typing.Dict[ObjectId, typing.Optional[ObjectId]] = {
        file_doc['_id']: None
        for file_doc in files_coll.find(query, projection={'_id': 1})
    }
    if not files_to_fix:
        log.info('No files without projects found, congratulations.')
        return 0

    def assign_referenced_files(doc, project_id):
        """Map every to-be-fixed file referenced by 'doc' to the given project."""
        for obj_id in find_object_ids(doc):
            if obj_id in files_to_fix:
                files_to_fix[obj_id] = project_id

    # Find all references by iterating through every node and project, and
    # hoping that they reference the file.
    projects_coll = current_app.db('projects')
    existing_projects: typing.MutableSet[ObjectId] = set()
    for doc in projects_coll.find():
        project_id = doc['_id']
        existing_projects.add(project_id)
        assign_referenced_files(doc, project_id)

    nodes_coll = current_app.db('nodes')
    for doc in nodes_coll.find():
        project_id = doc.get('project')
        if not project_id:
            log.warning('Skipping node %s, as it is not part of any project', doc['_id'])
            continue
        if project_id not in existing_projects:
            # Assigning files to a non-existing project would not fix anything.
            log.warning('Skipping node %s, as its project %s does not exist',
                        doc['_id'], project_id)
            continue
        assign_referenced_files(doc, project_id)

    orphans = {oid for oid, project_id in files_to_fix.items()
               if project_id is None}
    # String keys and values, as ObjectIds cannot be stored in JSON directly.
    fixable = {str(oid): str(project_id)
               for oid, project_id in files_to_fix.items()
               if project_id is not None}

    log.info('Total nr of orphan files : %d', len(orphans))
    log.info('Total nr of fixable files: %d', len(fixable))

    projects = set(fixable.values())
    log.info('Fixable project count    : %d', len(projects))
    for project_id in projects:
        # Only these three properties are logged, so only fetch those.
        project = projects_coll.find_one(
            ObjectId(project_id),
            projection={'name': 1, 'url': 1, '_created': 1})
        log.info('    - %40s /p/%-20s  created on %s, ',
                 project['name'], project['url'], project['_created'])

    end_timestamp = datetime.datetime.now()
    duration = end_timestamp - start_timestamp
    log.info('Finding projects took %s', duration)

    log.info('Writing {file_id: project_id} mapping to %s', output_fpath)
    with output_fpath.open('w', encoding='ascii') as outfile:
        json.dump(fixable, outfile, indent=4, sort_keys=True)

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
@manager_maintenance.option('filepath', type=Path,
                            help='JSON file produced by find_projects_for_files')
@manager_maintenance.option('-g', '--go', dest='go', action='store_true', default=False,
                            help='Actually perform the changes (otherwise just show as dry-run).')
def fix_projects_for_files(filepath: Path, go=False):
    """Assigns file documents to projects.

    Use `manage.py maintenance find_projects_for_files` to produce the JSON
    file that contains the file ID to project ID mapping.

    The JSON file is read from 'filepath'. When 'go' is True, the 'project'
    property of each listed file document is set to the mapped project ID.
    Otherwise this is a dry-run: the matching file documents are only counted
    and nothing is modified.
    """

    log.info('Loading %s', filepath)
    with filepath.open('r', encoding='ascii') as infile:
        mapping: typing.Mapping[str, str] = json.load(infile)

    # Group IDs per project for more efficient querying.
    log.info('Grouping per project')
    project_to_file_ids: typing.DefaultDict[ObjectId, typing.List[ObjectId]] = \
        collections.defaultdict(list)
    for file_id, project_id in mapping.items():
        project_to_file_ids[ObjectId(project_id)].append(ObjectId(file_id))

    # Stand-in for the result of update_many(), so that a dry-run can share
    # the counting and reporting code with a real run.
    MockUpdateResult = collections.namedtuple('MockUpdateResult', 'matched_count modified_count')

    files_coll = current_app.db('files')
    total_matched = total_modified = 0
    for project_oid, file_oids in project_to_file_ids.items():
        query = {'_id': {'$in': file_oids}}

        if go:
            result = files_coll.update_many(query, {'$set': {'project': project_oid}})
        else:
            # Dry-run: only count what would be matched, modify nothing.
            found = files_coll.count_documents(query)
            result = MockUpdateResult(found, 0)

        total_matched += result.matched_count
        total_modified += result.modified_count

        if result.matched_count != len(file_oids):
            # Some of the listed files were not found in the files collection.
            log.warning('Matched only %d of %d files; modified %d; for project %s',
                        result.matched_count, len(file_oids), result.modified_count, project_oid)
        else:
            log.info('Matched all %d files; modified %d; for project %s',
                     result.matched_count, result.modified_count, project_oid)

    if go:
        log.info('Done updating %d files (found %d, modified %d) on %d projects',
                 len(mapping), total_matched, total_modified, len(project_to_file_ids))
    else:
        # Don't claim that anything was updated when nothing was written.
        log.info('Dry-run: would update %d files (found %d) on %d projects; '
                 'use --go to actually perform the changes',
                 len(mapping), total_matched, len(project_to_file_ids))
|
||||||
|
Reference in New Issue
Block a user