From ae60cdd3d3270074debfb02f6c5f1096d6b1eea7 Mon Sep 17 00:00:00 2001 From: Oleg Komarov Date: Sun, 23 Jun 2024 23:30:47 +0200 Subject: [PATCH 1/4] wip --- files/signals.py | 4 ++-- files/tasks.py | 23 ++++++++++++---------- files/utils.py | 50 +++++++++++++++++++++++++++++++++++++++++++++--- requirements.txt | 1 + 4 files changed, 63 insertions(+), 15 deletions(-) diff --git a/files/signals.py b/files/signals.py index 88bdf616..17e08dfe 100644 --- a/files/signals.py +++ b/files/signals.py @@ -23,8 +23,8 @@ def _record_changes( def schedule_scan(file: files.models.File) -> None: """Schedule a scan of a given file.""" logger.info('Scheduling a scan for file pk=%s', file.pk) - verbose_name = f'clamdscan of "{file.source.name}"' - files.tasks.clamdscan(file_id=file.pk, creator=file, verbose_name=verbose_name) + verbose_name = f'scan of "{file.source.name}"' + files.tasks.scan_file(file_id=file.pk, creator=file, verbose_name=verbose_name) @receiver(post_save, sender=files.models.File) diff --git a/files/tasks.py b/files/tasks.py index 0052b5f8..8aae7347 100644 --- a/files/tasks.py +++ b/files/tasks.py @@ -12,21 +12,24 @@ logger = logging.getLogger(__name__) @background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING}) -def clamdscan(file_id: int): +def scan_file(file_id: int): """Run a scan of a given file and save its output as a FileValidation record.""" file = files.models.File.objects.get(pk=file_id) abs_path = os.path.join(settings.MEDIA_ROOT, file.source.path) - scan_status, scan_found = files.utils.run_clamdscan(abs_path) - logger.info('File pk=%s scanned: %s', file.pk, (scan_status, scan_found)) - scan_result = {'clamdscan': [scan_status, scan_found]} - is_ok = scan_status == 'OK' - file_validation, is_new = files.models.FileValidation.objects.get_or_create( + clamd_scan_status, clamd_scan_found = files.utils.run_clamdscan(abs_path) + logger.info('File pk=%s scanned by clamd: %s', file.pk, (clamd_scan_status, clamd_scan_found)) + scan_result = {'clamdscan': [clamd_scan_status, clamd_scan_found]} + is_ok = clamd_scan_status == 'OK' + if is_ok and (wheels := file.metadata.get('wheels', None)): + invalid_wheels = files.utils.validate_wheels(abs_path, wheels) + if invalid_wheels: + logger.info('File pk=%s has invalid wheels: %s', file.pk, invalid_wheels) + is_ok = False + scan_result['invalid_wheels'] = invalid_wheels + + files.models.FileValidation.objects.update_or_create( file=file, defaults={'results': scan_result, 'is_ok': is_ok} ) - if not is_new: - file_validation.results = scan_result - file_validation.is_ok = is_ok - file_validation.save(update_fields={'results', 'is_ok', 'date_modified'}) @background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING}) diff --git a/files/utils.py b/files/utils.py index 4a03d911..95e13c4e 100644 --- a/files/utils.py +++ b/files/utils.py @@ -11,6 +11,7 @@ import toml import typing import zipfile +from packaging.utils import InvalidWheelFilename, parse_wheel_filename from PIL import Image from django.conf import settings from django.core.files.storage import default_storage @@ -18,6 +19,7 @@ from ffmpeg import FFmpeg, FFmpegFileNotFound, FFmpegInvalidCommand, FFmpegError from lxml import etree import clamd import magic +import requests from constants.base import THUMBNAIL_FORMAT, THUMBNAIL_SIZES, THUMBNAIL_QUALITY @@ -29,6 +31,7 @@ FORBIDDEN_FILEPATHS = [ 'Thumbs.db', 'ehthumbs.db', ] +MANIFEST_NAME = 'blender_manifest.toml' MODULE_DIR = Path(__file__).resolve().parent THEME_SCHEMA = [] @@ -113,7 +116,6 @@ def read_manifest_from_zip(archive_path): └─ (...) ``` """ - manifest_name = 'blender_manifest.toml' error_codes = [] file_list = [] manifest_content = None @@ -127,10 +129,10 @@ def read_manifest_from_zip(archive_path): return None, error_codes file_list = myzip.namelist() - manifest_filepath = find_path_by_name(file_list, manifest_name) + manifest_filepath = find_path_by_name(file_list, MANIFEST_NAME) if manifest_filepath is None: - logger.info(f"File '{manifest_name}' not found in the archive.") + logger.info(f"File '{MANIFEST_NAME}' not found in the archive.") error_codes.append('missing_manifest_toml') return None, error_codes @@ -352,3 +354,45 @@ def extract_frame(source_path: str, output_path: str, at_time: str = '00:00:00.0 except (FFmpegError, FFmpegFileNotFound, FFmpegInvalidCommand) as e: logger.exception(f'Failed to extract a frame: {e.message}, {" ".join(ffmpeg.arguments)}') raise + + +def get_wheel_sha256_from_pypi(wheel_name, session): + try: + name, version, *_ = parse_wheel_filename(wheel_name) + except InvalidWheelFilename: + return (None, 'invalid wheel filename') + url = f'https://pypi.org/pypi/{name}/{version}/json' + r = session.get( + url, + headers={'User-Agent': 'extensions.blender.org '}, + timeout=10, + ) + if r.status_code == 404: + return (None, f'wheel not found: {url}') + if r.status_code >= 500: + raise Exception(f'{url} returned {r.status_code} error') + data = r.json() + for item in data.get('urls', []): + if item['filename'] == wheel_name and item['packagetype'] == 'bdist_wheel': + return (item['digests']['sha256'], None) + return (None, 'no matching $.urls item in json response') + + +def validate_wheels(archive_path, wheels): + results = {} + with zipfile.ZipFile(archive_path) as myzip: + manifest_filepath = find_path_by_name(myzip.namelist(), MANIFEST_NAME) + session = requests.Session() + for wheel in wheels: + wheel_path_in_archive = _canonical_path(wheel, manifest_filepath) + wheel_digest = None + with myzip.open(wheel_path_in_archive) as wheel_file: + wheel_digest = get_sha256(wheel_file) + wheel_name = os.path.basename(wheel) + pypi_digest, err = get_wheel_sha256_from_pypi(wheel_name, session) + if err: + results[wheel] = err + continue + if pypi_digest != wheel_digest: + results[wheel] = f'digest in archive={wheel_digest}, digest on pypi={pypi_digest}' + return results diff --git a/requirements.txt b/requirements.txt index cf5221fc..4b051e8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,6 +39,7 @@ maxminddb==2.2.0 mistune==2.0.4 multidict==6.0.2 oauthlib==3.2.0 +packaging==24.1 Pillow==9.2.0 python-ffmpeg==2.0.12 python-magic==0.4.27 -- 2.30.2 From c6ef475c00509d629dd57ffeefaf112f4e828eda Mon Sep 17 00:00:00 2001 From: Oleg Komarov Date: Tue, 9 Jul 2024 17:07:53 +0200 Subject: [PATCH 2/4] update wheel fetching code --- files/tasks.py | 5 ++--- files/utils.py | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/files/tasks.py b/files/tasks.py index 8aae7347..0d7dc379 100644 --- a/files/tasks.py +++ b/files/tasks.py @@ -20,9 +20,8 @@ def scan_file(file_id: int): logger.info('File pk=%s scanned by clamd: %s', file.pk, (clamd_scan_status, clamd_scan_found)) scan_result = {'clamdscan': [clamd_scan_status, clamd_scan_found]} is_ok = clamd_scan_status == 'OK' - if is_ok and (wheels := file.metadata.get('wheels', None)): - invalid_wheels = files.utils.validate_wheels(abs_path, wheels) - if invalid_wheels: + if is_ok and (wheels := files.utils.get_wheels_from_manifest(file.metadata)): + if invalid_wheels := files.utils.validate_wheels(abs_path, wheels): logger.info('File pk=%s has invalid wheels: %s', file.pk, invalid_wheels) is_ok = False scan_result['invalid_wheels'] = invalid_wheels diff --git a/files/utils.py b/files/utils.py index 30b1f049..54e32af2 100644 --- a/files/utils.py +++ b/files/utils.py @@ -171,6 +171,19 @@ def find_forbidden_filepaths(file_list): return result +def get_wheels_from_manifest(manifest): + wheels = None + if ( + 'build' in manifest + and 'generated' in manifest['build'] + and 'wheels' in manifest['build']['generated'] + ): + wheels = manifest['build']['generated']['wheels'] + else: + wheels = manifest.get('wheels') + return wheels + + def validate_file_list(toml_content, manifest_filepath, file_list): """Check the files in in the archive against manifest.""" error_codes = [] @@ -196,16 +209,7 @@ def validate_file_list(toml_content, manifest_filepath, file_list): init_filepath = find_exact_path(file_list, expected_init_path) if not init_filepath: error_codes.append('invalid_missing_init') - wheels = None - if ( - 'build' in toml_content - and 'generated' in toml_content['build'] - and 'wheels' in toml_content['build']['generated'] - ): - wheels = toml_content['build']['generated']['wheels'] - else: - wheels = toml_content.get('wheels') - if wheels: + if wheels := get_wheels_from_manifest(toml_content): for wheel in wheels: expected_wheel_path = _canonical_path(wheel, manifest_filepath) wheel_filepath = find_exact_path(file_list, expected_wheel_path) -- 2.30.2 From ad86cf5bec91ef75ac2131e0071c182e461ea7fb Mon Sep 17 00:00:00 2001 From: Oleg Komarov Date: Tue, 9 Jul 2024 18:20:13 +0200 Subject: [PATCH 3/4] renames in test --- files/tests/test_signals.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/files/tests/test_signals.py b/files/tests/test_signals.py index 9c176fc6..0a99370b 100644 --- a/files/tests/test_signals.py +++ b/files/tests/test_signals.py @@ -38,12 +38,12 @@ class FileScanTest(TestCase): # A background task should have been created task = Task.objects.created_by(creator=file).first() self.assertIsNotNone(task) - self.assertEqual(task.task_name, 'files.tasks.clamdscan') + self.assertEqual(task.task_name, 'files.tasks.scan_file') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') # Actually run the task as if by background runner task_args, task_kwargs = task.params() - files.tasks.clamdscan.task_function(*task_args, **task_kwargs) + files.tasks.scan_file.task_function(*task_args, **task_kwargs) file.refresh_from_db() self.assertFalse(file.validation.is_ok) @@ -68,12 +68,12 @@ class FileScanTest(TestCase): # A background task should have been created task = Task.objects.created_by(creator=file).first() self.assertIsNotNone(task) - self.assertEqual(task.task_name, 'files.tasks.clamdscan') + self.assertEqual(task.task_name, 'files.tasks.scan_file') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') # Actually run the task as if by background runner task_args, task_kwargs = task.params() - files.tasks.clamdscan.task_function(*task_args, **task_kwargs) + files.tasks.scan_file.task_function(*task_args, **task_kwargs) self.assertFalse(file.validation.is_ok) file.validation.refresh_from_db() @@ -95,12 +95,12 @@ class FileScanTest(TestCase): # A background task should have been created task = Task.objects.created_by(creator=file).first() self.assertIsNotNone(task) - self.assertEqual(task.task_name, 'files.tasks.clamdscan') + self.assertEqual(task.task_name, 'files.tasks.scan_file') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') # Actually run the task as if by background runner task_args, task_kwargs = task.params() - files.tasks.clamdscan.task_function(*task_args, **task_kwargs) + files.tasks.scan_file.task_function(*task_args, **task_kwargs) file.refresh_from_db() self.assertTrue(file.validation.is_ok) -- 2.30.2 From dc4fa3cef50c78ce7504b2dc80373bb1e948b20e Mon Sep 17 00:00:00 2001 From: Oleg Komarov Date: Tue, 9 Jul 2024 19:45:45 +0200 Subject: [PATCH 4/4] add test for validate_wheels --- files/tests/test_utils.py | 64 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/files/tests/test_utils.py b/files/tests/test_utils.py index 763ac284..b1a01d38 100644 --- a/files/tests/test_utils.py +++ b/files/tests/test_utils.py @@ -1,7 +1,10 @@ from pathlib import Path from unittest.mock import patch, ANY import dataclasses +import io +import os import tempfile +import zipfile from django.test import TestCase @@ -11,8 +14,10 @@ from files.utils import ( find_exact_path, find_path_by_name, get_thumbnail_upload_to, + get_wheels_from_manifest, make_thumbnails, validate_file_list, + validate_wheels, ) # Reusing test files from the extensions app @@ -290,3 +295,62 @@ class UtilsTest(TestCase): validate_file_list(test.toml_content, test.manifest_filepath, test.file_list), test.name, ) + + def test_get_wheels_from_manifest(self): + @dataclasses.dataclass + class TestParams: + name: str + toml_content: dict + expected: list + + for test in [ + TestParams( + name='no wheels', + toml_content={'type': 'add-on'}, + expected=None, + ), + TestParams( + name='top-level wheels', + toml_content={ + 'type': 'add-on', + 'wheels': ['./wheels/1.whl', './wheels/2.whl'], + }, + expected=['./wheels/1.whl', './wheels/2.whl'], + ), + TestParams( + name='build.generated wheels', + toml_content={ + 'type': 'add-on', + 'wheels': ['./wheels/1.whl', './wheels/2.whl'], + 'build': {'generated': {'wheels': ['./wheels/1.whl']}}, + }, + expected=['./wheels/1.whl'], + ), + ]: + with self.subTest(**dataclasses.asdict(test)): + self.assertEqual( + test.expected, + get_wheels_from_manifest(test.toml_content), + test.name, + ) + + @patch( + 'files.utils.get_wheel_sha256_from_pypi', + lambda _, __: ('blahblah', None), + ) + def test_validate_wheels(self): + buff = io.BytesIO() + with tempfile.TemporaryDirectory() as output_dir: + test_file_path = os.path.join(output_dir, 'test_file.zip') + with zipfile.ZipFile(buff, mode='w') as file: + file.writestr('blender_manifest.toml', b'wheels = ["wheels/1.whl"]') + file.writestr('wheels/1.whl', b'') + + with open(test_file_path, 'wb') as f: + f.write(buff.getvalue()) + + self.assertEqual( + validate_wheels(test_file_path, ['wheels/1.whl']).get('wheels/1.whl'), + 'digest in archive=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' + ', digest on pypi=blahblah', + ) -- 2.30.2