File scanning: validate wheel digests against pypi.org #199
@ -23,8 +23,8 @@ def _record_changes(
|
||||
def schedule_scan(file: files.models.File) -> None:
    """Schedule a background scan task for the given file.

    Creates a `files.tasks.scan_file` background task; the verbose name
    makes the task identifiable in the task queue admin.
    """
    logger.info('Scheduling a scan for file pk=%s', file.pk)
    verbose_name = f'scan of "{file.source.name}"'
    files.tasks.scan_file(file_id=file.pk, creator=file, verbose_name=verbose_name)
|
||||
|
||||
|
||||
@receiver(post_save, sender=files.models.File)
|
||||
|
@ -12,21 +12,23 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING})
def scan_file(file_id: int):
    """Run all scans of a given file and save the output as a FileValidation record.

    Runs clamdscan on the file and, when it is clean and its manifest
    declares Python wheels, validates each wheel's sha256 digest against
    pypi.org. The combined result is stored on the file's FileValidation
    record (created or updated in place).
    """
    file = files.models.File.objects.get(pk=file_id)
    abs_path = os.path.join(settings.MEDIA_ROOT, file.source.path)

    clamd_scan_status, clamd_scan_found = files.utils.run_clamdscan(abs_path)
    logger.info('File pk=%s scanned by clamd: %s', file.pk, (clamd_scan_status, clamd_scan_found))
    scan_result = {'clamdscan': [clamd_scan_status, clamd_scan_found]}
    is_ok = clamd_scan_status == 'OK'

    # Only check wheel digests if clamd found nothing and the manifest lists wheels.
    if is_ok and (wheels := files.utils.get_wheels_from_manifest(file.metadata)):
        if invalid_wheels := files.utils.validate_wheels(abs_path, wheels):
            logger.info('File pk=%s has invalid wheels: %s', file.pk, invalid_wheels)
            is_ok = False
            scan_result['invalid_wheels'] = invalid_wheels

    files.models.FileValidation.objects.update_or_create(
        file=file, defaults={'results': scan_result, 'is_ok': is_ok}
    )
|
||||
|
||||
|
||||
@background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING})
|
||||
|
@ -38,12 +38,12 @@ class FileScanTest(TestCase):
|
||||
# A background task should have been created
|
||||
task = Task.objects.created_by(creator=file).first()
|
||||
self.assertIsNotNone(task)
|
||||
self.assertEqual(task.task_name, 'files.tasks.clamdscan')
|
||||
self.assertEqual(task.task_name, 'files.tasks.scan_file')
|
||||
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
|
||||
|
||||
# Actually run the task as if by background runner
|
||||
task_args, task_kwargs = task.params()
|
||||
files.tasks.clamdscan.task_function(*task_args, **task_kwargs)
|
||||
files.tasks.scan_file.task_function(*task_args, **task_kwargs)
|
||||
|
||||
file.refresh_from_db()
|
||||
self.assertFalse(file.validation.is_ok)
|
||||
@ -68,12 +68,12 @@ class FileScanTest(TestCase):
|
||||
# A background task should have been created
|
||||
task = Task.objects.created_by(creator=file).first()
|
||||
self.assertIsNotNone(task)
|
||||
self.assertEqual(task.task_name, 'files.tasks.clamdscan')
|
||||
self.assertEqual(task.task_name, 'files.tasks.scan_file')
|
||||
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
|
||||
|
||||
# Actually run the task as if by background runner
|
||||
task_args, task_kwargs = task.params()
|
||||
files.tasks.clamdscan.task_function(*task_args, **task_kwargs)
|
||||
files.tasks.scan_file.task_function(*task_args, **task_kwargs)
|
||||
|
||||
self.assertFalse(file.validation.is_ok)
|
||||
file.validation.refresh_from_db()
|
||||
@ -95,12 +95,12 @@ class FileScanTest(TestCase):
|
||||
# A background task should have been created
|
||||
task = Task.objects.created_by(creator=file).first()
|
||||
self.assertIsNotNone(task)
|
||||
self.assertEqual(task.task_name, 'files.tasks.clamdscan')
|
||||
self.assertEqual(task.task_name, 'files.tasks.scan_file')
|
||||
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
|
||||
|
||||
# Actually run the task as if by background runner
|
||||
task_args, task_kwargs = task.params()
|
||||
files.tasks.clamdscan.task_function(*task_args, **task_kwargs)
|
||||
files.tasks.scan_file.task_function(*task_args, **task_kwargs)
|
||||
|
||||
file.refresh_from_db()
|
||||
self.assertTrue(file.validation.is_ok)
|
||||
|
@ -1,7 +1,10 @@
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, ANY
|
||||
import dataclasses
|
||||
import io
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
@ -11,8 +14,10 @@ from files.utils import (
|
||||
find_exact_path,
|
||||
find_path_by_name,
|
||||
get_thumbnail_upload_to,
|
||||
get_wheels_from_manifest,
|
||||
make_thumbnails,
|
||||
validate_file_list,
|
||||
validate_wheels,
|
||||
)
|
||||
|
||||
# Reusing test files from the extensions app
|
||||
@ -290,3 +295,62 @@ class UtilsTest(TestCase):
|
||||
validate_file_list(test.toml_content, test.manifest_filepath, test.file_list),
|
||||
test.name,
|
||||
)
|
||||
|
||||
def test_get_wheels_from_manifest(self):
    """Table-driven test of wheel extraction from manifest dicts."""

    @dataclasses.dataclass
    class TestParams:
        name: str
        toml_content: dict
        expected: list

    for test in [
        TestParams(
            name='no wheels',
            toml_content={'type': 'add-on'},
            expected=None,
        ),
        TestParams(
            name='top-level wheels',
            toml_content={
                'type': 'add-on',
                'wheels': ['./wheels/1.whl', './wheels/2.whl'],
            },
            expected=['./wheels/1.whl', './wheels/2.whl'],
        ),
        TestParams(
            # build.generated.wheels takes precedence over top-level wheels
            name='build.generated wheels',
            toml_content={
                'type': 'add-on',
                'wheels': ['./wheels/1.whl', './wheels/2.whl'],
                'build': {'generated': {'wheels': ['./wheels/1.whl']}},
            },
            expected=['./wheels/1.whl'],
        ),
    ]:
        with self.subTest(**dataclasses.asdict(test)):
            self.assertEqual(
                test.expected,
                get_wheels_from_manifest(test.toml_content),
                test.name,
            )
|
||||
|
||||
@patch(
    # Avoid hitting pypi.org: pretend the published digest is 'blahblah'.
    'files.utils.get_wheel_sha256_from_pypi',
    lambda _, __: ('blahblah', None),
)
def test_validate_wheels(self):
    """A wheel whose archive digest differs from pypi's is reported invalid."""
    buff = io.BytesIO()
    with tempfile.TemporaryDirectory() as output_dir:
        test_file_path = os.path.join(output_dir, 'test_file.zip')
        with zipfile.ZipFile(buff, mode='w') as file:
            file.writestr('blender_manifest.toml', b'wheels = ["wheels/1.whl"]')
            file.writestr('wheels/1.whl', b'')

        with open(test_file_path, 'wb') as f:
            f.write(buff.getvalue())

        self.assertEqual(
            validate_wheels(test_file_path, ['wheels/1.whl']).get('wheels/1.whl'),
            # sha256 of the empty wheel file vs the mocked pypi digest.
            'digest in archive=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
            ', digest on pypi=blahblah',
        )
|
||||
|
@ -11,6 +11,7 @@ import toml
|
||||
import typing
|
||||
import zipfile
|
||||
|
||||
from packaging.utils import InvalidWheelFilename, parse_wheel_filename
|
||||
from PIL import Image
|
||||
from django.conf import settings
|
||||
from django.core.files.storage import default_storage
|
||||
@ -18,6 +19,7 @@ from ffmpeg import FFmpeg, FFmpegFileNotFound, FFmpegInvalidCommand, FFmpegError
|
||||
from lxml import etree
|
||||
import clamd
|
||||
import magic
|
||||
import requests
|
||||
|
||||
from constants.base import THUMBNAIL_FORMAT, THUMBNAIL_SIZES, THUMBNAIL_QUALITY
|
||||
|
||||
@ -29,6 +31,7 @@ FORBIDDEN_FILEPATHS = [
|
||||
'Thumbs.db',
|
||||
'ehthumbs.db',
|
||||
]
|
||||
MANIFEST_NAME = 'blender_manifest.toml'
|
||||
MODULE_DIR = Path(__file__).resolve().parent
|
||||
THEME_SCHEMA = []
|
||||
|
||||
@ -113,7 +116,6 @@ def read_manifest_from_zip(archive_path):
|
||||
└─ (...)
|
||||
```
|
||||
"""
|
||||
manifest_name = 'blender_manifest.toml'
|
||||
error_codes = []
|
||||
file_list = []
|
||||
manifest_content = None
|
||||
@ -127,10 +129,10 @@ def read_manifest_from_zip(archive_path):
|
||||
return None, error_codes
|
||||
|
||||
file_list = myzip.namelist()
|
||||
manifest_filepath = find_path_by_name(file_list, manifest_name)
|
||||
manifest_filepath = find_path_by_name(file_list, MANIFEST_NAME)
|
||||
|
||||
if manifest_filepath is None:
|
||||
logger.info(f"File '{manifest_name}' not found in the archive.")
|
||||
logger.info(f"File '{MANIFEST_NAME}' not found in the archive.")
|
||||
error_codes.append('missing_manifest_toml')
|
||||
return None, error_codes
|
||||
|
||||
@ -169,6 +171,19 @@ def find_forbidden_filepaths(file_list):
|
||||
return result
|
||||
|
||||
|
||||
def get_wheels_from_manifest(manifest):
    """Return the list of wheel paths declared in a manifest dict.

    `build.generated.wheels` takes precedence over the top-level `wheels`
    key; returns None when the manifest declares no wheels at all.
    """
    if (
        'build' in manifest
        and 'generated' in manifest['build']
        and 'wheels' in manifest['build']['generated']
    ):
        return manifest['build']['generated']['wheels']
    return manifest.get('wheels')
||||
|
||||
|
||||
def validate_file_list(toml_content, manifest_filepath, file_list):
|
||||
"""Check the files in in the archive against manifest."""
|
||||
error_codes = []
|
||||
@ -194,16 +209,7 @@ def validate_file_list(toml_content, manifest_filepath, file_list):
|
||||
init_filepath = find_exact_path(file_list, expected_init_path)
|
||||
if not init_filepath:
|
||||
error_codes.append('invalid_missing_init')
|
||||
wheels = None
|
||||
if (
|
||||
'build' in toml_content
|
||||
and 'generated' in toml_content['build']
|
||||
and 'wheels' in toml_content['build']['generated']
|
||||
):
|
||||
wheels = toml_content['build']['generated']['wheels']
|
||||
else:
|
||||
wheels = toml_content.get('wheels')
|
||||
if wheels:
|
||||
if wheels := get_wheels_from_manifest(toml_content):
|
||||
for wheel in wheels:
|
||||
expected_wheel_path = _canonical_path(wheel, manifest_filepath)
|
||||
wheel_filepath = find_exact_path(file_list, expected_wheel_path)
|
||||
@ -363,3 +369,45 @@ def extract_frame(source_path: str, output_path: str, at_time: str = '00:00:00.0
|
||||
except (FFmpegError, FFmpegFileNotFound, FFmpegInvalidCommand) as e:
|
||||
logger.exception(f'Failed to extract a frame: {e.message}, {" ".join(ffmpeg.arguments)}')
|
||||
raise
|
||||
|
||||
|
||||
def get_wheel_sha256_from_pypi(wheel_name, session):
    """Look up the official sha256 digest of a wheel on pypi.org.

    Returns a (digest, error) tuple; exactly one of the two is None.
    Raises on a 5xx response — presumably so the calling background task
    fails and can be retried instead of recording a result (TODO confirm).
    """
    try:
        name, version, *_ = parse_wheel_filename(wheel_name)
    except InvalidWheelFilename:
        return (None, 'invalid wheel filename')
    url = f'https://pypi.org/pypi/{name}/{version}/json'
    r = session.get(
        url,
        headers={'User-Agent': 'extensions.blender.org <extensions@blender.org>'},
        timeout=10,
    )
    if r.status_code == 404:
        return (None, f'wheel not found: {url}')
    if r.status_code >= 500:
        raise Exception(f'{url} returned {r.status_code} error')
    data = r.json()
    # Find the release artifact describing this exact wheel file.
    for item in data.get('urls', []):
        if item['filename'] == wheel_name and item['packagetype'] == 'bdist_wheel':
            return (item['digests']['sha256'], None)
    return (None, 'no matching $.urls item in json response')
||||
|
||||
|
||||
def validate_wheels(archive_path, wheels):
    """Validate wheel files in an extension archive against pypi.org.

    For each wheel path declared in the manifest, compare the sha256 of
    the file inside the archive with the digest published on pypi.org.

    Returns a dict mapping wheel path to a human-readable error; an empty
    dict means every wheel checked out.
    """
    results = {}
    with zipfile.ZipFile(archive_path) as myzip:
        manifest_filepath = find_path_by_name(myzip.namelist(), MANIFEST_NAME)
        # Reuse one HTTP session (connection) for all pypi.org lookups.
        session = requests.Session()
        for wheel in wheels:
            # Wheel paths in the manifest are relative to the manifest file.
            wheel_path_in_archive = _canonical_path(wheel, manifest_filepath)
            with myzip.open(wheel_path_in_archive) as wheel_file:
                wheel_digest = get_sha256(wheel_file)
            wheel_name = os.path.basename(wheel)
            pypi_digest, err = get_wheel_sha256_from_pypi(wheel_name, session)
            if err:
                results[wheel] = err
                continue
            if pypi_digest != wheel_digest:
                results[wheel] = f'digest in archive={wheel_digest}, digest on pypi={pypi_digest}'
    return results
|
||||
|
@ -39,6 +39,7 @@ maxminddb==2.2.0
|
||||
mistune==2.0.4
|
||||
multidict==6.0.2
|
||||
oauthlib==3.2.0
|
||||
packaging==24.1
|
||||
Pillow==9.2.0
|
||||
python-ffmpeg==2.0.12
|
||||
python-magic==0.4.27
|
||||
|
Loading…
Reference in New Issue
Block a user