File scanning: validate wheel digests against pypi.org #199

Merged
Oleg-Komarov merged 5 commits from validate-wheels into main 2024-07-11 10:45:24 +02:00
6 changed files with 146 additions and 31 deletions

View File

@ -23,8 +23,8 @@ def _record_changes(
def schedule_scan(file: files.models.File) -> None:
    """Schedule a scan of a given file.

    Queues the `files.tasks.scan_file` background task for the file, passing
    the file itself as the task's creator.
    """
    logger.info('Scheduling a scan for file pk=%s', file.pk)
    verbose_name = f'scan of "{file.source.name}"'
    files.tasks.scan_file(file_id=file.pk, creator=file, verbose_name=verbose_name)
@receiver(post_save, sender=files.models.File) @receiver(post_save, sender=files.models.File)

View File

@ -12,21 +12,23 @@ logger = logging.getLogger(__name__)
@background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING})
def scan_file(file_id: int):
    """Run a scan of a given file and save its output as a FileValidation record.

    The file is first scanned with clamd. Only when that scan reports 'OK' and
    the file's manifest metadata lists wheels are the wheels validated as well;
    any invalid wheels mark the file as not OK and are stored in the results.
    """
    file = files.models.File.objects.get(pk=file_id)
    abs_path = os.path.join(settings.MEDIA_ROOT, file.source.path)

    clamd_scan_status, clamd_scan_found = files.utils.run_clamdscan(abs_path)
    logger.info(
        'File pk=%s scanned by clamd: %s', file.pk, (clamd_scan_status, clamd_scan_found)
    )
    scan_result = {'clamdscan': [clamd_scan_status, clamd_scan_found]}
    is_ok = clamd_scan_status == 'OK'

    # Wheel validation is skipped when the clamd scan has already failed.
    if is_ok and (wheels := files.utils.get_wheels_from_manifest(file.metadata)):
        if invalid_wheels := files.utils.validate_wheels(abs_path, wheels):
            logger.info('File pk=%s has invalid wheels: %s', file.pk, invalid_wheels)
            is_ok = False
            scan_result['invalid_wheels'] = invalid_wheels

    # Create the validation record, or overwrite the results of a previous scan.
    files.models.FileValidation.objects.update_or_create(
        file=file, defaults={'results': scan_result, 'is_ok': is_ok}
    )
@background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING}) @background(schedule={'action': TaskSchedule.RESCHEDULE_EXISTING})

View File

@ -38,12 +38,12 @@ class FileScanTest(TestCase):
# A background task should have been created # A background task should have been created
task = Task.objects.created_by(creator=file).first() task = Task.objects.created_by(creator=file).first()
self.assertIsNotNone(task) self.assertIsNotNone(task)
self.assertEqual(task.task_name, 'files.tasks.clamdscan') self.assertEqual(task.task_name, 'files.tasks.scan_file')
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
# Actually run the task as if by background runner # Actually run the task as if by background runner
task_args, task_kwargs = task.params() task_args, task_kwargs = task.params()
files.tasks.clamdscan.task_function(*task_args, **task_kwargs) files.tasks.scan_file.task_function(*task_args, **task_kwargs)
file.refresh_from_db() file.refresh_from_db()
self.assertFalse(file.validation.is_ok) self.assertFalse(file.validation.is_ok)
@ -68,12 +68,12 @@ class FileScanTest(TestCase):
# A background task should have been created # A background task should have been created
task = Task.objects.created_by(creator=file).first() task = Task.objects.created_by(creator=file).first()
self.assertIsNotNone(task) self.assertIsNotNone(task)
self.assertEqual(task.task_name, 'files.tasks.clamdscan') self.assertEqual(task.task_name, 'files.tasks.scan_file')
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
# Actually run the task as if by background runner # Actually run the task as if by background runner
task_args, task_kwargs = task.params() task_args, task_kwargs = task.params()
files.tasks.clamdscan.task_function(*task_args, **task_kwargs) files.tasks.scan_file.task_function(*task_args, **task_kwargs)
self.assertFalse(file.validation.is_ok) self.assertFalse(file.validation.is_ok)
file.validation.refresh_from_db() file.validation.refresh_from_db()
@ -95,12 +95,12 @@ class FileScanTest(TestCase):
# A background task should have been created # A background task should have been created
task = Task.objects.created_by(creator=file).first() task = Task.objects.created_by(creator=file).first()
self.assertIsNotNone(task) self.assertIsNotNone(task)
self.assertEqual(task.task_name, 'files.tasks.clamdscan') self.assertEqual(task.task_name, 'files.tasks.scan_file')
self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]') self.assertEqual(task.task_params, f'[[], {{"file_id": {file.pk}}}]')
# Actually run the task as if by background runner # Actually run the task as if by background runner
task_args, task_kwargs = task.params() task_args, task_kwargs = task.params()
files.tasks.clamdscan.task_function(*task_args, **task_kwargs) files.tasks.scan_file.task_function(*task_args, **task_kwargs)
file.refresh_from_db() file.refresh_from_db()
self.assertTrue(file.validation.is_ok) self.assertTrue(file.validation.is_ok)

View File

@ -1,7 +1,10 @@
from pathlib import Path from pathlib import Path
from unittest.mock import patch, ANY from unittest.mock import patch, ANY
import dataclasses import dataclasses
import io
import os
import tempfile import tempfile
import zipfile
from django.test import TestCase from django.test import TestCase
@ -11,8 +14,10 @@ from files.utils import (
find_exact_path, find_exact_path,
find_path_by_name, find_path_by_name,
get_thumbnail_upload_to, get_thumbnail_upload_to,
get_wheels_from_manifest,
make_thumbnails, make_thumbnails,
validate_file_list, validate_file_list,
validate_wheels,
) )
# Reusing test files from the extensions app # Reusing test files from the extensions app
@ -290,3 +295,62 @@ class UtilsTest(TestCase):
validate_file_list(test.toml_content, test.manifest_filepath, test.file_list), validate_file_list(test.toml_content, test.manifest_filepath, test.file_list),
test.name, test.name,
) )
def test_get_wheels_from_manifest(self):
    """Check which manifest key the list of wheels is read from."""

    @dataclasses.dataclass
    class TestParams:
        name: str
        toml_content: dict
        expected: list

    cases = (
        # No wheels declared anywhere: nothing to return.
        TestParams(name='no wheels', toml_content={'type': 'add-on'}, expected=None),
        # Only the top-level key is present.
        TestParams(
            name='top-level wheels',
            toml_content={
                'type': 'add-on',
                'wheels': ['./wheels/1.whl', './wheels/2.whl'],
            },
            expected=['./wheels/1.whl', './wheels/2.whl'],
        ),
        # build.generated.wheels wins over the top-level key.
        TestParams(
            name='build.generated wheels',
            toml_content={
                'type': 'add-on',
                'wheels': ['./wheels/1.whl', './wheels/2.whl'],
                'build': {'generated': {'wheels': ['./wheels/1.whl']}},
            },
            expected=['./wheels/1.whl'],
        ),
    )
    for case in cases:
        with self.subTest(**dataclasses.asdict(case)):
            actual = get_wheels_from_manifest(case.toml_content)
            self.assertEqual(case.expected, actual, case.name)
@patch(
    'files.utils.get_wheel_sha256_from_pypi',
    lambda _, __: ('blahblah', None),
)
def test_validate_wheels(self):
    """A wheel whose digest differs from the (patched) pypi digest is reported."""
    # Build a minimal archive in memory: a manifest plus one empty wheel.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, mode='w') as zip_file:
        zip_file.writestr('blender_manifest.toml', b'wheels = ["wheels/1.whl"]')
        zip_file.writestr('wheels/1.whl', b'')

    # The archive digest below is the sha256 of the empty wheel written above.
    expected_error = (
        'digest in archive=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
        ', digest on pypi=blahblah'
    )

    with tempfile.TemporaryDirectory() as output_dir:
        test_file_path = os.path.join(output_dir, 'test_file.zip')
        with open(test_file_path, 'wb') as f:
            f.write(archive.getvalue())

        invalid_wheels = validate_wheels(test_file_path, ['wheels/1.whl'])
        self.assertEqual(invalid_wheels.get('wheels/1.whl'), expected_error)

View File

@ -11,6 +11,7 @@ import toml
import typing import typing
import zipfile import zipfile
from packaging.utils import InvalidWheelFilename, parse_wheel_filename
from PIL import Image from PIL import Image
from django.conf import settings from django.conf import settings
from django.core.files.storage import default_storage from django.core.files.storage import default_storage
@ -18,6 +19,7 @@ from ffmpeg import FFmpeg, FFmpegFileNotFound, FFmpegInvalidCommand, FFmpegError
from lxml import etree from lxml import etree
import clamd import clamd
import magic import magic
import requests
from constants.base import THUMBNAIL_FORMAT, THUMBNAIL_SIZES, THUMBNAIL_QUALITY from constants.base import THUMBNAIL_FORMAT, THUMBNAIL_SIZES, THUMBNAIL_QUALITY
@ -29,6 +31,7 @@ FORBIDDEN_FILEPATHS = [
'Thumbs.db', 'Thumbs.db',
'ehthumbs.db', 'ehthumbs.db',
] ]
MANIFEST_NAME = 'blender_manifest.toml'
MODULE_DIR = Path(__file__).resolve().parent MODULE_DIR = Path(__file__).resolve().parent
THEME_SCHEMA = [] THEME_SCHEMA = []
@ -113,7 +116,6 @@ def read_manifest_from_zip(archive_path):
(...) (...)
``` ```
""" """
manifest_name = 'blender_manifest.toml'
error_codes = [] error_codes = []
file_list = [] file_list = []
manifest_content = None manifest_content = None
@ -127,10 +129,10 @@ def read_manifest_from_zip(archive_path):
return None, error_codes return None, error_codes
file_list = myzip.namelist() file_list = myzip.namelist()
manifest_filepath = find_path_by_name(file_list, manifest_name) manifest_filepath = find_path_by_name(file_list, MANIFEST_NAME)
if manifest_filepath is None: if manifest_filepath is None:
logger.info(f"File '{manifest_name}' not found in the archive.") logger.info(f"File '{MANIFEST_NAME}' not found in the archive.")
error_codes.append('missing_manifest_toml') error_codes.append('missing_manifest_toml')
return None, error_codes return None, error_codes
@ -169,6 +171,19 @@ def find_forbidden_filepaths(file_list):
return result return result
def get_wheels_from_manifest(manifest):
    """Return the wheels declared in a parsed manifest.

    The value under ``build.generated.wheels`` takes precedence whenever that
    key exists (even if its value is empty); otherwise the top-level ``wheels``
    value is used.

    :param manifest: parsed manifest content, a dict-like mapping.
    :return: the value of the winning ``wheels`` key, or ``None`` when the
        manifest declares no wheels.
    """
    # Walk down one section at a time; a missing section is treated as empty,
    # which makes the final membership test fail and triggers the fallback.
    build_section = manifest['build'] if 'build' in manifest else {}
    generated_section = build_section['generated'] if 'generated' in build_section else {}
    if 'wheels' in generated_section:
        return generated_section['wheels']
    return manifest.get('wheels')
def validate_file_list(toml_content, manifest_filepath, file_list): def validate_file_list(toml_content, manifest_filepath, file_list):
"""Check the files in in the archive against manifest.""" """Check the files in in the archive against manifest."""
error_codes = [] error_codes = []
@ -194,16 +209,7 @@ def validate_file_list(toml_content, manifest_filepath, file_list):
init_filepath = find_exact_path(file_list, expected_init_path) init_filepath = find_exact_path(file_list, expected_init_path)
if not init_filepath: if not init_filepath:
error_codes.append('invalid_missing_init') error_codes.append('invalid_missing_init')
wheels = None if wheels := get_wheels_from_manifest(toml_content):
if (
'build' in toml_content
and 'generated' in toml_content['build']
and 'wheels' in toml_content['build']['generated']
):
wheels = toml_content['build']['generated']['wheels']
else:
wheels = toml_content.get('wheels')
if wheels:
for wheel in wheels: for wheel in wheels:
expected_wheel_path = _canonical_path(wheel, manifest_filepath) expected_wheel_path = _canonical_path(wheel, manifest_filepath)
wheel_filepath = find_exact_path(file_list, expected_wheel_path) wheel_filepath = find_exact_path(file_list, expected_wheel_path)
@ -363,3 +369,45 @@ def extract_frame(source_path: str, output_path: str, at_time: str = '00:00:00.0
except (FFmpegError, FFmpegFileNotFound, FFmpegInvalidCommand) as e: except (FFmpegError, FFmpegFileNotFound, FFmpegInvalidCommand) as e:
logger.exception(f'Failed to extract a frame: {e.message}, {" ".join(ffmpeg.arguments)}') logger.exception(f'Failed to extract a frame: {e.message}, {" ".join(ffmpeg.arguments)}')
raise raise
def get_wheel_sha256_from_pypi(wheel_name, session):
    """Look up the sha256 digest of a wheel file via the pypi.org JSON API.

    :param wheel_name: file name of the wheel (no directory part).
    :param session: object with a ``requests.Session``-compatible ``get`` method.
    :return: a ``(digest, error)`` tuple; ``digest`` is ``None`` whenever
        ``error`` is set, and ``error`` is ``None`` on success.
    :raises Exception: when pypi.org answers with an error status other than
        404, so that the failure surfaces to the caller instead of being
        recorded as a problem with the wheel itself.
    """
    try:
        name, version, *_ = parse_wheel_filename(wheel_name)
    except InvalidWheelFilename:
        return (None, 'invalid wheel filename')

    url = f'https://pypi.org/pypi/{name}/{version}/json'
    r = session.get(
        url,
        headers={'User-Agent': 'extensions.blender.org <extensions@blender.org>'},
        timeout=10,
    )
    if r.status_code == 404:
        return (None, f'wheel not found: {url}')
    # Any other error status (rate limiting, forbidden, server errors) says nothing
    # about the wheel: fail loudly rather than parse an error body as release data.
    if r.status_code >= 400:
        raise Exception(f'{url} returned {r.status_code} error')

    data = r.json()
    for item in data.get('urls', []):
        if item.get('filename') == wheel_name and item.get('packagetype') == 'bdist_wheel':
            return (item['digests']['sha256'], None)
    return (None, 'no matching $.urls item in json response')
def validate_wheels(archive_path, wheels):
    """Compare digests of wheels bundled in an archive against pypi.org.

    :param archive_path: path to the zip archive containing the manifest and wheels.
    :param wheels: wheel paths to check; each is resolved inside the archive with
        ``_canonical_path`` against the manifest's path (presumably these are the
        paths as written in the manifest — confirm against callers).
    :return: a dict mapping each problematic wheel path to a description of the
        problem (lookup error or digest mismatch); empty when every wheel matches.
    """
    results = {}
    # The HTTP session is context-managed together with the archive so that its
    # pooled connections are released even when a lookup raises.
    with zipfile.ZipFile(archive_path) as myzip, requests.Session() as session:
        manifest_filepath = find_path_by_name(myzip.namelist(), MANIFEST_NAME)
        for wheel in wheels:
            wheel_path_in_archive = _canonical_path(wheel, manifest_filepath)
            with myzip.open(wheel_path_in_archive) as wheel_file:
                wheel_digest = get_sha256(wheel_file)

            # pypi is queried by file name only, regardless of where the wheel
            # is stored inside the archive.
            wheel_name = os.path.basename(wheel)
            pypi_digest, err = get_wheel_sha256_from_pypi(wheel_name, session)
            if err:
                results[wheel] = err
            elif pypi_digest != wheel_digest:
                results[wheel] = f'digest in archive={wheel_digest}, digest on pypi={pypi_digest}'
    return results

View File

@ -39,6 +39,7 @@ maxminddb==2.2.0
mistune==2.0.4 mistune==2.0.4
multidict==6.0.2 multidict==6.0.2
oauthlib==3.2.0 oauthlib==3.2.0
packaging==24.1
Pillow==9.2.0 Pillow==9.2.0
python-ffmpeg==2.0.12 python-ffmpeg==2.0.12
python-magic==0.4.27 python-magic==0.4.27