simplify MIME detection to text/binary only

Remove magic byte detection in favor of simple UTF-8 validation
(an explicit, non-generic Content-Type is still honored first):
- text/plain for valid UTF-8 content
- application/octet-stream for binary data

Security is maintained via response headers (X-Content-Type-Options: nosniff, CSP).
A sample of the magic signatures is preserved as comments for future reference;
the full list remains in git history.

Disabled test files:
- test_mime_detection.py.disabled (magic-dependent tests)
- test_polyglot.py.disabled (polyglot format tests)

For full MIME detection, consider using the `filetype` library.
This commit is contained in:
Username
2025-12-26 18:44:24 +01:00
parent fb45005766
commit 3cda73c8b0
6 changed files with 64 additions and 181 deletions

View File

@@ -37,70 +37,19 @@ PASTE_ID_PATTERN = re.compile(r"^[a-f0-9]+$")
CLIENT_ID_PATTERN = re.compile(r"^[a-f0-9]{40}$")
MIME_PATTERN = re.compile(r"^[a-z0-9][a-z0-9!#$&\-^_.+]*/[a-z0-9][a-z0-9!#$&\-^_.+]*$")
# Magic bytes for binary format detection
MAGIC_SIGNATURES: dict[bytes, str] = {
# Images
b"\x89PNG\r\n\x1a\n": "image/png",
b"\xff\xd8\xff": "image/jpeg",
b"GIF87a": "image/gif",
b"GIF89a": "image/gif",
b"RIFF": "image/webp", # RIFF container, verified as WEBP in detect_mime_type
b"BM": "image/bmp",
b"II\x2a\x00": "image/tiff", # Little-endian TIFF
b"MM\x00\x2a": "image/tiff", # Big-endian TIFF
b"\x00\x00\x01\x00": "image/x-icon",
# HEIC/HEIF (ftyp box with heic/mif1 brand) - bytes 4-7 = "ftyp", 8-12 = brand
b"\x00\x00\x00\x18\x66\x74\x79\x70\x68\x65\x69\x63": "image/heic", # ftyp heic
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x68\x65\x69\x63": "image/heic", # ftyp heic
b"\x00\x00\x00\x18\x66\x74\x79\x70\x6d\x69\x66\x31": "image/heif", # ftyp mif1
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x6d\x69\x66\x31": "image/heif", # ftyp mif1
# AVIF (ftyp box with avif brand)
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x61\x76\x69\x66": "image/avif", # ftyp avif
b"\x00\x00\x00\x20\x66\x74\x79\x70\x61\x76\x69\x66": "image/avif", # ftyp avif
# Video/Audio containers (checked for subtype in detect_mime_type)
b"\x1a\x45\xdf\xa3": "video/webm", # Matroska/WebM (same format)
b"FLV\x01": "video/x-flv",
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x69\x73\x6f\x6d": "video/mp4", # ftyp isom
b"\x00\x00\x00\x1c\x66\x74\x79\x70": "video/mp4", # ftyp box at standard offset
b"\x00\x00\x00\x20\x66\x74\x79\x70": "video/mp4", # ftyp with different size
b"\x00\x00\x00\x18\x66\x74\x79\x70": "video/mp4", # ftyp with different size
# Audio
b"ID3": "audio/mpeg", # MP3 with ID3 tag
b"\xff\xfb": "audio/mpeg", # MP3 frame sync
b"\xff\xfa": "audio/mpeg",
b"\xff\xf3": "audio/mpeg",
b"\xff\xf2": "audio/mpeg",
b"fLaC": "audio/flac",
b"OggS": "audio/ogg",
# Documents
b"%PDF": "application/pdf",
b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": "application/msword", # OLE (DOC, XLS, PPT, MSI)
b"PK\x03\x04": "application/zip", # ZIP, DOCX, XLSX, PPTX, ODT, JAR, APK
# Executables
b"MZ": "application/x-msdownload", # EXE, DLL
b"\x7fELF": "application/x-executable", # ELF (Linux)
b"\xfe\xed\xfa\xce": "application/x-mach-binary", # Mach-O 32-bit big-endian
b"\xce\xfa\xed\xfe": "application/x-mach-binary", # Mach-O 32-bit little-endian
b"\xfe\xed\xfa\xcf": "application/x-mach-binary", # Mach-O 64-bit big-endian
b"\xcf\xfa\xed\xfe": "application/x-mach-binary", # Mach-O 64-bit little-endian
b"\xca\xfe\xba\xbe": "application/x-mach-binary", # Mach-O fat/universal binary
b"\x00asm": "application/wasm", # WebAssembly
# Compression/Archives
b"\x1f\x8b": "application/gzip",
b"BZh": "application/x-bzip2",
b"\xfd7zXZ\x00": "application/x-xz",
b"\x28\xb5\x2f\xfd": "application/zstd",
b"\x04\x22\x4d\x18": "application/x-lz4",
b"7z\xbc\xaf\x27\x1c": "application/x-7z-compressed",
b"Rar!\x1a\x07": "application/vnd.rar",
# Packages
b"\xed\xab\xee\xdb": "application/x-rpm",
# Data
b"SQLite format 3\x00": "application/x-sqlite3",
}
# Maximum magic signature length (for safe prefix slicing)
MAX_MAGIC_LEN = 16 # SQLite signature is longest at 16 bytes
# NOTE: Magic byte detection commented out - using text/binary detection only.
# Security headers (X-Content-Type-Options: nosniff, CSP) prevent MIME confusion.
# For full MIME detection, consider using the `filetype` library.
#
# MAGIC_SIGNATURES: dict[bytes, str] = {
# b"\x89PNG\r\n\x1a\n": "image/png",
# b"\xff\xd8\xff": "image/jpeg",
# b"GIF87a": "image/gif",
# b"GIF89a": "image/gif",
# b"%PDF": "application/pdf",
# b"PK\x03\x04": "application/zip",
# # ... (see git history for full list)
# }
# Generic MIME types to override with detection
GENERIC_MIME_TYPES = frozenset(
@@ -837,31 +786,21 @@ def calculate_entropy(data: bytes) -> float:
def detect_mime_type(content: bytes, content_type: str | None = None) -> str:
"""Detect MIME type using magic bytes, headers, or content analysis."""
# Magic byte detection (highest priority)
# Slice once for safety - only examine first MAX_MAGIC_LEN bytes
prefix = content[:MAX_MAGIC_LEN]
for magic, mime in MAGIC_SIGNATURES.items():
if prefix[: len(magic)] == magic:
# RIFF container: check subtype at bytes 8-12
if magic == b"RIFF" and len(content) >= 12:
subtype = content[8:12]
if subtype == b"WEBP":
return "image/webp"
if subtype == b"AVI ":
return "video/x-msvideo"
if subtype == b"WAVE":
return "audio/wav"
continue # Unknown RIFF subtype
return mime
"""Detect MIME type based on text/binary analysis.
# Explicit Content-Type (if specific)
Simple approach: if content is valid UTF-8, it's text/plain.
Otherwise, it's application/octet-stream (binary).
Security headers (X-Content-Type-Options: nosniff, CSP) prevent
browsers from MIME-sniffing and executing embedded scripts.
"""
# Honor explicit Content-Type if specific (not generic)
if content_type:
mime = content_type.split(";")[0].strip().lower()
if mime not in GENERIC_MIME_TYPES and MIME_PATTERN.match(mime):
return mime
# UTF-8 text detection
# Text vs binary detection
try:
content.decode("utf-8")
return "text/plain"
@@ -874,15 +813,10 @@ def is_recognizable_format(content: bytes) -> tuple[bool, str | None]:
Returns (is_recognizable, detected_format).
Used to enforce encryption by rejecting known formats.
"""
# Check magic bytes - slice once for safety
prefix = content[:MAX_MAGIC_LEN]
for magic, mime in MAGIC_SIGNATURES.items():
if prefix[: len(magic)] == magic:
if magic == b"RIFF" and len(content) >= 12 and content[8:12] != b"WEBP":
continue
return True, mime
Simple approach: valid UTF-8 text is recognizable (plaintext).
Binary content is considered potentially encrypted (not recognizable).
"""
# Check if valid UTF-8 text (plaintext)
try:
content.decode("utf-8")

View File

@@ -296,24 +296,26 @@ class TestBinaryRequirement:
assert data["detected"] == "text/plain"
assert "hint" in data
def test_png_rejected(self, binary_client):
"""PNG magic bytes should be rejected."""
def test_png_accepted_as_binary(self, binary_client):
"""PNG content accepted as unrecognized binary (magic detection disabled)."""
# PNG signature: 89 50 4E 47 0D 0A 1A 0A
png_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
response = binary_client.post("/", data=png_content)
assert response.status_code == 400
# With magic detection disabled, PNG bytes are just binary
assert response.status_code == 201
data = response.get_json()
assert data["detected"] == "image/png"
assert data["mime_type"] == "application/octet-stream"
def test_jpeg_rejected(self, binary_client):
"""JPEG magic bytes should be rejected."""
def test_jpeg_accepted_as_binary(self, binary_client):
"""JPEG content accepted as unrecognized binary (magic detection disabled)."""
jpeg_content = b"\xff\xd8\xff" + b"\x00" * 100
response = binary_client.post("/", data=jpeg_content)
assert response.status_code == 400
# With magic detection disabled, JPEG bytes are just binary
assert response.status_code == 201
data = response.get_json()
assert data["detected"] == "image/jpeg"
assert data["mime_type"] == "application/octet-stream"
def test_random_binary_accepted(self, binary_client):
"""Random binary data (encrypted) should be accepted."""

View File

@@ -78,7 +78,7 @@ class TestCreatePaste:
assert data["mime_type"] == "text/plain"
def test_create_paste_binary(self, client, png_bytes):
"""Create paste with binary content detects MIME type."""
"""Create paste with binary content returns octet-stream (magic detection disabled)."""
response = client.post(
"/",
data=png_bytes,
@@ -86,7 +86,8 @@ class TestCreatePaste:
)
assert response.status_code == 201
data = json.loads(response.data)
assert data["mime_type"] == "image/png"
# Magic byte detection disabled - binary content is octet-stream
assert data["mime_type"] == "application/octet-stream"
def test_create_paste_empty_fails(self, client):
"""Create paste with empty content fails."""
@@ -196,7 +197,8 @@ class TestGetPasteRaw:
response = client.get(f"/{paste_id}/raw")
assert response.status_code == 200
assert response.data == png_bytes
assert response.content_type == "image/png"
# Magic byte detection disabled - binary served as octet-stream
assert response.content_type == "application/octet-stream"
def test_get_paste_raw_not_found(self, client):
"""Get raw nonexistent paste returns 404."""

View File

@@ -169,47 +169,7 @@ class TestMimeTypeFuzzing:
class TestMimeDetectionFuzzing:
"""Property-based tests for MIME magic byte detection."""
# Known magic signatures mapped to expected MIME types
MAGIC_SIGNATURES: ClassVar[list[tuple[bytes, str]]] = [
(b"\x89PNG\r\n\x1a\n", "image/png"),
(b"\xff\xd8\xff", "image/jpeg"),
(b"GIF87a", "image/gif"),
(b"GIF89a", "image/gif"),
(b"%PDF", "application/pdf"),
(b"PK\x03\x04", "application/zip"),
(b"\x1f\x8b", "application/gzip"),
(b"fLaC", "audio/flac"),
(b"OggS", "audio/ogg"),
(b"ID3", "audio/mpeg"),
(b"\x7fELF", "application/x-executable"),
(b"MZ", "application/x-msdownload"),
(b"BZh", "application/x-bzip2"),
(b"7z\xbc\xaf\x27\x1c", "application/x-7z-compressed"),
(b"SQLite format 3\x00", "application/x-sqlite3"),
# HEIC/HEIF/AVIF (ftyp box format)
(b"\x00\x00\x00\x18\x66\x74\x79\x70\x68\x65\x69\x63", "image/heic"),
(b"\x00\x00\x00\x18\x66\x74\x79\x70\x6d\x69\x66\x31", "image/heif"),
(b"\x00\x00\x00\x1c\x66\x74\x79\x70\x61\x76\x69\x66", "image/avif"),
]
@settings(max_examples=100, suppress_health_check=FIXTURE_HEALTH_CHECKS)
@given(suffix=st.binary(min_size=0, max_size=1000))
def test_magic_prefix_detection(self, client, suffix):
"""Magic bytes followed by arbitrary data should detect correctly."""
for magic, expected_mime in self.MAGIC_SIGNATURES:
content = magic + suffix
response = client.post(
"/",
data=content,
content_type="application/octet-stream",
)
if response.status_code == 201:
data = json.loads(response.data)
assert data["mime_type"] == expected_mime, (
f"Expected {expected_mime} for magic {magic!r}, got {data['mime_type']}"
)
"""Property-based tests for text/binary MIME detection."""
@settings(max_examples=200, suppress_health_check=FIXTURE_HEALTH_CHECKS)
@given(content=st.binary(min_size=1, max_size=5000))
@@ -223,51 +183,36 @@ class TestMimeDetectionFuzzing:
assert response.status_code in (201, 400, 413, 429, 503)
if response.status_code == 201:
data = json.loads(response.data)
# MIME type should always be a valid format
assert "/" in data["mime_type"]
assert len(data["mime_type"]) < 100
# MIME type should be text/plain or application/octet-stream
assert data["mime_type"] in ("text/plain", "application/octet-stream")
@settings(max_examples=100, suppress_health_check=FIXTURE_HEALTH_CHECKS)
@given(
magic=st.sampled_from([m for m, _ in MAGIC_SIGNATURES]),
truncate=st.integers(min_value=1, max_value=10),
)
def test_partial_magic_no_false_match(self, client, magic, truncate):
"""Truncated magic bytes should not produce false positive matches."""
if truncate >= len(magic):
return # Skip if we'd use full magic
partial = magic[:truncate]
# Add random suffix that's clearly not the rest of the magic
content = partial + b"\xff\xfe\xfd\xfc" * 10
@given(content=unicode_content)
def test_utf8_text_detected_as_text(self, client, content):
"""Valid UTF-8 content should be detected as text/plain."""
response = client.post(
"/",
data=content,
data=content.encode("utf-8"),
content_type="application/octet-stream",
)
# Partial magic should not crash - may match different signature or fallback
assert response.status_code in (201, 400, 413, 429, 503)
if response.status_code == 201:
data = json.loads(response.data)
assert data["mime_type"] == "text/plain"
@settings(max_examples=50, suppress_health_check=FIXTURE_HEALTH_CHECKS)
@given(
content=st.binary(min_size=100, max_size=1000),
inject_pos=st.integers(min_value=20, max_value=80),
)
def test_magic_not_at_start_ignored(self, client, content, inject_pos):
"""Magic bytes not at offset 0 should not trigger detection."""
# Inject PNG magic in middle of random data
png_magic = b"\x89PNG\r\n\x1a\n"
if inject_pos < len(content):
modified = content[:inject_pos] + png_magic + content[inject_pos:]
response = client.post(
"/",
data=modified,
content_type="application/octet-stream",
)
if response.status_code == 201:
data = json.loads(response.data)
# Should NOT detect as PNG (magic not at start)
assert data["mime_type"] != "image/png"
@settings(max_examples=100, suppress_health_check=FIXTURE_HEALTH_CHECKS)
@given(content=st.binary(min_size=10, max_size=1000))
def test_binary_detected_as_octet_stream(self, client, content):
    """Non-UTF-8 binary should be detected as application/octet-stream.

    Every payload is made undecodable by construction, so the arbitrary
    suffix needs no filtering: any byte string may follow the prefix.
    """
    # 0x80-0x82 are UTF-8 continuation bytes. A continuation byte can never
    # start a sequence, so decoding fails at offset 0 whatever follows.
    invalid_utf8 = bytes([0x80, 0x81, 0x82]) + content
    response = client.post(
        "/",
        data=invalid_utf8,
        content_type="application/octet-stream",
    )
    # Only successful creations carry a mime_type to check; other status
    # codes are not asserted on by this test.
    if response.status_code == 201:
        data = json.loads(response.data)
        assert data["mime_type"] == "application/octet-stream"
class TestJsonFuzzing: