add polyglot generator and MIME confusion tests

- polyglot_generator.py: creates files valid in multiple formats
- 41 new tests verify MIME detection handles polyglots correctly
- Document rate limiting behavior under attack
- Clarify DMG/ISO/DOCX detection limitations
This commit is contained in:
Username
2025-12-26 18:25:46 +01:00
parent 98694ba1cc
commit fb45005766
3 changed files with 463 additions and 5 deletions

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""Polyglot file generator for MIME confusion testing.
Creates files that are technically valid in multiple formats to test
that MIME detection correctly identifies the primary format based on
magic bytes at offset 0.
"""
import argparse
import sys
from pathlib import Path
# Magic byte signatures: the leading bytes that generate_polyglot() places at
# offset 0 of its output, keyed by primary format name.
SIGNATURES = {
"png": b"\x89PNG\r\n\x1a\n",  # 8-byte PNG file signature
"gif": b"GIF89a",  # GIF header, version 89a
"jpeg": b"\xff\xd8\xff\xe0\x00\x10JFIF",  # SOI + APP0 marker, length 16, "JFIF" tag
"pdf": b"%PDF-1.4\n",  # PDF 1.4 header line
"zip": b"PK\x03\x04",  # ZIP local file header signature
"gzip": b"\x1f\x8b\x08",  # gzip ID bytes + compression method 8 (deflate)
"elf": b"\x7fELF",  # ELF identification bytes
"pe": b"MZ",  # DOS "MZ" stub magic that begins PE executables
}
# Payloads that could be dangerous if executed, keyed by payload name.
# generate_polyglot() embeds one of these after the primary format's magic.
PAYLOADS = {
"html": b"<html><body><script>alert('XSS')</script></body></html>",  # HTML page with inline script
"js": b"/**/alert('XSS')//",  # script text; trailing // comments out whatever follows on the line
"php": b"<?php system($_GET['cmd']); ?>",  # passes the "cmd" query parameter to system()
"shell": b"#!/bin/sh\necho pwned\n",  # shebang script
"svg": b'<svg xmlns="http://www.w3.org/2000/svg"><script>alert(1)</script></svg>',  # SVG with inline script
}
def generate_polyglot(primary: str, payload: str, size: int = 1024) -> bytes:
    """Build a generic polyglot: format magic first, payload shortly after.

    The layout is ``magic + 32 NUL bytes + payload``. If that is shorter than
    ``size`` it is right-padded with NUL bytes; longer content is returned
    untouched (never truncated).

    Args:
        primary: Key of SIGNATURES selecting the magic bytes at offset 0.
        payload: Key of PAYLOADS selecting the embedded payload.
        size: Minimum length of the result in bytes.

    Returns:
        The assembled file content.

    Raises:
        ValueError: If ``primary`` or ``payload`` is not a known key.
    """
    # Table values are always bytes, so None reliably means "key missing".
    header = SIGNATURES.get(primary)
    if header is None:
        raise ValueError(f"Unknown primary format: {primary}")
    body = PAYLOADS.get(payload)
    if body is None:
        raise ValueError(f"Unknown payload type: {payload}")

    gap = bytes(32)  # NUL separator between the magic and the payload
    assembled = b"".join((header, gap, body))
    # ljust only pads when the content is shorter than the requested size.
    return assembled.ljust(size, b"\x00")
def generate_gif_js() -> bytes:
    """Return a 1x1 GIF89a byte stream with JavaScript text appended.

    The GIF part consists of the header, a logical screen descriptor without
    a global color table, one image descriptor, a single data sub-block and
    the trailer. The script text is simply concatenated after the trailer;
    the binary portion is emitted as-is in front of it.
    """
    one_px = (1).to_bytes(2, "little")  # 16-bit little-endian dimension of 1
    origin = bytes(2)  # 16-bit zero coordinate

    # width, height, packed flags (no global color table), background, aspect
    screen_descriptor = one_px + one_px + bytes(3)
    # separator, left, top, width, height, packed flags (no local color table)
    image_descriptor = b"\x2c" + origin + origin + one_px + one_px + b"\x00"
    # LZW minimum code size, one 1-byte sub-block, block terminator
    image_data = bytes([0x02, 0x01, 0x01, 0x00])
    trailer = b"\x3b"

    # JS payload after GIF (browsers may try to execute)
    script = b"/**/=1;alert('XSS')//"

    return b"".join(
        (
            b"GIF89a",
            screen_descriptor,
            image_descriptor,
            image_data,
            trailer,
            script,
        )
    )
def generate_pdf_js() -> bytes:
    """Generate a minimal PDF whose catalog has a JavaScript OpenAction.

    The document holds three indirect objects (catalog, empty page tree and
    the JavaScript action), followed by a classic cross-reference table and
    trailer. Byte offsets in the xref table and the ``startxref`` value are
    computed from the actual object positions, so the table stays correct
    if an object body is edited.

    Returns:
        The PDF file content, starting with ``%PDF-1.4``.
    """
    header = b"%PDF-1.4\n"
    objects = (
        b"1 0 obj<</Type/Catalog/Pages 2 0 R/OpenAction 3 0 R>>endobj\n",
        b"2 0 obj<</Type/Pages/Kids[]/Count 0>>endobj\n",
        b"3 0 obj<</S/JavaScript/JS(app.alert('XSS'))>>endobj\n",
    )

    # Record where each object starts while walking through the body.
    offsets = []
    position = len(header)
    for obj in objects:
        offsets.append(position)
        position += len(obj)
    xref_offset = position  # the xref section directly follows the last object

    entry_count = len(objects) + 1  # +1 for the mandatory free entry 0
    parts = [header, *objects]
    parts.append(f"xref\n0 {entry_count}\n".encode("ascii"))
    # Each xref entry is exactly 20 bytes: 10-digit offset, generation, flag.
    parts.append(b"0000000000 65535 f \n")
    parts.extend(f"{offset:010d} 00000 n \n".encode("ascii") for offset in offsets)
    parts.append(f"trailer<</Size {entry_count}/Root 1 0 R>>\n".encode("ascii"))
    parts.append(f"startxref\n{xref_offset}\n".encode("ascii"))
    parts.append(b"%%EOF")
    return b"".join(parts)
def generate_zip_html() -> bytes:
    """Generate a ZIP archive containing a single stored ``index.html``.

    The archive is written with the standard ``zipfile`` module so that it
    carries a correct CRC-32, a central directory and an end-of-central-
    directory record. A bare local file header with a placeholder CRC cannot
    be opened by ZIP readers.

    Returns:
        The archive bytes, beginning with the ``PK\\x03\\x04`` local file
        header signature.
    """
    # Imported locally so this function is self-contained.
    import io
    import zipfile

    html = b"<script>alert(1)</script>"

    # Fixed timestamp and host system keep the output byte-for-byte
    # reproducible across runs and platforms.
    entry = zipfile.ZipInfo("index.html", date_time=(1980, 1, 1, 0, 0, 0))
    entry.compress_type = zipfile.ZIP_STORED  # payload stays visible verbatim
    entry.create_system = 3

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr(entry, html)
    return buffer.getvalue()
def generate_png_html() -> bytes:
    """Generate a valid 1x1 RGB PNG followed by trailing HTML.

    Every chunk carries a computed CRC-32 and the IDAT chunk holds a real
    zlib stream for one black pixel, so decoders that verify checksums accept
    the image. The HTML is appended after the IEND chunk, where PNG decoders
    stop reading.

    Returns:
        The PNG bytes with the HTML payload appended.
    """
    # Imported locally so this function is self-contained.
    import zlib

    def chunk(kind: bytes, body: bytes = b"") -> bytes:
        """Serialize one PNG chunk: length, type, data, CRC over type+data."""
        checksum = zlib.crc32(kind + body)
        return (
            len(body).to_bytes(4, "big")
            + kind
            + body
            + checksum.to_bytes(4, "big")
        )

    ihdr_data = (
        (1).to_bytes(4, "big")  # width
        + (1).to_bytes(4, "big")  # height
        + b"\x08"  # bit depth
        + b"\x02"  # color type (RGB)
        + b"\x00"  # compression
        + b"\x00"  # filter
        + b"\x00"  # interlace
    )
    # One scanline: filter type 0 (none) followed by a single black RGB pixel.
    scanline = b"\x00" + b"\x00\x00\x00"

    png = (
        b"\x89PNG\r\n\x1a\n"
        + chunk(b"IHDR", ihdr_data)
        + chunk(b"IDAT", zlib.compress(scanline))
        + chunk(b"IEND")
    )
    # HTML payload after PNG (should be ignored)
    return png + b"<html><script>alert(1)</script></html>"
# Polyglot generators registry: maps the type name accepted on the command
# line to a (description, generator) pair. The description is shown by
# list_polyglots(); the generator is called with no arguments and returns the
# file content as bytes.
POLYGLOTS = {
"gif-js": ("GIF with embedded JavaScript", generate_gif_js),
"pdf-js": ("PDF with JavaScript action", generate_pdf_js),
"zip-html": ("ZIP containing HTML", generate_zip_html),
"png-html": ("PNG with trailing HTML", generate_png_html),
}
def list_polyglots() -> None:
    """Print the named polyglot types and the generic primary/payload keys.

    Output goes to stdout: one line per POLYGLOTS entry (name padded to 12
    columns, then its description), followed by the keys of SIGNATURES and
    PAYLOADS that can be combined as ``primary:payload``.
    """
    report = ["Available polyglots:", ""]
    report.extend(
        f" {label:12} {entry[0]}" for label, entry in POLYGLOTS.items()
    )
    report.append("")
    report.append("Generic formats:")
    # Iterating a dict yields its keys, in insertion order.
    report.append(f" primary: {', '.join(SIGNATURES)}")
    report.append(f" payloads: {', '.join(PAYLOADS)}")
    print("\n".join(report))
def main() -> int:
    """Command-line entry point: generate one polyglot file.

    The positional ``type`` is either a key of POLYGLOTS (e.g. ``gif-js``) or
    a ``primary:payload`` pair handed to generate_polyglot(). ``--size`` only
    affects the ``primary:payload`` form. Without a type, or with ``--list``,
    the available types are printed instead.

    Returns:
        Process exit status: 0 on success, 1 when the requested type, primary
        format or payload is unknown.
    """
    parser = argparse.ArgumentParser(
        description="Generate polyglot files for MIME confusion testing"
    )
    parser.add_argument(
        "type",
        nargs="?",
        help="Polyglot type (e.g., gif-js, png-html) or primary:payload",
    )
    parser.add_argument("-o", "--output", help="Output file (default: stdout)")
    parser.add_argument("-l", "--list", action="store_true", help="List polyglot types")
    parser.add_argument("-s", "--size", type=int, default=1024, help="Minimum size (default: 1024)")
    args = parser.parse_args()

    if args.list or not args.type:
        list_polyglots()
        return 0

    # Generate polyglot
    if args.type in POLYGLOTS:
        _, generator = POLYGLOTS[args.type]
        content = generator()
    elif ":" in args.type:
        primary, payload = args.type.split(":", 1)
        try:
            content = generate_polyglot(primary, payload, args.size)
        except ValueError as exc:
            # Unknown primary/payload: report it like any other bad type
            # instead of letting the traceback reach the user.
            print(exc, file=sys.stderr)
            print("Use --list to see available types", file=sys.stderr)
            return 1
    else:
        print(f"Unknown polyglot type: {args.type}", file=sys.stderr)
        print("Use --list to see available types", file=sys.stderr)
        return 1

    # Output
    if args.output:
        Path(args.output).write_bytes(content)
        print(f"Written {len(content)} bytes to {args.output}")
    else:
        # Binary data must bypass the text layer of stdout.
        sys.stdout.buffer.write(content)
    return 0


if __name__ == "__main__":
    sys.exit(main())

146
tests/test_polyglot.py Normal file
View File

@@ -0,0 +1,146 @@
"""Tests for polyglot file MIME detection.
Verifies that polyglot files (valid in multiple formats) are detected
by their primary magic bytes at offset 0, not by embedded payloads.
"""
import json
import sys
import pytest
sys.path.insert(0, "tests/security")
from polyglot_generator import (
generate_gif_js,
generate_pdf_js,
generate_png_html,
generate_polyglot,
generate_zip_html,
)
class TestPolyglotDetection:
    """Verify polyglot files are detected by primary magic."""

    @staticmethod
    def _detected_mime(client, content):
        """Upload ``content`` and return the ``mime_type`` the server reports.

        Skips the calling test when the upload is not accepted (status other
        than 201), so a rejected upload is shown as skipped in the report
        instead of silently counting as a pass.
        """
        response = client.post("/", data=content)
        if response.status_code != 201:
            pytest.skip(f"upload not accepted (HTTP {response.status_code})")
        return json.loads(response.data)["mime_type"]

    def test_gif_js_detected_as_gif(self, client):
        """GIF/JS polyglot should be detected as GIF."""
        assert self._detected_mime(client, generate_gif_js()) == "image/gif"

    def test_pdf_js_detected_as_pdf(self, client):
        """PDF with JavaScript should be detected as PDF."""
        assert self._detected_mime(client, generate_pdf_js()) == "application/pdf"

    def test_zip_html_detected_as_zip(self, client):
        """ZIP containing HTML should be detected as ZIP."""
        assert self._detected_mime(client, generate_zip_html()) == "application/zip"

    def test_png_html_detected_as_png(self, client):
        """PNG with trailing HTML should be detected as PNG."""
        assert self._detected_mime(client, generate_png_html()) == "image/png"
class TestGenericPolyglots:
    """Test generic primary:payload combinations."""

    @pytest.mark.parametrize(
        "primary,expected_mime",
        [
            ("png", "image/png"),
            ("gif", "image/gif"),
            ("jpeg", "image/jpeg"),
            ("pdf", "application/pdf"),
            ("zip", "application/zip"),
            ("gzip", "application/gzip"),
            ("elf", "application/x-executable"),
            ("pe", "application/x-msdownload"),
        ],
    )
    @pytest.mark.parametrize("payload", ["html", "js", "php", "shell"])
    def test_primary_format_wins(self, client, primary, expected_mime, payload):
        """Primary format magic should determine MIME type, not payload.

        Each primary format is combined with each payload. A rejected upload
        (status other than 201) skips that combination explicitly rather than
        letting it pass without checking anything.
        """
        content = generate_polyglot(primary, payload)
        response = client.post("/", data=content, content_type="application/octet-stream")
        if response.status_code != 201:
            pytest.skip(
                f"upload of {primary}:{payload} not accepted "
                f"(HTTP {response.status_code})"
            )
        data = json.loads(response.data)
        assert data["mime_type"] == expected_mime, (
            f"{primary}:{payload} detected as {data['mime_type']}, expected {expected_mime}"
        )
class TestSecurityHeaders:
    """Verify security headers prevent polyglot execution."""

    @staticmethod
    def _fetch_raw(client, content):
        """Upload ``content`` and return the response of its ``/raw`` URL.

        Skips the calling test when the upload is not accepted (status other
        than 201) so it cannot pass without checking anything. The raw fetch
        itself must succeed; otherwise the header assertions would be made
        against an error response.
        """
        create = client.post("/", data=content)
        if create.status_code != 201:
            pytest.skip(f"upload not accepted (HTTP {create.status_code})")
        paste_id = json.loads(create.data)["id"]
        raw = client.get(f"/{paste_id}/raw")
        assert raw.status_code == 200, f"raw fetch failed (HTTP {raw.status_code})"
        return raw

    def test_nosniff_header_on_polyglot(self, client):
        """X-Content-Type-Options: nosniff should be present."""
        raw = self._fetch_raw(client, generate_gif_js())
        assert raw.headers.get("X-Content-Type-Options") == "nosniff"

    def test_csp_header_on_polyglot(self, client):
        """CSP should prevent script execution."""
        raw = self._fetch_raw(client, generate_png_html())
        csp = raw.headers.get("Content-Security-Policy", "")
        assert "default-src 'none'" in csp

    def test_xframe_options_on_polyglot(self, client):
        """X-Frame-Options should prevent framing."""
        raw = self._fetch_raw(client, generate_pdf_js())
        assert raw.headers.get("X-Frame-Options") == "DENY"
class TestPayloadNotExecuted:
    """Verify embedded payloads are returned literally."""

    @staticmethod
    def _fetch_raw(client, content):
        """Upload ``content`` and return the response of its ``/raw`` URL.

        Skips the calling test when the upload is not accepted (status other
        than 201) so it cannot pass without checking anything, and requires
        the raw fetch to succeed before its body is inspected.
        """
        create = client.post("/", data=content)
        if create.status_code != 201:
            pytest.skip(f"upload not accepted (HTTP {create.status_code})")
        paste_id = json.loads(create.data)["id"]
        raw = client.get(f"/{paste_id}/raw")
        assert raw.status_code == 200, f"raw fetch failed (HTTP {raw.status_code})"
        return raw

    def test_html_payload_literal(self, client):
        """HTML payload should be returned as-is, not rendered."""
        raw = self._fetch_raw(client, generate_polyglot("png", "html"))
        # Content should contain literal script tag
        assert b"<script>" in raw.data
        # But Content-Type should be image/png
        assert "image/png" in raw.content_type

    def test_php_payload_literal(self, client):
        """PHP payload should be returned as-is."""
        raw = self._fetch_raw(client, generate_polyglot("gif", "php"))
        assert b"<?php" in raw.data
        assert "image/gif" in raw.content_type