#!/usr/bin/env python3
"""Polyglot file generator for MIME confusion testing.

Creates files that are technically valid in multiple formats to test that
MIME detection correctly identifies the primary format based on magic bytes
at offset 0.
"""

import argparse
import sys
from pathlib import Path

# Magic byte signatures, keyed by the format name accepted on the command line.
SIGNATURES = {
    "png": b"\x89PNG\r\n\x1a\n",
    "gif": b"GIF89a",
    "jpeg": b"\xff\xd8\xff\xe0\x00\x10JFIF",
    "pdf": b"%PDF-1.4\n",
    "zip": b"PK\x03\x04",
    "gzip": b"\x1f\x8b\x08",
    "elf": b"\x7fELF",
    "pe": b"MZ",
}

# Payloads that could be dangerous if executed.
# NOTE(review): the "html", "php" and "svg" entries are empty or
# whitespace-only here; presumably their markup was lost from this copy of
# the file -- confirm against the intended payloads.
PAYLOADS = {
    "html": b"\n",
    "js": b"/**/alert('XSS')//",
    "php": b"",
    "shell": b"#!/bin/sh\necho pwned\n",
    "svg": b'',
}


def generate_polyglot(primary: str, payload: str, size: int = 1024) -> bytes:
    """Generate a polyglot file with primary format magic and embedded payload.

    The result is laid out as: magic bytes, 32 NUL bytes, the payload, then
    NUL padding up to ``size`` bytes. Content already at least ``size`` bytes
    long is returned unpadded (it is never truncated).

    Args:
        primary: Primary format; must be a key of ``SIGNATURES``.
        payload: Payload type to embed; must be a key of ``PAYLOADS``.
        size: Minimum file size (padded with nulls).

    Returns:
        Polyglot file content.

    Raises:
        ValueError: If ``primary`` or ``payload`` is not a known key.
    """
    if primary not in SIGNATURES:
        raise ValueError(f"Unknown primary format: {primary}")
    if payload not in PAYLOADS:
        raise ValueError(f"Unknown payload type: {payload}")

    magic = SIGNATURES[primary]
    payload_bytes = PAYLOADS[payload]

    # Build polyglot: magic + padding + payload + padding
    content = magic + b"\x00" * 32 + payload_bytes

    # Pad to minimum size
    if len(content) < size:
        content += b"\x00" * (size - len(content))

    return content


def generate_gif_js() -> bytes:
    """Generate GIF/JavaScript polyglot.

    GIF89a header and a minimal 1x1 image structure, followed by a JS
    payload appended after the GIF trailer byte.
    """
    # GIF89a = valid GIF magic
    gif_header = b"GIF89a"

    # Minimal GIF structure
    gif_data = (
        b"\x01\x00\x01\x00"  # 1x1 dimensions
        b"\x00"  # no global color table
        b"\x00"  # background color
        b"\x00"  # aspect ratio
        b"\x2c"  # image descriptor
        b"\x00\x00\x00\x00"  # position
        b"\x01\x00\x01\x00"  # dimensions
        b"\x00"  # no local color table
        b"\x02\x01\x01\x00\x3b"  # minimal image data + trailer
    )

    # JS payload after GIF (browsers may try to execute)
    js_payload = b"/**/=1;alert('XSS')//"

    return gif_header + gif_data + js_payload


def generate_pdf_js() -> bytes:
    """Generate PDF with embedded JavaScript.

    Emits a %PDF-1.4 header, three objects, a four-entry xref table, a
    trailer and a startxref pointer.
    """
    # NOTE(review): the object and trailer dictionaries below are empty
    # ("<>") and the hard-coded xref/startxref offsets do not match the
    # object bytes as written here -- presumably dictionary content was
    # lost from this copy; confirm before relying on this output.
    parts = [
        b"%PDF-1.4\n",  # PDF header
        # Minimal PDF structure with JS
        b"1 0 obj<>endobj\n",
        b"2 0 obj<>endobj\n",
        b"3 0 obj<>endobj\n",
        b"xref\n0 4\n",
        b"0000000000 65535 f \n",
        b"0000000009 00000 n \n",
        b"0000000058 00000 n \n",
        b"0000000101 00000 n \n",
        b"trailer<>\n",
        b"startxref\n154\n%%EOF",
    ]
    return b"".join(parts)


def generate_zip_html() -> bytes:
    """Generate ZIP with HTML file inside.

    Only a single local file header (stored, no compression) followed by the
    file name and content is produced; the CRC32 field is a zero placeholder
    and no central directory is written.
    """
    # NOTE(review): the HTML content is empty in this copy -- confirm.
    html = b""
    filename = b"index.html"
    size = len(html).to_bytes(4, "little")

    parts = [
        b"PK\x03\x04",  # PK signature
        b"\x14\x00",  # Version needed
        b"\x00\x00",  # Flags
        b"\x00\x00",  # Compression (store)
        b"\x00\x00\x00\x00",  # Time/date
        b"\x00\x00\x00\x00",  # CRC32 (placeholder)
        size + size,  # Compressed/uncompressed size
        len(filename).to_bytes(2, "little"),  # Filename length
        b"\x00\x00",  # Extra field length
        filename,  # Filename
        html,  # File content
    ]
    return b"".join(parts)


def generate_png_html() -> bytes:
    """Generate PNG with HTML in trailing data.

    Writes the PNG signature followed by IHDR, IDAT and IEND chunks, then
    appends the payload after IEND. The IHDR and IDAT CRC fields are zero
    placeholders rather than computed checksums.
    """
    # Minimal PNG: signature first
    png = b"\x89PNG\r\n\x1a\n"

    # IHDR chunk
    ihdr_data = (
        b"\x00\x00\x00\x01"  # width
        b"\x00\x00\x00\x01"  # height
        b"\x08"  # bit depth
        b"\x02"  # color type (RGB)
        b"\x00"  # compression
        b"\x00"  # filter
        b"\x00"  # interlace
    )
    ihdr_crc = b"\x00\x00\x00\x00"  # placeholder
    png += b"\x00\x00\x00\x0d" + b"IHDR" + ihdr_data + ihdr_crc

    # IDAT chunk (minimal)
    idat_data = b"\x08\xd7\x63\xf8\x0f\x00\x00\x01\x01\x00"
    idat_crc = b"\x00\x00\x00\x00"  # placeholder
    png += len(idat_data).to_bytes(4, "big") + b"IDAT" + idat_data + idat_crc

    # IEND chunk
    png += b"\x00\x00\x00\x00" + b"IEND" + b"\xae\x42\x60\x82"

    # HTML payload after PNG (should be ignored)
    # NOTE(review): the payload is empty in this copy -- confirm.
    png += b""

    return png


# Polyglot generators registry: name -> (description, zero-argument generator)
POLYGLOTS = {
    "gif-js": ("GIF with embedded JavaScript", generate_gif_js),
    "pdf-js": ("PDF with JavaScript action", generate_pdf_js),
    "zip-html": ("ZIP containing HTML", generate_zip_html),
    "png-html": ("PNG with trailing HTML", generate_png_html),
}


def list_polyglots() -> None:
    """Print the named polyglots and the generic primary/payload choices."""
    print("Available polyglots:")
    print()
    for name, (desc, _) in POLYGLOTS.items():
        print(f" {name:12} {desc}")
    print()
    print("Generic formats:")
    print(f" primary: {', '.join(SIGNATURES)}")
    print(f" payloads: {', '.join(PAYLOADS)}")


def main() -> int:
    """Command-line entry point.

    Parses ``sys.argv``, generates the requested polyglot and writes it to
    the ``--output`` file or, by default, to stdout as raw bytes.

    Returns:
        0 on success (including when only the listing is printed), 1 when
        the requested type, primary format or payload is unknown.
    """
    parser = argparse.ArgumentParser(
        description="Generate polyglot files for MIME confusion testing"
    )
    parser.add_argument(
        "type",
        nargs="?",
        help="Polyglot type (e.g., gif-js, png-html) or primary:payload",
    )
    parser.add_argument("-o", "--output", help="Output file (default: stdout)")
    parser.add_argument("-l", "--list", action="store_true", help="List polyglot types")
    parser.add_argument(
        "-s", "--size", type=int, default=1024, help="Minimum size (default: 1024)"
    )
    args = parser.parse_args()

    if args.list or not args.type:
        list_polyglots()
        return 0

    # Generate polyglot
    if args.type in POLYGLOTS:
        _, generator = POLYGLOTS[args.type]
        content = generator()
    elif ":" in args.type:
        primary, payload = args.type.split(":", 1)
        try:
            content = generate_polyglot(primary, payload, args.size)
        except ValueError as exc:
            # Report an unknown primary/payload the same way as an unknown
            # named type instead of letting a traceback escape.
            print(exc, file=sys.stderr)
            print("Use --list to see available types", file=sys.stderr)
            return 1
    else:
        print(f"Unknown polyglot type: {args.type}", file=sys.stderr)
        print("Use --list to see available types", file=sys.stderr)
        return 1

    # Output
    if args.output:
        Path(args.output).write_bytes(content)
        print(f"Written {len(content)} bytes to {args.output}")
    else:
        # Binary-safe write; print() would try to encode the bytes as text.
        sys.stdout.buffer.write(content)

    return 0


if __name__ == "__main__":
    sys.exit(main())