forked from username/flaskpaste
simplify MIME detection to text/binary only
Remove magic-byte detection in favor of simple UTF-8 validation: valid UTF-8 content is reported as text/plain, and everything else as application/octet-stream (binary data). Security is maintained via response headers (X-Content-Type-Options: nosniff, CSP). An excerpt of the magic signatures is preserved as comments for future reference; the full list remains in git history. Disabled test files: test_mime_detection.py.disabled (magic-dependent tests) and test_polyglot.py.disabled (polyglot format tests). For full MIME detection, consider using the `filetype` library.
This commit is contained in:
@@ -37,70 +37,19 @@ PASTE_ID_PATTERN = re.compile(r"^[a-f0-9]+$")
|
||||
CLIENT_ID_PATTERN = re.compile(r"^[a-f0-9]{40}$")
|
||||
MIME_PATTERN = re.compile(r"^[a-z0-9][a-z0-9!#$&\-^_.+]*/[a-z0-9][a-z0-9!#$&\-^_.+]*$")
|
||||
|
||||
# Magic bytes for binary format detection
|
||||
MAGIC_SIGNATURES: dict[bytes, str] = {
|
||||
# Images
|
||||
b"\x89PNG\r\n\x1a\n": "image/png",
|
||||
b"\xff\xd8\xff": "image/jpeg",
|
||||
b"GIF87a": "image/gif",
|
||||
b"GIF89a": "image/gif",
|
||||
b"RIFF": "image/webp", # RIFF container, verified as WEBP in detect_mime_type
|
||||
b"BM": "image/bmp",
|
||||
b"II\x2a\x00": "image/tiff", # Little-endian TIFF
|
||||
b"MM\x00\x2a": "image/tiff", # Big-endian TIFF
|
||||
b"\x00\x00\x01\x00": "image/x-icon",
|
||||
# HEIC/HEIF (ftyp box with heic/mif1 brand) - bytes 0-3 = box size, bytes 4-7 = "ftyp", bytes 8-11 = brand
|
||||
b"\x00\x00\x00\x18\x66\x74\x79\x70\x68\x65\x69\x63": "image/heic", # ftyp heic
|
||||
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x68\x65\x69\x63": "image/heic", # ftyp heic
|
||||
b"\x00\x00\x00\x18\x66\x74\x79\x70\x6d\x69\x66\x31": "image/heif", # ftyp mif1
|
||||
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x6d\x69\x66\x31": "image/heif", # ftyp mif1
|
||||
# AVIF (ftyp box with avif brand)
|
||||
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x61\x76\x69\x66": "image/avif", # ftyp avif
|
||||
b"\x00\x00\x00\x20\x66\x74\x79\x70\x61\x76\x69\x66": "image/avif", # ftyp avif
|
||||
# Video/Audio containers (checked for subtype in detect_mime_type)
|
||||
b"\x1a\x45\xdf\xa3": "video/webm", # Matroska/WebM (same format)
|
||||
b"FLV\x01": "video/x-flv",
|
||||
b"\x00\x00\x00\x1c\x66\x74\x79\x70\x69\x73\x6f\x6d": "video/mp4", # ftyp isom
|
||||
b"\x00\x00\x00\x1c\x66\x74\x79\x70": "video/mp4", # ftyp box at standard offset
|
||||
b"\x00\x00\x00\x20\x66\x74\x79\x70": "video/mp4", # ftyp with different size
|
||||
b"\x00\x00\x00\x18\x66\x74\x79\x70": "video/mp4", # ftyp with different size
|
||||
# Audio
|
||||
b"ID3": "audio/mpeg", # MP3 with ID3 tag
|
||||
b"\xff\xfb": "audio/mpeg", # MP3 frame sync
|
||||
b"\xff\xfa": "audio/mpeg",
|
||||
b"\xff\xf3": "audio/mpeg",
|
||||
b"\xff\xf2": "audio/mpeg",
|
||||
b"fLaC": "audio/flac",
|
||||
b"OggS": "audio/ogg",
|
||||
# Documents
|
||||
b"%PDF": "application/pdf",
|
||||
b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": "application/msword", # OLE (DOC, XLS, PPT, MSI)
|
||||
b"PK\x03\x04": "application/zip", # ZIP, DOCX, XLSX, PPTX, ODT, JAR, APK
|
||||
# Executables
|
||||
b"MZ": "application/x-msdownload", # EXE, DLL
|
||||
b"\x7fELF": "application/x-executable", # ELF (Linux)
|
||||
b"\xfe\xed\xfa\xce": "application/x-mach-binary", # Mach-O 32-bit big-endian
|
||||
b"\xce\xfa\xed\xfe": "application/x-mach-binary", # Mach-O 32-bit little-endian
|
||||
b"\xfe\xed\xfa\xcf": "application/x-mach-binary", # Mach-O 64-bit big-endian
|
||||
b"\xcf\xfa\xed\xfe": "application/x-mach-binary", # Mach-O 64-bit little-endian
|
||||
b"\xca\xfe\xba\xbe": "application/x-mach-binary", # Mach-O fat/universal binary
|
||||
b"\x00asm": "application/wasm", # WebAssembly
|
||||
# Compression/Archives
|
||||
b"\x1f\x8b": "application/gzip",
|
||||
b"BZh": "application/x-bzip2",
|
||||
b"\xfd7zXZ\x00": "application/x-xz",
|
||||
b"\x28\xb5\x2f\xfd": "application/zstd",
|
||||
b"\x04\x22\x4d\x18": "application/x-lz4",
|
||||
b"7z\xbc\xaf\x27\x1c": "application/x-7z-compressed",
|
||||
b"Rar!\x1a\x07": "application/vnd.rar",
|
||||
# Packages
|
||||
b"\xed\xab\xee\xdb": "application/x-rpm",
|
||||
# Data
|
||||
b"SQLite format 3\x00": "application/x-sqlite3",
|
||||
}
|
||||
|
||||
# Maximum magic signature length (for safe prefix slicing)
|
||||
MAX_MAGIC_LEN = 16 # SQLite signature is longest at 16 bytes
|
||||
# NOTE: Magic byte detection commented out - using text/binary detection only.
|
||||
# Security headers (X-Content-Type-Options: nosniff, CSP) prevent MIME confusion.
|
||||
# For full MIME detection, consider using the `filetype` library.
|
||||
#
|
||||
# MAGIC_SIGNATURES: dict[bytes, str] = {
|
||||
# b"\x89PNG\r\n\x1a\n": "image/png",
|
||||
# b"\xff\xd8\xff": "image/jpeg",
|
||||
# b"GIF87a": "image/gif",
|
||||
# b"GIF89a": "image/gif",
|
||||
# b"%PDF": "application/pdf",
|
||||
# b"PK\x03\x04": "application/zip",
|
||||
# # ... (see git history for full list)
|
||||
# }
|
||||
|
||||
# Generic MIME types to override with detection
|
||||
GENERIC_MIME_TYPES = frozenset(
|
||||
@@ -837,31 +786,21 @@ def calculate_entropy(data: bytes) -> float:
|
||||
|
||||
|
||||
def detect_mime_type(content: bytes, content_type: str | None = None) -> str:
|
||||
"""Detect MIME type using magic bytes, headers, or content analysis."""
|
||||
# Magic byte detection (highest priority)
|
||||
# Slice once for safety - only examine first MAX_MAGIC_LEN bytes
|
||||
prefix = content[:MAX_MAGIC_LEN]
|
||||
for magic, mime in MAGIC_SIGNATURES.items():
|
||||
if prefix[: len(magic)] == magic:
|
||||
# RIFF container: check the 4-byte subtype at bytes 8-11 (content[8:12])
|
||||
if magic == b"RIFF" and len(content) >= 12:
|
||||
subtype = content[8:12]
|
||||
if subtype == b"WEBP":
|
||||
return "image/webp"
|
||||
if subtype == b"AVI ":
|
||||
return "video/x-msvideo"
|
||||
if subtype == b"WAVE":
|
||||
return "audio/wav"
|
||||
continue # Unknown RIFF subtype
|
||||
return mime
|
||||
"""Detect MIME type based on text/binary analysis.
|
||||
|
||||
# Explicit Content-Type (if specific)
|
||||
Simple approach: if content is valid UTF-8, it's text/plain.
|
||||
Otherwise, it's application/octet-stream (binary).
|
||||
|
||||
Security headers (X-Content-Type-Options: nosniff, CSP) prevent
|
||||
browsers from MIME-sniffing and executing embedded scripts.
|
||||
"""
|
||||
# Honor explicit Content-Type if specific (not generic)
|
||||
if content_type:
|
||||
mime = content_type.split(";")[0].strip().lower()
|
||||
if mime not in GENERIC_MIME_TYPES and MIME_PATTERN.match(mime):
|
||||
return mime
|
||||
|
||||
# UTF-8 text detection
|
||||
# Text vs binary detection
|
||||
try:
|
||||
content.decode("utf-8")
|
||||
return "text/plain"
|
||||
@@ -874,15 +813,10 @@ def is_recognizable_format(content: bytes) -> tuple[bool, str | None]:
|
||||
|
||||
Returns (is_recognizable, detected_format).
|
||||
Used to enforce encryption by rejecting known formats.
|
||||
"""
|
||||
# Check magic bytes - slice once for safety
|
||||
prefix = content[:MAX_MAGIC_LEN]
|
||||
for magic, mime in MAGIC_SIGNATURES.items():
|
||||
if prefix[: len(magic)] == magic:
|
||||
if magic == b"RIFF" and len(content) >= 12 and content[8:12] != b"WEBP":
|
||||
continue
|
||||
return True, mime
|
||||
|
||||
Simple approach: valid UTF-8 text is recognizable (plaintext).
|
||||
Binary content is considered potentially encrypted (not recognizable).
|
||||
"""
|
||||
# Check if valid UTF-8 text (plaintext)
|
||||
try:
|
||||
content.decode("utf-8")
|
||||
|
||||
Reference in New Issue
Block a user