Input Validation and Sanitization

Input Validation and Sanitization

Input validation forms the first line of defense against many common attacks. In Python, every piece of external input—whether from web forms, API calls, command-line arguments, or file uploads—must be validated before use. Python's dynamic typing means that type confusion attacks are possible if input types aren't explicitly checked. Always validate both the type and content of input data.

import re
from typing import Optional
import ipaddress
from email_validator import validate_email, EmailNotValidError

class InputValidator:
    """Stateless helpers that validate or sanitize untrusted input.

    Every validator returns the accepted (possibly normalized) value and
    raises ``ValueError`` when the input is rejected, so callers only need
    to handle a single exception type.
    """

    # Used with fullmatch() so the *entire* value must consist of allowed
    # characters; ``re.match`` with ``$`` would also accept a trailing "\n".
    _USERNAME_PATTERN = re.compile(r'[a-zA-Z0-9_]+')
    _SQL_KEYWORDS = ('DROP', 'DELETE', 'INSERT', 'UPDATE', 'SELECT', 'UNION')

    @staticmethod
    def validate_username(username: str) -> str:
        """Validate a username against strict length and character rules.

        Args:
            username: Candidate username from an untrusted source.

        Returns:
            The username, unchanged, once it has passed every check.

        Raises:
            ValueError: If the value is not a ``str``, is not 3-20 characters
                long, contains anything other than ASCII letters, digits and
                underscores, or contains one of the blocked SQL keywords.
        """
        if not isinstance(username, str):
            raise ValueError("Username must be a string")

        if not 3 <= len(username) <= 20:
            raise ValueError("Username must be between 3 and 20 characters")

        # Only allow alphanumeric characters and underscores
        if InputValidator._USERNAME_PATTERN.fullmatch(username) is None:
            raise ValueError("Username can only contain letters, numbers, and underscores")

        # Defense in depth only: this is a case-insensitive *substring* match,
        # so names such as "updated_bob" are rejected too. It is not a
        # substitute for parameterized queries at the database layer.
        upper_name = username.upper()
        if any(keyword in upper_name for keyword in InputValidator._SQL_KEYWORDS):
            raise ValueError("Username contains invalid keywords")

        # No strip() needed: the pattern above already excludes whitespace.
        return username

    @staticmethod
    def validate_email_address(email: str) -> str:
        """Validate an email address with the ``email_validator`` library.

        Args:
            email: Candidate email address from an untrusted source.

        Returns:
            The address as reported by the library's validation result.

        Raises:
            ValueError: If the value is not a ``str`` or the library rejects
                it; the library's error is attached as ``__cause__``.
        """
        if not isinstance(email, str):
            raise ValueError("Email must be a string")

        try:
            valid = validate_email(email)
        except EmailNotValidError as e:
            raise ValueError(f"Invalid email: {e}") from e

        # NOTE(review): newer email_validator releases appear to expose this
        # value as ``.normalized`` and deprecate ``.email`` - confirm against
        # the pinned version before switching.
        return valid.email

    @staticmethod
    def validate_ip_address(ip: str) -> str:
        """Validate an IPv4 or IPv6 address and reject private ones.

        Args:
            ip: Candidate IP address in textual form.

        Returns:
            The canonical string form produced by the ``ipaddress`` module.

        Raises:
            ValueError: With "IP address must be a string" for non-``str``
                input, "Invalid IP address format" when the text cannot be
                parsed, or "Private IP addresses not allowed" when the parsed
                address has ``is_private`` set.
        """
        # ipaddress.ip_address() also accepts ints (and therefore bools) and
        # packed bytes; reject those so only textual addresses get through.
        if not isinstance(ip, str):
            raise ValueError("IP address must be a string")

        try:
            # This validates both IPv4 and IPv6
            ip_obj = ipaddress.ip_address(ip)
        except ValueError as exc:
            raise ValueError("Invalid IP address format") from exc

        # Checked outside the try block so this specific error is not caught
        # and replaced by the generic "Invalid IP address format" message.
        if ip_obj.is_private:
            raise ValueError("Private IP addresses not allowed")

        return str(ip_obj)

    @staticmethod
    def sanitize_html_input(html: str) -> str:
        """Sanitize HTML input to prevent XSS.

        Passes the input through ``bleach.clean`` with a small allow-list of
        formatting tags, an empty attribute allow-list, and ``strip=True``.

        Args:
            html: Untrusted HTML fragment.

        Returns:
            The cleaned markup returned by ``bleach.clean``.
        """
        # Imported lazily so bleach is only required when this helper is used.
        import bleach

        # Define allowed tags and attributes
        allowed_tags = ['p', 'br', 'strong', 'em', 'u', 'i', 'b']
        allowed_attributes = {}

        # Clean the HTML
        cleaned = bleach.clean(
            html,
            tags=allowed_tags,
            attributes=allowed_attributes,
            strip=True
        )

        return cleaned