Input Validation and Sanitization

Input Validation and Sanitization

Password input validation represents a critical security boundary often implemented incorrectly. While passwords should accept diverse character sets to maximize entropy, certain validations remain necessary for security and system stability. The challenge lies in implementing these validations without reducing password space or creating vulnerabilities.

Length limits prevent both weak passwords and denial-of-service attacks. Minimum lengths should enforce reasonable security—12 characters for standard applications, 16 for high-security contexts. Maximum lengths prevent memory exhaustion attacks but should be generous—128 to 256 characters accommodate passphrases while preventing abuse. Because a single character can occupy up to four bytes in UTF-8, the encoded size should be capped as well as the character count: rejecting passwords whose encoded form exceeds 1 KB protects against resource exhaustion without limiting legitimate use.

import re
import unicodedata
from typing import Tuple, Optional

class PasswordValidator:
    """Validate candidate passwords against length, encoding and strength rules.

    ``validate`` applies the checks in a fixed order and stops at the first
    failure: presence, type, minimum/maximum character count, encoded size,
    Unicode normalization, null bytes, character-class complexity (optional)
    and finally a small set of weak-pattern checks.

    Every check returns a ``(ok, message)`` tuple; ``message`` is ``None``
    when ``ok`` is true.
    """

    # Hard cap on the UTF-8 encoded size, independent of ``max_length``.
    _MAX_ENCODED_BYTES = 1024

    # Compiled once at class-definition time instead of on every call.
    _SEQUENTIAL_DIGITS = re.compile(r'(012|123|234|345|456|567|678|789|890)')
    _SEQUENTIAL_LETTERS = re.compile(
        r'(abc|bcd|cde|def|efg|fgh|ghi|hij|ijk|jkl|klm|lmn|'
        r'mno|nop|opq|pqr|qrs|rst|stu|tuv|uvw|vwx|wxy|xyz)'
    )
    _REPEATED_CHARACTER = re.compile(r'(.)\1{3,}')

    # Substrings rejected case-insensitively. '123456' is kept for parity with
    # the original list even though the sequential-digit check fires first.
    _COMMON_PATTERNS = ('qwerty', 'asdfgh', 'zxcvbn', '123456', 'password')

    def __init__(self, min_length: int = 12, max_length: int = 128,
                 require_complexity: bool = True) -> None:
        """Store the validation policy.

        Args:
            min_length: Minimum number of characters (code points) required.
            max_length: Maximum number of characters (code points) allowed.
            require_complexity: When true, ``validate`` also enforces the
                3-of-4 character-class rule from ``_check_complexity``.
        """
        self.min_length = min_length
        self.max_length = max_length
        self.require_complexity = require_complexity

    def validate(self, password: str) -> Tuple[bool, Optional[str]]:
        """Validate ``password`` and report the first rule it violates.

        Args:
            password: The candidate password.

        Returns:
            ``(True, None)`` if every check passes, otherwise
            ``(False, reason)`` with a human-readable reason.
        """
        # Covers both None and the empty string.
        if not password:
            return False, "Password cannot be empty"

        # bytes or other objects would otherwise fail later with an
        # AttributeError; report them as an ordinary validation failure.
        if not isinstance(password, str):
            return False, "Password must be a string"

        if len(password) < self.min_length:
            return False, f"Password must be at least {self.min_length} characters"

        if len(password) > self.max_length:
            return False, f"Password cannot exceed {self.max_length} characters"

        # The character count alone does not bound memory use: one code point
        # can take up to four bytes once encoded.
        if len(password.encode('utf-8')) > self._MAX_ENCODED_BYTES:
            return False, "Password too long"

        # Reject (rather than silently rewrite) input that changes under NFKC,
        # e.g. ligatures or full-width forms.
        if unicodedata.normalize('NFKC', password) != password:
            return False, "Password contains ambiguous characters"

        if '\x00' in password:
            return False, "Password contains invalid characters"

        if self.require_complexity:
            complexity_ok, complexity_msg = self._check_complexity(password)
            if not complexity_ok:
                return False, complexity_msg

        pattern_ok, pattern_msg = self._check_patterns(password)
        if not pattern_ok:
            return False, pattern_msg

        return True, None

    def _check_complexity(self, password: str) -> Tuple[bool, Optional[str]]:
        """Require at least three of the four character classes.

        Classes are decided by Unicode character properties, so non-ASCII
        letters count as letters and any non-alphanumeric character
        (punctuation, symbols, spaces) counts as a special character. This is
        a superset of the ASCII-only classes used previously: a password that
        satisfied the old rule still satisfies this one.

        Returns:
            ``(True, None)`` when at most one class is missing, otherwise
            ``(False, message)`` listing the missing classes.
        """
        character_classes = (
            ("lowercase letter", str.islower),
            ("uppercase letter", str.isupper),
            # isdecimal matches the same Unicode decimal digits as regex \d.
            ("number", str.isdecimal),
            ("special character", lambda ch: not ch.isalnum()),
        )

        missing = [
            description
            for description, belongs in character_classes
            if not any(belongs(ch) for ch in password)
        ]

        # Require at least 3 out of 4 character classes.
        if len(missing) > 1:
            return False, f"Password must contain at least: {', '.join(missing)}"

        return True, None

    def _check_patterns(self, password: str) -> Tuple[bool, Optional[str]]:
        """Reject passwords containing a few well-known weak patterns.

        Checked in order: three ascending digits, three ascending letters
        (case-insensitive), any character repeated four or more times in a
        row, and a short list of keyboard walks / common words.

        Returns:
            ``(True, None)`` if no pattern matches, otherwise
            ``(False, message)`` describing the first match.
        """
        if self._SEQUENTIAL_DIGITS.search(password):
            return False, "Password contains sequential numbers"

        password_lower = password.lower()

        if self._SEQUENTIAL_LETTERS.search(password_lower):
            return False, "Password contains sequential letters"

        if self._REPEATED_CHARACTER.search(password):
            return False, "Password contains too many repeated characters"

        for pattern in self._COMMON_PATTERNS:
            if pattern in password_lower:
                return False, f"Password contains common pattern: {pattern}"

        return True, None

# Advanced validation with context
class ContextualPasswordValidator(PasswordValidator):
    """Password validation that also takes the user's own data into account.

    On top of the base-class rules, rejects passwords that embed the
    username, embed a long-enough piece of the email local part, or are too
    similar to the previous password.
    """

    def validate_with_context(self, password: str, username: str,
                              email: Optional[str] = None,
                              old_password: Optional[str] = None) -> Tuple[bool, Optional[str]]:
        """Validate ``password`` against the base rules plus user context.

        Args:
            password: The candidate password.
            username: The account's username; skipped when empty or ``None``.
            email: Optional email address; the part before ``@`` is split on
                ``.`` and each piece longer than three characters is checked.
            old_password: Optional previous password to compare against.

        Returns:
            ``(True, None)`` if every check passes, otherwise
            ``(False, reason)`` for the first failing check.
        """
        # Basic validation first; it also guarantees password is a non-empty
        # string before the substring checks below.
        valid, msg = self.validate(password)
        if not valid:
            return False, msg

        password_lower = password.lower()

        # The empty string is a substring of every string, so an empty
        # username would otherwise reject every password.
        if username and username.lower() in password_lower:
            return False, "Password cannot contain username"

        if email:
            local_part = email.lower().split('@')[0]
            # Pieces of three characters or fewer are ignored to avoid
            # rejecting passwords over trivially short fragments.
            if any(len(part) > 3 and part in password_lower
                   for part in local_part.split('.')):
                return False, "Password cannot contain email address parts"

        if old_password and self._calculate_similarity(password, old_password) > 0.8:
            return False, "New password too similar to old password"

        return True, None

    def _calculate_similarity(self, str1: str, str2: str) -> float:
        """Return the Jaccard similarity of the two strings' character sets.

        Both strings are lowercased and reduced to the set of distinct
        characters they contain; the result is ``|A & B| / |A | B|`` in the
        range 0.0 to 1.0. Character order and repetition are ignored, so this
        is only a coarse measure (the original labels it as a demonstration).
        Returns 0.0 when both strings are empty.
        """
        chars1 = set(str1.lower())
        chars2 = set(str2.lower())

        union = chars1 | chars2
        if not union:
            return 0.0

        return len(chars1 & chars2) / len(union)

Character encoding issues create subtle vulnerabilities. Always handle passwords as UTF-8 encoded strings, properly supporting international characters. Normalize Unicode input to prevent homograph attacks where visually similar characters bypass security checks. However, avoid modifying passwords beyond normalization—transformations like lowercasing or trimming spaces can confuse users and reduce entropy.