Privacy-Preserving Machine Learning

Training machine learning models on user data while preserving privacy requires specialized tools. Federated learning frameworks enable model training on distributed data without centralizing it. Secure multi-party computation allows multiple parties to jointly compute functions over their inputs while keeping those inputs private.
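
The core of many secure multi-party computation protocols is additive secret sharing. The sketch below is a minimal illustration of the idea, not any particular library's API; the field modulus and party count are assumptions chosen for clarity.

# Additive secret sharing: a secret is split into random shares that
# sum to it modulo Q; no proper subset of the shares reveals anything.
import random

Q = 2**61 - 1  # illustrative prime modulus for the share space

def share(secret, num_parties=3):
    """Split a secret into additive shares, one per party."""
    shares = [random.randrange(Q) for _ in range(num_parties - 1)]
    shares.append((secret - sum(shares)) % Q)
    return shares

def reconstruct(shares):
    """Recombine all shares to recover the secret."""
    return sum(shares) % Q

# Each party adds its shares of the two inputs locally, so the parties
# jointly compute a sum without ever seeing each other's raw values.
alice_shares = share(42)
bob_shares = share(58)
sum_shares = [(a + b) % Q for a, b in zip(alice_shares, bob_shares)]
print(reconstruct(sum_shares))  # 100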

TensorFlow Federated (TFF) is an open-source framework for federated learning, geared primarily toward research and simulation. It enables training models across decentralized data while keeping that data local. PySyft extends PyTorch with privacy-preserving techniques including federated learning, differential privacy, and encrypted computation.

# Federated learning with TensorFlow Federated
import tensorflow as tf
import tensorflow_federated as tff

class FederatedModelTrainer:
    def __init__(self):
        self.model = self.create_model()
        # Build the iterative process once so train_round can reuse it
        self.iterative_process = self.create_federated_averaging_process()
        
    def create_model(self):
        """Create a simple model for federated training"""
        return tf.keras.models.Sequential([
            tf.keras.layers.Dense(10, activation='relu', input_shape=(784,)),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
    
    def model_fn(self):
        """TFF model function; must build a fresh Keras model on each call"""
        keras_model = self.create_model()
        return tff.learning.from_keras_model(
            keras_model,
            input_spec=self.get_input_spec(),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )
    
    def get_input_spec(self):
        """Type spec of one batch of client data: (features, labels)"""
        return (
            tf.TensorSpec(shape=[None, 784], dtype=tf.float32),
            tf.TensorSpec(shape=[None], dtype=tf.int32),
        )
    
    def create_federated_averaging_process(self):
        """Create a federated averaging process with DP aggregation"""
        # Uses the TFF 0.x API; newer releases moved this functionality
        # under tff.learning.algorithms.build_weighted_fed_avg
        return tff.learning.build_federated_averaging_process(
            model_fn=self.model_fn,
            client_optimizer_fn=lambda: tf.keras.optimizers.SGD(0.02),
            server_optimizer_fn=lambda: tf.keras.optimizers.SGD(1.0),
            model_update_aggregation_factory=tff.learning.dp_aggregator(
                noise_multiplier=0.1,  # noise stddev as a fraction of the clipping norm
                clients_per_round=100,  # expected cohort size for noise calibration
                zeroing=True  # zero out anomalously large client updates
            )
        )
    
    def train_round(self, state, federated_data):
        """Execute one round of federated training on the given server state"""
        # State is initialized once by the caller and threaded through rounds;
        # re-initializing here would discard all prior training
        return self.iterative_process.next(state, federated_data)
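
With those pieces wired together, a driver loop initializes the server state once and threads it through successive rounds. The snippet below is a hypothetical run on synthetic client datasets; the dataset builder, client count, and round count are illustrative assumptions (a real deployment would use real client data, with a cohort size matching clients_per_round).

# Hypothetical driver loop on synthetic data; shapes match get_input_spec()
import numpy as np

def make_client_dataset(num_examples=64):
    """Build one simulated client's tf.data.Dataset (illustrative only)."""
    x = np.random.rand(num_examples, 784).astype(np.float32)
    y = np.random.randint(0, 10, size=(num_examples,)).astype(np.int32)
    return tf.data.Dataset.from_tensor_slices((x, y)).batch(16)

trainer = FederatedModelTrainer()
federated_data = [make_client_dataset() for _ in range(3)]

state = trainer.iterative_process.initialize()
for round_num in range(5):
    state, metrics = trainer.train_round(state, federated_data)
    print(f"round {round_num}: {metrics}")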

# PySyft for privacy-preserving PyTorch
import torch
import syft as sy

class PrivateModelTraining:
    def __init__(self):
        # Hook PyTorch so tensors gain .send()/.get() (PySyft 0.2.x API)
        hook = sy.TorchHook(torch)
        # Create virtual workers to simulate remote data owners
        self.alice = sy.VirtualWorker(hook, id="alice")
        self.bob = sy.VirtualWorker(hook, id="bob")
        
    def distribute_data(self, data, labels):
        """Distribute data to virtual workers"""
        # Split data between workers
        alice_data = data[:len(data)//2].send(self.alice)
        alice_labels = labels[:len(labels)//2].send(self.alice)
        
        bob_data = data[len(data)//2:].send(self.bob)
        bob_labels = labels[len(labels)//2:].send(self.bob)
        
        return [(alice_data, alice_labels), (bob_data, bob_labels)]
    
    def train_privately(self, model, distributed_data, epochs=10):
        """Train the model where the data lives, one worker at a time"""
        # A single optimizer persists across batches; the hooked parameters
        # remain valid as the model moves between workers
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        
        for epoch in range(epochs):
            for data, labels in distributed_data:
                # Send the model to the worker holding this shard
                model.send(data.location)
                
                # Train remotely; raw data never leaves the worker
                optimizer.zero_grad()
                predictions = model(data)
                loss = torch.nn.functional.cross_entropy(predictions, labels)
                loss.backward()
                optimizer.step()
                
                # Retrieve the updated model weights
                model.get()
        
        return model
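
For completeness, a hypothetical end-to-end run might look as follows. The model architecture and the random tensors standing in for real data are assumptions, and the flow targets the same legacy PySyft 0.2.x API as the class above.

# Hypothetical usage with random tensors standing in for real data
trainer = PrivateModelTraining()  # hooks torch and spins up workers

model = torch.nn.Sequential(torch.nn.Linear(784, 10))
data = torch.rand(100, 784)
labels = torch.randint(0, 10, (100,))

distributed = trainer.distribute_data(data, labels)
trained_model = trainer.train_privately(model, distributed, epochs=5)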