Privacy-Preserving Machine Learning
Training machine learning models on user data while preserving privacy requires specialized tools. Federated learning frameworks enable model training on distributed data without centralizing it. Secure multi-party computation allows multiple parties to jointly compute functions over their inputs while keeping those inputs private.
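To make the secure multi-party computation idea concrete, here is a minimal sketch of additive secret sharing, the building block most SMPC protocols rely on. This is illustrative only; the field modulus and two-party split are arbitrary choices, not any particular library's API.

# Minimal sketch of additive secret sharing (illustrative, not a real protocol)
import random

Q = 2**61 - 1  # large modulus; arbitrary choice for this sketch

def share(secret, n_parties=2):
    """Split a secret integer into n additive shares mod Q."""
    shares = [random.randrange(Q) for _ in range(n_parties - 1)]
    shares.append((secret - sum(shares)) % Q)
    return shares

def reconstruct(shares):
    """Recombine shares; no single share reveals the secret."""
    return sum(shares) % Q

# Each party holds one share. Because addition distributes over shares,
# the parties can jointly compute x + y without revealing x or y.
x_shares = share(5)
y_shares = share(7)
sum_shares = [(a + b) % Q for a, b in zip(x_shares, y_shares)]
assert reconstruct(sum_shares) == 12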
TensorFlow Federated (TFF) is an open-source framework for federated learning: it enables training models across decentralized data while keeping that data local. PySyft extends PyTorch with privacy-preserving techniques, including federated learning, differential privacy, and encrypted computation.
# Federated learning with TensorFlow Federated
import tensorflow as tf
import tensorflow_federated as tff

class FederatedModelTrainer:
    def __init__(self):
        self.iterative_process = self.create_federated_averaging_process()

    def create_model(self):
        """Create a simple Keras model for federated training"""
        return tf.keras.models.Sequential([
            tf.keras.layers.Dense(10, activation='relu', input_shape=(784,)),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

    def get_input_spec(self):
        """Type spec of one client batch: 784 features, integer label"""
        return (
            tf.TensorSpec(shape=[None, 784], dtype=tf.float32),
            tf.TensorSpec(shape=[None], dtype=tf.int32),
        )

    def model_fn(self):
        """TFF model function; must build a fresh model on every call"""
        keras_model = self.create_model()
        return tff.learning.from_keras_model(
            keras_model,
            input_spec=self.get_input_spec(),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

    def create_federated_averaging_process(self):
        """Create federated averaging process with differential privacy"""
        return tff.learning.build_federated_averaging_process(
            model_fn=self.model_fn,
            client_optimizer_fn=lambda: tf.keras.optimizers.SGD(0.02),
            server_optimizer_fn=lambda: tf.keras.optimizers.SGD(1.0),
            model_update_aggregation_factory=tff.learning.dp_aggregator(
                noise_multiplier=0.1,  # differential-privacy noise scale
                clients_per_round=100,
                zeroing=True           # zero out anomalously large updates
            )
        )

    def initialize(self):
        """Create the initial server state"""
        return self.iterative_process.initialize()

    def train_round(self, state, federated_data):
        """Execute one round of federated training on the given state"""
        state, metrics = self.iterative_process.next(state, federated_data)
        return state, metrics
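A minimal driver loop for this trainer might look like the following sketch. Note that make_client_datasets() is a hypothetical placeholder standing in for whatever code produces one tf.data.Dataset per sampled client, batched to match get_input_spec().

# Hypothetical usage sketch: run several federated rounds
trainer = FederatedModelTrainer()
state = trainer.initialize()

for round_num in range(10):
    # make_client_datasets() is a placeholder, not part of TFF
    federated_data = make_client_datasets()
    state, metrics = trainer.train_round(state, federated_data)
    print(f"round {round_num}: {metrics}")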
# PySyft for privacy-preserving PyTorch
import torch
import syft as sy

# Hook PyTorch so tensors gain .send()/.get() (PySyft 0.2.x-style API)
hook = sy.TorchHook(torch)

class PrivateModelTraining:
    def __init__(self):
        # Create virtual workers that simulate remote data owners
        self.alice = sy.VirtualWorker(hook, id="alice")
        self.bob = sy.VirtualWorker(hook, id="bob")

    def distribute_data(self, data, labels):
        """Distribute data to virtual workers"""
        # Split the dataset in half, one share per worker
        alice_data = data[:len(data)//2].send(self.alice)
        alice_labels = labels[:len(labels)//2].send(self.alice)
        bob_data = data[len(data)//2:].send(self.bob)
        bob_labels = labels[len(labels)//2:].send(self.bob)
        return [(alice_data, alice_labels), (bob_data, bob_labels)]

    def train_privately(self, model, distributed_data, epochs=10):
        """Train the model where the data lives; raw data never moves"""
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        for epoch in range(epochs):
            for data, labels in distributed_data:
                # Send the model to the worker that holds this data
                model.send(data.location)
                optimizer.zero_grad()
                predictions = model(data)
                loss = torch.nn.functional.cross_entropy(predictions, labels)
                loss.backward()
                optimizer.step()
                # Retrieve the updated model from the remote worker
                model.get()
        return model
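A sketch of how these pieces fit together, assuming MNIST-shaped inputs; the random tensors here are stand-ins for a real dataset, not part of PySyft.

# Hypothetical usage sketch with stand-in data (784 features, 10 classes)
data = torch.randn(1000, 784)
labels = torch.randint(0, 10, (1000,))

model = torch.nn.Linear(784, 10)

trainer = PrivateModelTraining()
distributed = trainer.distribute_data(data, labels)
trained_model = trainer.train_privately(model, distributed, epochs=5)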