I am working on a Flower simulation where I dynamically update the config_train dictionary using a layers_template that is passed through an API. Despite updating config_train successfully, my simulation results in all clients failing during the fit phase, as seen in the terminal logs below. I’m trying to figure out why this is happening.
Here is my code:
run_simulation_flower Function
def run_simulation_flower(path_metrics, model_json=None):
    """Set up module-level state from ``model_json`` and run a Flower simulation.

    Args:
        path_metrics: Path of the JSON metrics file. When the file does not
            exist it is created with the payload ``{"train": []}``.
        model_json: Template mapping. Its ``"layers"`` entry is handed to
            ``create_model_from_config`` and the whole mapping is handed to
            ``update_config_from_template``.

    Raises:
        TypeError: If ``model_json`` is ``None``.
        KeyError: If ``model_json`` has no ``"layers"`` entry.
    """
    global metrics_file, net, config_train

    # The default of None is never usable: the template is subscripted below.
    # Fail before mutating any global state, with a message that names the
    # actual problem instead of "'NoneType' object is not subscriptable".
    if model_json is None:
        raise TypeError(
            "run_simulation_flower() requires a model_json template "
            "containing a 'layers' entry; got None"
        )

    metrics_file = path_metrics
    net = create_model_from_config(model_json["layers"]).to(DEVICE)

    # NOTE(review): update_config_from_template returns an empty dict when it
    # fails, which would replace the global config here - confirm the clients
    # can cope with that, or check the result before starting the simulation.
    config_train = update_config_from_template(config_train, model_json)

    # NOTE(review): `net` and `config_train` are plain module globals. If the
    # simulation backend executes clients in separate worker processes, the
    # values assigned above may not be visible to them - TODO confirm how
    # `client` / `server_fn` obtain the model and the config.
    server = ServerApp(server_fn=server_fn)

    # One CPU per client; a full GPU per client only when running on CUDA.
    num_gpus = 1.0 if DEVICE.type == "cuda" else 0.0
    backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": num_gpus}}

    # Seed the metrics file so later code can append to an existing document.
    if not os.path.exists(metrics_file):
        with open(metrics_file, "w") as f:
            json.dump({"train": []}, f)

    run_simulation(
        server_app=server,
        client_app=client,
        num_supernodes=4,
        backend_config=backend_config,
    )
update_config_from_template Function
def update_config_from_template(config_train: dict, layers_template: dict) -> dict:
    """Merge the settings found in ``layers_template`` into ``config_train``.

    ``config_train`` is updated in place and also returned. Sections of the
    template that are missing *or* ``None`` (e.g. a JSON ``null``) are treated
    as empty, so the documented defaults apply instead of the whole update
    being abandoned.

    Args:
        config_train: Training configuration to update. Existing ``epochs``,
            ``batch_size`` and ``rounds`` (or legacy ``num_rounds``) values are
            used as fallbacks when the template does not provide them.
        layers_template: Template with optional ``layers``, ``optimizer``,
            ``loss``, ``federated`` (``name`` / ``parameters``) and ``train``
            sections.

    Returns:
        The updated ``config_train``; an empty dict if the arguments are not
        mapping-like and the update could not be performed.
    """
    try:
        # `or {}` also covers sections that are present but set to None.
        federated = layers_template.get("federated") or {}
        fed_params = federated.get("parameters") or {}
        train = layers_template.get("train") or {}

        # Fall back to the key this function itself writes ("rounds") before
        # the legacy "num_rounds", mirroring how epochs/batch_size fall back.
        rounds_default = config_train.get("rounds", config_train.get("num_rounds", 5))

        # General parameters
        config_train.update({
            "layers": layers_template.get("layers", []),
            "optimizer": layers_template.get("optimizer", {}),
            "loss": layers_template.get("loss", {}),
            "federated_name": federated.get("name", "FedAvg"),
            "federated": fed_params,
            "epochs": train.get("epochs", config_train.get("epochs", 10)),
            "batch_size": train.get("batch_size", config_train.get("batch_size", 32)),
            "rounds": train.get("rounds", rounds_default),
            "fraction_fit": fed_params.get("fraction_fit", 1.0),
            # The template uses the short "*_eval*" spellings; the config
            # uses the long "*_evaluate*" ones.
            "fraction_evaluate": fed_params.get("fraction_eval", 0.3),
            "min_fit_clients": fed_params.get("min_fit_clients", 2),
            "min_evaluate_clients": fed_params.get("min_eval_clients", 2),
            "min_available_clients": fed_params.get("min_available_clients", 2),
        })
    except (AttributeError, TypeError) as e:
        # Raised when either argument is not mapping-like (e.g. None).
        print("Error updating config_train from layers_template:", e)
        return {}

    print("Updated config_train:", config_train)
    return config_train
Sample layers_template
# Sample template: a small conv/linear network description plus optimizer,
# loss, federated-strategy and training settings.
layers_template = dict(
    layers=[
        # Two Conv2d -> ReLU -> MaxPool2d stages.
        dict(type="Conv2d", in_channels=3, out_channels=6, kernel_size=5),
        dict(type="ReLU"),
        dict(type="MaxPool2d", kernel_size=2, stride=2),
        dict(type="Conv2d", in_channels=6, out_channels=16, kernel_size=5),
        dict(type="ReLU"),
        dict(type="MaxPool2d", kernel_size=2, stride=2),
        # Flatten, then three Linear layers (first in_features = 16 * 5 * 5).
        dict(type="Flatten"),
        dict(type="Linear", in_features=400, out_features=120),
        dict(type="ReLU"),
        dict(type="Linear", in_features=120, out_features=84),
        dict(type="ReLU"),
        dict(type="Linear", in_features=84, out_features=10),
    ],
    optimizer=dict(
        name="Adam",
        parameters=dict(
            lr=0.001,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=0,
            amsgrad=False,
        ),
    ),
    loss=dict(name="CrossEntropyLoss", parameters=dict()),
    federated=dict(
        name="test2000",
        parameters=dict(
            fraction_fit=1.0,
            fraction_eval=0.3,
            min_fit_clients=4,
            min_eval_clients=2,
            min_available_clients=4,
        ),
    ),
    train=dict(epochs=10, batch_size=32, rounds=5),
    connections=[dict()],
)
Terminal output
11/25/2024 16:41:43:INFO:Received initial parameters from one random client
INFO : Evaluating initial global parameters
11/25/2024 16:41:43:INFO:Evaluating initial global parameters
INFO :
11/25/2024 16:41:43:INFO:
INFO : [ROUND 1]
11/25/2024 16:41:43:INFO:[ROUND 1]
INFO : configure_fit: strategy sampled 4 clients (out of 4)
11/25/2024 16:41:43:INFO:configure_fit: strategy sampled 4 clients (out of 4)
INFO : aggregate_fit: received 0 results and 4 failures
11/25/2024 16:41:43:INFO:aggregate_fit: received 0 results and 4 failures
What could be causing the client failures during the fit phase? Is there an issue with how config_train is updated or passed to the run_simulation function?
Any insights or suggestions would be appreciated. Thank you!