I am working on a Flower simulation in which I dynamically update the `config_train`
dictionary from a `layers_template` dictionary that is passed in through an API.
Although `config_train` is updated successfully, every client fails during the fit
phase of the simulation, as shown in the terminal logs below. I'm trying to figure
out why this is happening.
Here is my code.

The `run_simulation_flower` function:
def run_simulation_flower(path_metrics, model_json=None):
    """Prepare the module-level state and launch a 4-supernode Flower simulation.

    Args:
        path_metrics: Path of the JSON metrics file. When the file does not
            exist yet it is created with the payload ``{"train": []}``.
        model_json: Template dictionary. Its ``"layers"`` entry is used to
            build the network, and the whole dictionary is merged into the
            global ``config_train``. The ``None`` default is only a
            placeholder; a real template is required.

    Raises:
        TypeError: If ``model_json`` is ``None``.
    """
    global metrics_file, net, config_train

    # Fail early with an explicit message instead of the opaque
    # "'NoneType' object is not subscriptable" raised further down.
    if model_json is None:
        raise TypeError(
            "run_simulation_flower() requires model_json with a 'layers' entry; got None"
        )

    metrics_file = path_metrics
    net = create_model_from_config(model_json["layers"]).to(DEVICE)

    # update_config_from_template returns {} when it fails. Rebinding the
    # global to that empty dict would discard every training setting, so the
    # previous configuration is kept in that case.
    updated_config = update_config_from_template(config_train, model_json)
    if updated_config:
        config_train = updated_config

    # NOTE(review): net / config_train are rebound as module globals here.
    # Whether the client code executed by the simulation backend observes
    # these values depends on how `client` is defined elsewhere -- confirm.
    server = ServerApp(server_fn=server_fn)

    # Give each client a full GPU only when the model lives on a CUDA device.
    gpus_per_client = 1.0 if DEVICE.type == "cuda" else 0.0
    backend_config = {
        "client_resources": {"num_cpus": 1, "num_gpus": gpus_per_client}
    }

    # Seed the metrics file so later readers always find a "train" list.
    if not os.path.exists(metrics_file):
        with open(metrics_file, "w") as f:
            json.dump({"train": []}, f)

    run_simulation(
        server_app=server,
        client_app=client,
        num_supernodes=4,
        backend_config=backend_config,
    )
The `update_config_from_template` function:
def update_config_from_template(config_train: dict, layers_template: dict) -> dict:
    """Merge the settings carried by ``layers_template`` into ``config_train``.

    ``config_train`` is updated in place and also returned. Sections of the
    template that are missing *or* present but empty/null (for example
    ``"train": None``) fall back to the defaults below.

    Args:
        config_train: Training configuration to update. Its current
            ``"epochs"``, ``"batch_size"`` and ``"num_rounds"`` values act as
            fallbacks when the template's ``"train"`` section omits them.
        layers_template: Template with optional ``"layers"``, ``"optimizer"``,
            ``"loss"``, ``"federated"`` and ``"train"`` entries.

    Returns:
        The updated ``config_train``, or an empty dict when either argument
        is not dictionary-like (the error is printed, not raised).
    """
    try:
        # `or {}` also covers sections that exist but are null.
        federated = layers_template.get("federated") or {}
        federated_params = federated.get("parameters") or {}
        train = layers_template.get("train") or {}

        updates = {
            # General parameters
            "layers": layers_template.get("layers", []),
            "optimizer": layers_template.get("optimizer", {}),
            "loss": layers_template.get("loss", {}),
            "federated_name": federated.get("name", "FedAvg"),
            "federated": federated_params,
            "epochs": train.get("epochs", config_train.get("epochs", 10)),
            "batch_size": train.get("batch_size", config_train.get("batch_size", 32)),
            # NOTE(review): the value is stored under "rounds" but the fallback
            # reads "num_rounds" -- confirm which key the consumers expect.
            "rounds": train.get("rounds", config_train.get("num_rounds", 5)),
            # The template uses the short "*_eval*" names; config_train uses
            # the long "*_evaluate*" names.
            "fraction_fit": federated_params.get("fraction_fit", 1.0),
            "fraction_evaluate": federated_params.get("fraction_eval", 0.3),
            "min_fit_clients": federated_params.get("min_fit_clients", 2),
            "min_evaluate_clients": federated_params.get("min_eval_clients", 2),
            "min_available_clients": federated_params.get("min_available_clients", 2),
        }
        config_train.update(updates)
    except (AttributeError, TypeError) as e:
        # Raised when an argument or a template section is not a mapping.
        print("Error updating config_train from layers_template:", e)
        return {}

    print("Updated config_train:", config_train)
    return config_train
A sample `layers_template`:
# Example template as received through the API: model layers, optimizer,
# loss, federated-strategy settings and training hyper-parameters.
layers_template = {
    # Layer specifications, listed in the order they appear in the model.
    "layers": [
        {"type": "Conv2d", "in_channels": 3, "out_channels": 6, "kernel_size": 5},
        {"type": "ReLU"},
        {"type": "MaxPool2d", "kernel_size": 2, "stride": 2},
        {"type": "Conv2d", "in_channels": 6, "out_channels": 16, "kernel_size": 5},
        {"type": "ReLU"},
        {"type": "MaxPool2d", "kernel_size": 2, "stride": 2},
        {"type": "Flatten"},
        # presumably 16 channels x 5 x 5 feature map after the second pool
        # -- depends on the input image size, confirm.
        {"type": "Linear", "in_features": 16 * 5 * 5, "out_features": 120},
        {"type": "ReLU"},
        {"type": "Linear", "in_features": 120, "out_features": 84},
        {"type": "ReLU"},
        {"type": "Linear", "in_features": 84, "out_features": 10},
    ],
    "optimizer": {
        "name": "Adam",
        "parameters": {
            "lr": 0.001,
            "betas": (0.9, 0.999),
            "eps": 1e-08,
            "weight_decay": 0,
            "amsgrad": False,
        },
    },
    "loss": {
        "name": "CrossEntropyLoss",
        "parameters": {},
    },
    # Strategy name plus the client-sampling parameters.
    "federated": {
        "name": "test2000",
        "parameters": {
            "fraction_fit": 1.0,
            "fraction_eval": 0.3,
            "min_fit_clients": 4,
            "min_eval_clients": 2,
            "min_available_clients": 4,
        },
    },
    "train": {
        "epochs": 10,
        "batch_size": 32,
        "rounds": 5,
    },
    "connections": [{}],
}
Terminal output:
11/25/2024 16:41:43:INFO:Received initial parameters from one random client
INFO : Evaluating initial global parameters
11/25/2024 16:41:43:INFO:Evaluating initial global parameters
INFO :
11/25/2024 16:41:43:INFO:
INFO : [ROUND 1]
11/25/2024 16:41:43:INFO:[ROUND 1]
INFO : configure_fit: strategy sampled 4 clients (out of 4)
11/25/2024 16:41:43:INFO:configure_fit: strategy sampled 4 clients (out of 4)
INFO : aggregate_fit: received 0 results and 4 failures
11/25/2024 16:41:43:INFO:aggregate_fit: received 0 results and 4 failures
What could be causing the client failures during the fit phase? Is there an issue
with how `config_train` is updated, or with how it is passed to the
`run_simulation` function?
Any insights or suggestions would be appreciated. Thank you!