diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json index 19d4430..8eff783 100644 --- a/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json @@ -67,6 +67,24 @@ 1.20,1.20,1.15,1.25,1.20,1.15, 1.10,1.15 ] + }, + + "gnn_ref_262144": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "128xDGX-H100", + "BS": 262144, + "Hyperparams": { + "opt_base_learning_rate": 0.005 + }, + "Epochs to converge": [ + 2.40,2.55,2.35,2.45,2.50,2.35, + 2.45,2.60,2.35,2.55,2.60,2.40, + 2.40,2.30,2.30,2.45,2.60,2.50, + 2.75,2.45 + ] } } diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json index abfc4e4..854aac4 100644 --- a/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json @@ -65,5 +65,27 @@ 5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 ] - } + }, + "llama2_70b_lora_ref_128": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 1e-3, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 11520,13056,10752,12672,12288,11136,10752,13056, 10752,9984, + 11136,11136,11136,10752,11520,11136,11136,10752,11136,9984 + ] + } } diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json index a4e56d2..c7972ba 100644 --- a/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json @@ -124,6 +124,24 @@ 8, 8, 8, 8, 8, 8, 8, 8, 8, 9] }, + "ssd_ref_2560": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "20xDGX-H100", + "BS": 2560, + "Hyperparams": { + "opt_base_learning_rate": 0.000145, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, + 9, 9, 9] + }, + "ssd_ref_4096": { "Benchmark": "ssd", diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json index 7bd17f2..dbd32de 100644 --- a/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json @@ -1,5 +1,27 @@ { + "sd_ref_384": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "16xDGX-H100", + "BS": 384, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.25e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 2049024, 2049024, 2049024, 2561280, + 2561280, 2561280, 2561280, 2561280, + 2561280, 2561280, 2561280, 2561280, + 3073536, 3073536, 3073536] + }, + "sd_ref_512": { "Benchmark": "stable_diffusion",