diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index fc260c7e99af4..95f4092d35803 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -43,14 +43,23 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Grad", "(Tensor) Input gradient"); AddInput("Moment", "(Tensor) Second moment"); AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("MomentOut", "(Tensor) Output second moment"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable(); AddAttr("epsilon", "(float, default 1.0e-6) " "Constant for numerical stability") .SetDefault(1.0e-6f); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); AddComment(R"DOC( Adaptive Gradient Algorithm (Adagrad). diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 8cbba11c7b256..05d62debc3d94 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -205,6 +205,7 @@ std::map> op_ins_map = { {"sparse_attention", {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}}, {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, + {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, {"nce", {"Input", @@ -361,6 +362,7 @@ std::map> op_outs_map = { "Beta2PowOut", "MasterParamOut"}}, {"sgd", {"ParamOut", "MasterParamOut"}}, + {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}}, {"lamb", {"ParamOut", "Moment1Out", @@ -399,7 +401,7 @@ std::map> op_passing_outs_map = { "MasterParamOut"}}, {"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}}, {"adadelta", {"ParamOut", "AvgSquaredGradOut", "AvgSquaredUpdateOut"}}, - {"adagrad", {"ParamOut", "MomentOut"}}, + {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}}, {"adamax", {"ParamOut", "MomentOut", "InfNormOut"}}, {"dpsgd", {"ParamOut"}}, {"decayed_adagrad", {"ParamOut", "MomentOut"}}, diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index ed66438513139..b3cebfbe05819 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -29,15 +29,16 @@ inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out) - op : adagrad_ - args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float epsilon) - output : Tensor(param_out), Tensor(moment_out) + args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon, bool multi_precision) + output : Tensor(param_out), Tensor(moment_out), Tensor(master_param_out) infer_meta : func : AdagradInferMeta kernel : - func : adagrad {dense, dense, dense, dense -> dense, dense} - adagrad_dense_param_sparse_grad {dense, selected_rows, dense, dense -> dense, dense} + func : adagrad {dense, dense, dense, dense, dense -> dense, dense, dense} + adagrad_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense-> dense, dense, dense} data_type : param - inplace : (param -> param_out), (moment -> moment_out) + optional : master_param + inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out) - op : adam_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 61dc9e31acd75..eac84e86256aa 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -74,9 +74,12 @@ void AdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, + const MetaTensor& master_param, float epsilon, + bool multi_precision, MetaTensor* param_out, - MetaTensor* moment_out) { + MetaTensor* moment_out, + MetaTensor* master_param_out) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_EQ( phi::product(lr_dims), diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 1118604275b6f..b954fdb90aa1d 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -53,9 +53,12 @@ void AdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, + const MetaTensor& master_param, float epsilon, + bool multi_precision, MetaTensor* param_out, - MetaTensor* moment_out); + MetaTensor* moment_out, + MetaTensor* master_param_out); void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& grad, diff --git a/paddle/phi/kernels/adagrad_kernel.h b/paddle/phi/kernels/adagrad_kernel.h index cac662fddf264..d676bee45ecfd 100644 --- a/paddle/phi/kernels/adagrad_kernel.h +++ b/paddle/phi/kernels/adagrad_kernel.h @@ -25,9 +25,12 @@ void AdagradDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& moment, const DenseTensor& learning_rate, + const paddle::optional& master_param, float epsilon, + bool multi_precision, DenseTensor* param_out, - DenseTensor* moment_out); + DenseTensor* moment_out, + DenseTensor* master_param_outs); template void AdagradSparseKernel(const Context& dev_ctx, @@ -35,8 +38,11 @@ void AdagradSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& moment, const DenseTensor& learning_rate, + const paddle::optional& master_param, float epsilon, + bool multi_precision, DenseTensor* param_out, - DenseTensor* moment_out); + DenseTensor* moment_out, + DenseTensor* master_param_outs); } // namespace phi diff --git a/paddle/phi/kernels/cpu/adagrad_kernel.cc b/paddle/phi/kernels/cpu/adagrad_kernel.cc index fd4c185ab519a..ca5f4ef80d301 100644 --- a/paddle/phi/kernels/cpu/adagrad_kernel.cc +++ b/paddle/phi/kernels/cpu/adagrad_kernel.cc @@ -28,6 +28,42 @@ size_t FindPos(const std::vector& rows, int64_t value) { } } // namespace +template +struct DenseAdagradFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& param_t, + const DenseTensor& grad_t, + const DenseTensor& moment_t, + const DenseTensor& learning_rate, + const paddle::optional& master_param, + float epsilon_t, + bool multi_precision, + DenseTensor* param_out_tensor, + DenseTensor* moment_out_tensor, + DenseTensor* master_param_outs) { + ctx.template Alloc(param_out_tensor); + ctx.template Alloc(moment_out_tensor); + + T epsilon = static_cast(epsilon_t); + + auto param = EigenVector::Flatten(param_t); + + auto grad = EigenVector::Flatten(grad_t); + + auto moment = EigenVector::Flatten(moment_t); + + auto param_out = EigenVector::Flatten(*param_out_tensor); + auto moment_out = EigenVector::Flatten(*moment_out_tensor); + auto place = *ctx.eigen_device(); + + moment_out.device(place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + auto* lr = learning_rate.data(); + param_out.device(place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } +}; + template struct SparseAdagradFunctor { void operator()(const phi::CPUContext& context, @@ -67,6 +103,8 @@ struct SparseAdagradFunctor { template struct SparseAdagradFunctor; template struct SparseAdagradFunctor; +template struct DenseAdagradFunctor; +template struct DenseAdagradFunctor; } // namespace phi diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu index 8f3d7a3c5580a..14046ed4491c6 100644 --- a/paddle/phi/kernels/gpu/adagrad_kernel.cu +++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/adagrad_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" @@ -23,6 +25,79 @@ namespace phi { +template +__global__ void AdagradGPUKernel(const T* param, + const T* grad, + const MT* moment, + const MT* lr, + const MT* master_param, + MT epsilon, + T* param_out, + MT* moment_out, + MT* master_param_out, + int num) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; + MT lr_data = static_cast(lr[0]); + + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + MT grad_data = static_cast(grad[i]); + MT moment_out_data = static_cast(moment[i]) + grad_data * grad_data; + moment_out[i] = static_cast(moment_out_data); + auto in = master_param_out ? master_param[i] : static_cast(param[i]); + MT param_out_data = + in - (lr_data * grad_data) / (sqrt(moment_out_data) + epsilon); + + param_out[i] = static_cast(param_out_data); + + if (master_param_out) { + master_param_out[i] = param_out_data; + } + } +} + +template +struct DenseAdagradFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& param_t, + const DenseTensor& grad_t, + const DenseTensor& moment_t, + const DenseTensor& learning_rate, + const paddle::optional& master_param, + float epsilon_t, + bool multi_precision, + DenseTensor* param_out_tensor, + DenseTensor* moment_out_tensor, + DenseTensor* master_param_outs) { + using MPDType = typename phi::dtype::template MPTypeTrait::Type; + T* param_out_data = ctx.template Alloc(param_out_tensor); + MPDType* moment_out_data = ctx.template Alloc(moment_out_tensor); + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? ctx.template Alloc(master_param_outs) + : nullptr; + + MPDType epsilon = static_cast(epsilon_t); + + int numel = param_t.numel(); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 1); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = ctx.stream(); + AdagradGPUKernel + <<>>(param_t.data(), + grad_t.data(), + moment_t.data(), + learning_rate.data(), + master_in_data, + epsilon, + param_out_data, + moment_out_data, + master_out_data, + numel); + } +}; + template __global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, @@ -123,11 +198,19 @@ struct SparseAdagradFunctor { template struct SparseAdagradFunctor; template struct SparseAdagradFunctor; +template struct DenseAdagradFunctor; +template struct DenseAdagradFunctor; +template struct DenseAdagradFunctor; } // namespace phi -PD_REGISTER_KERNEL( - adagrad, GPU, ALL_LAYOUT, phi::AdagradDenseKernel, float, double) {} +PD_REGISTER_KERNEL(adagrad, + GPU, + ALL_LAYOUT, + phi::AdagradDenseKernel, + float, + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(adagrad_dense_param_sparse_grad, GPU, diff --git a/paddle/phi/kernels/impl/adagrad_kernel_impl.h b/paddle/phi/kernels/impl/adagrad_kernel_impl.h index ca349e8b42693..3156967fc5017 100644 --- a/paddle/phi/kernels/impl/adagrad_kernel_impl.h +++ b/paddle/phi/kernels/impl/adagrad_kernel_impl.h @@ -30,6 +30,21 @@ struct SparseAdagradFunctor { DenseTensor* param); }; +template +struct DenseAdagradFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& param_t, + const DenseTensor& grad_t, + const DenseTensor& moment_t, + const DenseTensor& learning_rate, + const paddle::optional& master_param, + float epsilon_t, + bool multi_precision, + DenseTensor* param_out_tensor, + DenseTensor* moment_out_tensor, + DenseTensor* master_param_outs); +}; + template phi::SelectedRows SquareSelectedRows(const DeviceContext& context, const phi::SelectedRows& input) { @@ -50,35 +65,24 @@ void AdagradDenseKernel(const Context& ctx, const DenseTensor& grad_t, const DenseTensor& moment_t, const DenseTensor& learning_rate, + const paddle::optional& master_param, float epsilon_t, + bool multi_precision, DenseTensor* param_out_tensor, - DenseTensor* moment_out_tensor) { - ctx.template Alloc(param_out_tensor); - ctx.template Alloc(moment_out_tensor); - - T epsilon = static_cast(epsilon_t); - - auto param = EigenVector::Flatten(param_t); - - auto grad = EigenVector::Flatten(grad_t); - - auto moment = EigenVector::Flatten(moment_t); - - auto param_out = EigenVector::Flatten(*param_out_tensor); - auto moment_out = EigenVector::Flatten(*moment_out_tensor); - auto place = *ctx.eigen_device(); - - moment_out.device(place) = moment + grad * grad; - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - if (paddle::platform::is_cpu_place(ctx.GetPlace())) { - auto* lr = learning_rate.data(); - param_out.device(place) = - param - lr[0] * grad / (moment_out.sqrt() + epsilon); - } else { - auto lr = EigenVector::Flatten(learning_rate); - param_out.device(place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); - } + DenseTensor* moment_out_tensor, + DenseTensor* master_param_outs) { + DenseAdagradFunctor functor; + functor(ctx, + param_t, + grad_t, + moment_t, + learning_rate, + master_param, + epsilon_t, + multi_precision, + param_out_tensor, + moment_out_tensor, + master_param_outs); } template @@ -87,9 +91,12 @@ void AdagradSparseKernel(const Context& ctx, const SelectedRows& grad_t, const DenseTensor& moment_t, const DenseTensor& learning_rate, + const paddle::optional& master_param, float epsilon_t, + bool multi_precision, DenseTensor* param_out, - DenseTensor* moment_out) { + DenseTensor* moment_out, + DenseTensor* master_param_outs) { auto* param_out_tensor = param_out; auto* moment_out_tensor = moment_out; diff --git a/paddle/phi/kernels/xpu/adagrad_kernel.cc b/paddle/phi/kernels/xpu/adagrad_kernel.cc index d31a93aeb9bca..1a4e4c8e1c776 100644 --- a/paddle/phi/kernels/xpu/adagrad_kernel.cc +++ b/paddle/phi/kernels/xpu/adagrad_kernel.cc @@ -24,9 +24,12 @@ void AdagradDenseKernel(const Context& ctx, const DenseTensor& grad, const DenseTensor& moment, const DenseTensor& learning_rate, + const paddle::optional& master_param, float epsilon_t, + bool multi_precision, DenseTensor* param_out_tensor, - DenseTensor* moment_out_tensor) { + DenseTensor* moment_out_tensor, + DenseTensor* master_param_outs) { ctx.template Alloc(param_out_tensor); ctx.template Alloc(moment_out_tensor); diff --git a/paddle/phi/ops/compat/adagrad_sig.cc b/paddle/phi/ops/compat/adagrad_sig.cc index 4d9a8a65d7891..2a79261d21feb 100644 --- a/paddle/phi/ops/compat/adagrad_sig.cc +++ b/paddle/phi/ops/compat/adagrad_sig.cc @@ -18,15 +18,17 @@ namespace phi { KernelSignature AdagradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature("adagrad", - {"Param", "Grad", "Moment", "LearningRate"}, - {"epsilon"}, - {"ParamOut", "MomentOut"}); + return KernelSignature( + "adagrad", + {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, + {"epsilon", "multi_precision"}, + {"ParamOut", "MomentOut", "MasterParamOut"}); } else if (ctx.IsSelectedRowsInput("Grad")) { - return KernelSignature("adagrad_dense_param_sparse_grad", - {"Param", "Grad", "Moment", "LearningRate"}, - {"epsilon"}, - {"ParamOut", "MomentOut"}); + return KernelSignature( + "adagrad_dense_param_sparse_grad", + {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, + {"epsilon", "multi_precision"}, + {"ParamOut", "MomentOut", "MasterParamOut"}); } return KernelSignature("unregistered", {}, {}, {}); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e8864e23198f7..11d3d179e62cc 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2030,6 +2030,7 @@ class AdagradOptimizer(Optimizer): some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. @@ -2066,6 +2067,7 @@ def __init__( parameter_list=None, regularization=None, grad_clip=None, + multi_precision=False, name=None, initial_accumulator_value=0.0, ): @@ -2079,13 +2081,83 @@ def __init__( name=name, ) self.type = "adagrad" + self._multi_precision = multi_precision self._epsilon = epsilon self.initial_accumulator_value = initial_accumulator_value + self._master_weights = {} + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._moment_acc_str, master_p) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." + ) self._add_accumulator( self._moment_acc_str, p, @@ -2098,30 +2170,52 @@ def _append_optimize_op(self, block, param_and_grad): moment_acc = self._get_accumulator( self._moment_acc_str, param_and_grad[0] ) + + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) + if in_dygraph_mode(): _C_ops.adagrad_( param_and_grad[0], param_and_grad[1], moment_acc, self._create_param_lr(param_and_grad), + master_weight, self._epsilon, + find_master, ) return None else: # Create the adagrad optimizer op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + outputs = { + "ParamOut": param_and_grad[0], + "MomentOut": moment_acc, + } + + attrs = {"epsilon": self._epsilon, "multi_precision": find_master} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + adagrad_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment_acc, - }, - attrs={"epsilon": self._epsilon}, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py index 8eed1871a8759..cc24a7dbaf403 100644 --- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py @@ -23,8 +23,24 @@ from paddle.fluid.op import Operator -def adamgrad_wrapper(param, grad, moment, learning_rate, epsilon): - paddle._C_ops.adagrad_(param, grad, moment, learning_rate, epsilon) +def adamgrad_wrapper( + param, + grad, + moment, + learning_rate, + master_weight=None, + epsilon=1e-8, + multi_precision=False, +): + paddle._C_ops.adagrad_( + param, + grad, + moment, + learning_rate, + master_weight, + epsilon, + multi_precision, + ) class TestAdagradOp1(OpTest): @@ -79,7 +95,7 @@ def setUp(self): 'LearningRate': np.array([lr]).astype("float32"), } - self.attrs = {'epsilon': epsilon} + self.attrs = {'epsilon': epsilon, "multi_precision": False} moment_out = moment + grad * grad param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) @@ -124,7 +140,6 @@ def check_with_place(self, place): moment_np_array = np.full((height, row_numel), 2.0).astype("float32") moment.set(moment_np_array, place) - # create and run sgd operator adagrad_op = Operator( "adagrad", Param='Param', @@ -196,6 +211,249 @@ def test_sparse_adagrad(self): self.check_with_place(place) +class TestAdamaxMultiPrecision2_0(unittest.TestCase): + def dygraph_adagrad_mp(self, mp, use_amp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.Adagrad(0.1, parameters=model.parameters()) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_adagrad_mp(self, mp, use_amp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.Adagrad(0.1) + optimizer._multi_precision = mp + + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output11_dy, params1_dy = self.dygraph_adagrad_mp( + use_amp=False, mp=True + ) + output22_dy, params2_dy = self.dygraph_adagrad_mp( + use_amp=False, mp=False + ) + output1_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_adagrad_mp(use_amp=True, mp=False) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + np.testing.assert_allclose( + output11_dy.astype('float32').numpy(), + output22_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output11_st = self.static_adagrad_mp(use_amp=False, mp=True) + output22_st = self.static_adagrad_mp(use_amp=False, mp=False) + output1_st = self.static_adagrad_mp(use_amp=True, mp=True) + output2_st = self.static_adagrad_mp(use_amp=True, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output11_st[idx].astype('float32'), + output22_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + +class TestAdagradMultiPrecision1_0(unittest.TestCase): + def dygraph_adagrad_mp(self, use_amp, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.fluid.optimizer.Adagrad( + learning_rate=0.001, + parameter_list=model.parameters(), + ) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_gradients() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.minimize(loss) + optimizer.clear_gradients() + + return output, model.parameters() + + def static_adagrad_mp(self, use_amp, mp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.fluid.optimizer.Adagrad(learning_rate=0.001) + + optimizer._multi_precision = mp + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output11_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=True, mp=True) + output22_dy, params2_dy = self.dygraph_adagrad_mp( + use_amp=True, mp=False + ) + output1_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=False, mp=True) + output2_dy, params2_dy = self.dygraph_adagrad_mp( + use_amp=False, mp=False + ) + np.testing.assert_allclose( + output11_dy.astype('float32').numpy(), + output22_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output11_st = self.static_adagrad_mp(use_amp=True, mp=False) + output22_st = self.static_adagrad_mp(use_amp=True, mp=True) + output1_st = self.static_adagrad_mp(use_amp=False, mp=True) + output2_st = self.static_adagrad_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index 6bea5773270bb..3204ab0f516a0 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings -from ..fluid import framework +import paddle + +from ..fluid import core, framework, unique_name +from ..fluid.layer_helper import LayerHelper from .optimizer import Optimizer __all__ = [] @@ -60,6 +64,7 @@ class Adagrad(Optimizer): some derived class of ``GradientClipBase`` . There are three cliping strategies, ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, meaning there is no gradient clipping. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. @@ -112,6 +117,7 @@ def __init__( parameters=None, weight_decay=None, grad_clip=None, + multi_precision=False, name=None, initial_accumulator_value=0.0, ): @@ -126,12 +132,72 @@ def __init__( ) self.type = "adagrad" self._epsilon = epsilon + self._multi_precision = multi_precision + self._master_weights = {} self.initial_accumulator_value = initial_accumulator_value self._default_dict = { 'epsilon': epsilon, 'initial_accumulator_value': initial_accumulator_value, } + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] + def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -139,6 +205,18 @@ def _create_accumulators(self, block, parameters): parameters = self._update_param_group(parameters) for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._moment_acc_str, master_p) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Momentum optimizer." + ) self._add_accumulator( self._moment_acc_str, p, @@ -154,17 +232,37 @@ def _append_optimize_op(self, block, param_and_grad): moment_acc = self._get_accumulator( self._moment_acc_str, param_and_grad[0] ) + + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) + # Create the adagrad optimizer op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + + outputs = {"ParamOut": param_and_grad[0], "MomentOut": moment_acc} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + adagrad_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}, + inputs=inputs, + outputs=outputs, + attrs={"epsilon": self._epsilon, "multi_precision": find_master}, stop_gradient=True, )