optimizer.py

# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from paddle import optimizer as optim


class Momentum(object):
    """
    Simple Momentum optimizer with velocity state.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        weight_decay (WeightDecayRegularizer, optional) - The strategy of regularization.
        grad_clip (GradientClipBase, optional) - The strategy of gradient clipping.
    """

    def __init__(self,
                 learning_rate,
                 momentum,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt
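
# Usage sketch (kept as a comment so nothing runs on import). Each wrapper in
# this file is callable: construct it with hyperparameters, then call it with
# a paddle.nn.Layer to get a paddle.optimizer instance bound to the model's
# trainable parameters. The Linear layer below is just a stand-in model.
#
#     import paddle
#     model = paddle.nn.Linear(4, 2)
#     opt = Momentum(learning_rate=0.001, momentum=0.9)(model)
#     loss = model(paddle.randn([8, 4])).mean()
#     loss.backward()
#     opt.step()
#     opt.clear_grad()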


class Adam(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False,
                 **kwargs):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.group_lr = kwargs.get('group_lr', False)
        self.training_step = kwargs.get('training_step', None)

    def __call__(self, model):
        if self.group_lr:
            if self.training_step == 'LF_2':
                import paddle
                if isinstance(model,
                              paddle.fluid.dygraph.parallel.DataParallel):
                    # multi gpu
                    mlm = model._layers.head.MLM_VRM.MLM.parameters()
                    pre_mlm_pp = (model._layers.head.MLM_VRM.Prediction
                                  .pp_share.parameters())
                    pre_mlm_w = (model._layers.head.MLM_VRM.Prediction
                                 .w_share.parameters())
                else:
                    # single gpu
                    mlm = model.head.MLM_VRM.MLM.parameters()
                    pre_mlm_pp = model.head.MLM_VRM.Prediction.pp_share.parameters()
                    pre_mlm_w = model.head.MLM_VRM.Prediction.w_share.parameters()

                total = []
                for param in mlm:
                    total.append(id(param))
                for param in pre_mlm_pp:
                    total.append(id(param))
                for param in pre_mlm_w:
                    total.append(id(param))

                group_base_params = [
                    param for param in model.parameters() if id(param) in total
                ]
                group_small_params = [
                    param for param in model.parameters()
                    if id(param) not in total
                ]
                train_params = [{
                    'params': group_base_params
                }, {
                    'params': group_small_params,
                    'learning_rate': self.learning_rate.values[0] * 0.1
                }]
            else:
                print('group lr currently only supports VisionLAN in the '
                      'LF_2 training step')
                train_params = [
                    param for param in model.parameters()
                    if param.trainable is True
                ]
        else:
            train_params = [
                param for param in model.parameters() if param.trainable is True
            ]

        opt = optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            parameters=train_params)
        return opt
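
# Usage sketch for the group_lr branch (comment only, not executed). With
# group_lr=True and training_step='LF_2', the VisionLAN MLM/Prediction
# parameters keep the base learning rate while all remaining parameters get
# 0.1x of it. Note that self.learning_rate is then expected to be a schedule
# object exposing a .values list (as used above), not a plain float.
# `lr_schedule` and `visionlan_model` are hypothetical names.
#
#     opt = Adam(learning_rate=lr_schedule,
#                group_lr=True,
#                training_step='LF_2')(visionlan_model)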


class RMSProp(object):
    """
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        rho (float) - rho value in equation.
        epsilon (float) - avoid division by zero, default is 1e-6.
        weight_decay (WeightDecayRegularizer, optional) - The strategy of regularization.
        grad_clip (GradientClipBase, optional) - The strategy of gradient clipping.
    """

    def __init__(self,
                 learning_rate,
                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(RMSProp, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt
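
# Usage sketch (comment only, not executed). learning_rate may be a float or
# a paddle.optimizer.lr.LRScheduler; with a scheduler, remember to call
# scheduler.step() during training in addition to opt.step(). `model` is a
# hypothetical paddle.nn.Layer.
#
#     import paddle
#     scheduler = paddle.optimizer.lr.CosineAnnealingDecay(
#         learning_rate=0.001, T_max=100)
#     opt = RMSProp(learning_rate=scheduler, momentum=0.9)(model)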


class Adadelta(object):
    def __init__(self,
                 learning_rate=0.001,
                 epsilon=1e-08,
                 rho=0.95,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 **kwargs):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.rho = rho
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Adadelta(
            learning_rate=self.learning_rate,
            epsilon=self.epsilon,
            rho=self.rho,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            parameters=train_params)
        return opt
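
# Usage sketch (comment only, not executed). As with the other wrappers,
# grad_clip accepts a paddle clipping strategy such as
# paddle.nn.ClipGradByGlobalNorm; `model` is a hypothetical paddle.nn.Layer.
#
#     import paddle
#     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
#     opt = Adadelta(learning_rate=0.01, rho=0.9, grad_clip=clip)(model)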


class AdamW(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=0.01,
                 multi_precision=False,
                 grad_clip=None,
                 no_weight_decay_name=None,
                 one_dim_param_no_weight_decay=False,
                 name=None,
                 lazy_mode=False,
                 **args):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = 0.01 if weight_decay is None else weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.multi_precision = multi_precision
        self.no_weight_decay_name_list = (
            no_weight_decay_name.split() if no_weight_decay_name else [])
        self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay

    def __call__(self, model):
        parameters = [
            param for param in model.parameters() if param.trainable is True
        ]
        self.no_weight_decay_param_name_list = [
            p.name for n, p in model.named_parameters()
            if any(nd in n for nd in self.no_weight_decay_name_list)
        ]
        if self.one_dim_param_no_weight_decay:
            self.no_weight_decay_param_name_list += [
                p.name for n, p in model.named_parameters()
                if len(p.shape) == 1
            ]
        opt = optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            apply_decay_param_fun=self._apply_decay_param_fun)
        return opt

    def _apply_decay_param_fun(self, name):
        return name not in self.no_weight_decay_param_name_list
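
# Usage sketch (comment only, not executed). no_weight_decay_name is a
# whitespace-separated string of name fragments; any parameter whose name
# contains one of them is excluded from weight decay via
# apply_decay_param_fun, and one_dim_param_no_weight_decay=True additionally
# excludes 1-D parameters such as biases and norm scales. `model` is a
# hypothetical paddle.nn.Layer.
#
#     opt = AdamW(learning_rate=0.001,
#                 weight_decay=0.05,
#                 no_weight_decay_name='norm pos_embed',
#                 one_dim_param_no_weight_decay=True)(model)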