rec_att_head.py

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np


class AttentionHead(nn.Layer):
    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionHead, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        # GRU-based attention decoder cell plus the projection that maps
        # each decoder hidden state to class scores.
        self.attention_cell = AttentionGRUCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = paddle.shape(inputs)[0]
        num_steps = batch_max_length

        hidden = paddle.zeros((batch_size, self.hidden_size))
        output_hiddens = []

        if targets is not None:
            # Training: teacher forcing with the ground-truth character
            # at every decoding step.
            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)
        else:
            # Inference: greedy decoding, feeding the previous prediction
            # back as the next input character.
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            char_onehots = None
            outputs = None
            alpha = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                probs_step = self.generator(outputs)
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)
                next_input = probs_step.argmax(axis=1)
                targets = next_input
            if not self.training:
                probs = paddle.nn.functional.softmax(probs, axis=2)
        return probs


class AttentionGRUCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionGRUCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        self.rnn = nn.GRUCell(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive (Bahdanau-style) attention over the encoder sequence
        # batch_H, scored against the previous decoder hidden state.
        batch_H_proj = self.i2h(batch_H)
        prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
        res = paddle.add(batch_H_proj, prev_hidden_proj)
        res = paddle.tanh(res)
        e = self.score(res)

        alpha = F.softmax(e, axis=1)
        alpha = paddle.transpose(alpha, [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
        concat_context = paddle.concat([context, char_onehots], 1)
        cur_hidden = self.rnn(concat_context, prev_hidden)
        return cur_hidden, alpha


class AttentionLSTM(nn.Layer):
    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionLSTM, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        self.attention_cell = AttentionLSTMCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = inputs.shape[0]
        num_steps = batch_max_length

        # LSTM state is an (h, c) tuple.
        hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
            (batch_size, self.hidden_size)))
        output_hiddens = []

        if targets is not None:
            for i in range(num_steps):
                # one-hot vector for the i-th ground-truth character
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                hidden = (hidden[1][0], hidden[1][1])
                output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)
        else:
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            char_onehots = None
            alpha = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                probs_step = self.generator(hidden[0])
                hidden = (hidden[1][0], hidden[1][1])
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)
                next_input = probs_step.argmax(axis=1)
                targets = next_input
            if not self.training:
                probs = paddle.nn.functional.softmax(probs, axis=2)
        return probs


class AttentionLSTMCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        if not use_gru:
            self.rnn = nn.LSTMCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)
        else:
            self.rnn = nn.GRUCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # prev_hidden is an (h, c) tuple; only h drives the attention scores.
        batch_H_proj = self.i2h(batch_H)
        prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1)
        res = paddle.add(batch_H_proj, prev_hidden_proj)
        res = paddle.tanh(res)
        e = self.score(res)

        alpha = F.softmax(e, axis=1)
        alpha = paddle.transpose(alpha, [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
        concat_context = paddle.concat([context, char_onehots], 1)
        cur_hidden = self.rnn(concat_context, prev_hidden)
        return cur_hidden, alpha
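

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. It assumes the head
    # receives sequence features of shape [N, T, in_channels] (e.g. the output
    # of a recognition backbone/neck) and that out_channels already includes
    # any special tokens the surrounding pipeline expects. All concrete sizes
    # below are illustrative only.
    paddle.seed(0)
    batch_size, seq_len, in_channels = 2, 32, 96
    hidden_size, num_classes, max_len = 48, 38, 25

    head = AttentionHead(in_channels, num_classes, hidden_size)
    feats = paddle.randn([batch_size, seq_len, in_channels])

    # Training-style call: teacher forcing with (dummy) target character ids;
    # raw logits are returned for the downstream loss.
    head.train()
    targets = paddle.randint(
        0, num_classes, shape=[batch_size, max_len], dtype="int64")
    logits = head(feats, targets=targets, batch_max_length=max_len)
    print(logits.shape)  # [2, 25, 38]

    # Inference-style call: greedy decoding; softmax is applied internally
    # because the layer is in eval mode and no targets are given.
    head.eval()
    probs = head(feats, batch_max_length=max_len)
    print(probs.shape)  # [2, 25, 38]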