# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. """
  15. This code is refer from:
  16. https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/mmocr/models/textrecog/backbones/table_resnet_extra.py
  17. """
  18. import paddle
  19. import paddle.nn as nn
  20. import paddle.nn.functional as F


class BasicBlock(nn.Layer):
    """ResNet basic block with an optional global-context (GCB) attention
    module applied after the second convolution."""
    expansion = 1

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 downsample=None,
                 gcb_config=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2D(
            inplanes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm2D(planes, momentum=0.9)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2D(
            planes, planes, kernel_size=3, stride=1, padding=1,
            bias_attr=False)
        self.bn2 = nn.BatchNorm2D(planes, momentum=0.9)
        self.downsample = downsample
        self.stride = stride
        self.gcb_config = gcb_config

        if self.gcb_config is not None:
            gcb_ratio = gcb_config['ratio']
            gcb_headers = gcb_config['headers']
            att_scale = gcb_config['att_scale']
            fusion_type = gcb_config['fusion_type']
            self.context_block = MultiAspectGCAttention(
                inplanes=planes,
                ratio=gcb_ratio,
                headers=gcb_headers,
                att_scale=att_scale,
                fusion_type=fusion_type)

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.gcb_config is not None:
            out = self.context_block(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


def get_gcb_config(gcb_config, layer):
    # Return the GCB config only if attention is enabled for this stage.
    if gcb_config is None or not gcb_config['layers'][layer]:
        return None
    else:
        return gcb_config


class TableResNetExtra(nn.Layer):
    """ResNet-style backbone for TableMASTER. Returns three feature maps
    at 1/2, 1/4 and 1/8 of the input resolution."""

    def __init__(self, layers, in_channels=3, gcb_config=None):
        assert len(layers) >= 4

        super(TableResNetExtra, self).__init__()
        self.inplanes = 128
        self.conv1 = nn.Conv2D(
            in_channels, 64, kernel_size=3, stride=1, padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm2D(64)
        self.relu1 = nn.ReLU()

        self.conv2 = nn.Conv2D(
            64, 128, kernel_size=3, stride=1, padding=1, bias_attr=False)
        self.bn2 = nn.BatchNorm2D(128)
        self.relu2 = nn.ReLU()

        self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2)

        self.layer1 = self._make_layer(
            BasicBlock,
            256,
            layers[0],
            stride=1,
            gcb_config=get_gcb_config(gcb_config, 0))

        self.conv3 = nn.Conv2D(
            256, 256, kernel_size=3, stride=1, padding=1, bias_attr=False)
        self.bn3 = nn.BatchNorm2D(256)
        self.relu3 = nn.ReLU()

        self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2)

        self.layer2 = self._make_layer(
            BasicBlock,
            256,
            layers[1],
            stride=1,
            gcb_config=get_gcb_config(gcb_config, 1))

        self.conv4 = nn.Conv2D(
            256, 256, kernel_size=3, stride=1, padding=1, bias_attr=False)
        self.bn4 = nn.BatchNorm2D(256)
        self.relu4 = nn.ReLU()

        self.maxpool3 = nn.MaxPool2D(kernel_size=2, stride=2)

        self.layer3 = self._make_layer(
            BasicBlock,
            512,
            layers[2],
            stride=1,
            gcb_config=get_gcb_config(gcb_config, 2))

        self.conv5 = nn.Conv2D(
            512, 512, kernel_size=3, stride=1, padding=1, bias_attr=False)
        self.bn5 = nn.BatchNorm2D(512)
        self.relu5 = nn.ReLU()

        self.layer4 = self._make_layer(
            BasicBlock,
            512,
            layers[3],
            stride=1,
            gcb_config=get_gcb_config(gcb_config, 3))

        self.conv6 = nn.Conv2D(
            512, 512, kernel_size=3, stride=1, padding=1, bias_attr=False)
        self.bn6 = nn.BatchNorm2D(512)
        self.relu6 = nn.ReLU()

        self.out_channels = [256, 256, 512]

    def _make_layer(self, block, planes, blocks, stride=1, gcb_config=None):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2D(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias_attr=False),
                nn.BatchNorm2D(planes * block.expansion), )

        layers = []
        # GCB attention (if any) is applied only in the first block of a stage.
        layers.append(
            block(
                self.inplanes,
                planes,
                stride,
                downsample,
                gcb_config=gcb_config))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        f = []
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)

        x = self.maxpool1(x)
        x = self.layer1(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        f.append(x)  # 1/2 resolution, 256 channels

        x = self.maxpool2(x)
        x = self.layer2(x)

        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        f.append(x)  # 1/4 resolution, 256 channels

        x = self.maxpool3(x)
        x = self.layer3(x)

        x = self.conv5(x)
        x = self.bn5(x)
        x = self.relu5(x)

        x = self.layer4(x)
        x = self.conv6(x)
        x = self.bn6(x)
        x = self.relu6(x)
        f.append(x)  # 1/8 resolution, 512 channels

        return f


class MultiAspectGCAttention(nn.Layer):
    """Multi-aspect global-context attention (GCNet-style): global context
    is pooled per header and fused back into the input feature map."""

    def __init__(self,
                 inplanes,
                 ratio,
                 headers,
                 pooling_type='att',
                 att_scale=False,
                 fusion_type='channel_add'):
        super(MultiAspectGCAttention, self).__init__()
        assert pooling_type in ['avg', 'att']
        assert fusion_type in ['channel_add', 'channel_mul', 'channel_concat']
        # inplanes must be evenly divisible by headers
        assert inplanes % headers == 0 and inplanes >= 8

        self.headers = headers
        self.inplanes = inplanes
        self.ratio = ratio
        self.planes = int(inplanes * ratio)
        self.pooling_type = pooling_type
        self.fusion_type = fusion_type
        self.att_scale = att_scale
        self.single_header_inplanes = int(inplanes / headers)

        if pooling_type == 'att':
            self.conv_mask = nn.Conv2D(
                self.single_header_inplanes, 1, kernel_size=1)
            self.softmax = nn.Softmax(axis=2)
        else:
            self.avg_pool = nn.AdaptiveAvgPool2D(1)

        if fusion_type == 'channel_add':
            self.channel_add_conv = nn.Sequential(
                nn.Conv2D(
                    self.inplanes, self.planes, kernel_size=1),
                nn.LayerNorm([self.planes, 1, 1]),
                nn.ReLU(),
                nn.Conv2D(
                    self.planes, self.inplanes, kernel_size=1))
        elif fusion_type == 'channel_concat':
            self.channel_concat_conv = nn.Sequential(
                nn.Conv2D(
                    self.inplanes, self.planes, kernel_size=1),
                nn.LayerNorm([self.planes, 1, 1]),
                nn.ReLU(),
                nn.Conv2D(
                    self.planes, self.inplanes, kernel_size=1))
            # for concat
            self.cat_conv = nn.Conv2D(
                2 * self.inplanes, self.inplanes, kernel_size=1)
        elif fusion_type == 'channel_mul':
            self.channel_mul_conv = nn.Sequential(
                nn.Conv2D(
                    self.inplanes, self.planes, kernel_size=1),
                nn.LayerNorm([self.planes, 1, 1]),
                nn.ReLU(),
                nn.Conv2D(
                    self.planes, self.inplanes, kernel_size=1))

    def spatial_pool(self, x):
        batch, channel, height, width = x.shape
        if self.pooling_type == 'att':
            # [N*headers, C', H, W], where C = headers * C'
            x = x.reshape([
                batch * self.headers, self.single_header_inplanes, height,
                width
            ])
            input_x = x

            # [N*headers, C', H * W]
            input_x = input_x.reshape([
                batch * self.headers, self.single_header_inplanes,
                height * width
            ])

            # [N*headers, 1, C', H * W]
            input_x = input_x.unsqueeze(1)
            # [N*headers, 1, H, W]
            context_mask = self.conv_mask(x)
            # [N*headers, 1, H * W]
            context_mask = context_mask.reshape(
                [batch * self.headers, 1, height * width])

            # scale variance
            if self.att_scale and self.headers > 1:
                context_mask = context_mask / math.sqrt(
                    self.single_header_inplanes)

            # [N*headers, 1, H * W]
            context_mask = self.softmax(context_mask)

            # [N*headers, 1, H * W, 1]
            context_mask = context_mask.unsqueeze(-1)
            # [N*headers, 1, C', 1] = [N*headers, 1, C', H * W] * [N*headers, 1, H * W, 1]
            context = paddle.matmul(input_x, context_mask)

            # [N, headers * C', 1, 1]
            context = context.reshape(
                [batch, self.headers * self.single_header_inplanes, 1, 1])
        else:
            # [N, C, 1, 1]
            context = self.avg_pool(x)

        return context

    def forward(self, x):
        # [N, C, 1, 1]
        context = self.spatial_pool(x)
        out = x

        if self.fusion_type == 'channel_mul':
            # [N, C, 1, 1]
            channel_mul_term = F.sigmoid(self.channel_mul_conv(context))
            out = out * channel_mul_term
        elif self.fusion_type == 'channel_add':
            # [N, C, 1, 1]
            channel_add_term = self.channel_add_conv(context)
            out = out + channel_add_term
        else:  # channel_concat
            # [N, C, 1, 1]
            channel_concat_term = self.channel_concat_conv(context)

            # Broadcast the pooled context over the spatial dims, then
            # concatenate along the channel axis and project back.
            _, _, H, W = out.shape
            out = paddle.concat(
                [out, channel_concat_term.expand([-1, -1, H, W])], axis=1)
            out = self.cat_conv(out)
            out = F.layer_norm(out, [self.inplanes, H, W])
            out = F.relu(out)

        return out
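

# The snippet below is a minimal smoke-test sketch, not part of the original
# module: it wires the backbone up with a plausible GCB configuration and runs
# a dummy forward pass to check the output pyramid. The layer counts
# [1, 2, 5, 3], the gcb_config values, and the 1 x 3 x 480 x 480 input are
# illustrative assumptions, not values mandated by this file.
if __name__ == "__main__":
    gcb_config = {
        'ratio': 0.0625,
        'headers': 1,
        'att_scale': False,
        'fusion_type': 'channel_add',
        # enable global-context attention for stages 2-4 only
        'layers': [False, True, True, True],
    }
    model = TableResNetExtra(
        layers=[1, 2, 5, 3], in_channels=3, gcb_config=gcb_config)
    dummy = paddle.rand([1, 3, 480, 480])
    feats = model(dummy)
    # Expect three maps: [1, 256, 240, 240], [1, 256, 120, 120],
    # [1, 512, 60, 60].
    for feat in feats:
        print(feat.shape)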