123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284 |
- # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- import math
- import paddle
- from paddle import nn, ParamAttr
- from paddle.nn import functional as F
- import numpy as np
- import functools
- from .tps import GridGenerator
- '''This code is adapted from:
- https://github.com/hikopensource/DAVAR-Lab-OCR/davarocr/davar_rcg/models/transformations/gaspin_transformation.py
- '''
class SP_TransformerNetwork(nn.Layer):
    r"""
    Structure-Preserving Transformation (SPT) as Equa. (2) in Ref. [1]
    Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021.

    The layer holds no learnable weights of its own besides the instance
    normalization; the per-sample mixing weights are supplied by the caller
    of ``forward``.
    """

    def __init__(self, nc=1, default_type=5):
        """ Based on SPIN
        Args:
            nc (int): number of input channels (usually 1 or 3)
            default_type (int): K, the complexity of transformation
                intensities; 2K+1 exponents are generated (default 5)
        """
        super(SP_TransformerNetwork, self).__init__()
        # Fixed list of 2K+1 exponents applied to the image intensities.
        self.power_list = self.cal_K(default_type)
        self.sigmoid = nn.Sigmoid()
        # NOTE: the attribute is called ``bn`` but it is an instance norm.
        self.bn = nn.InstanceNorm2D(nc)

    def cal_K(self, k=5):
        r"""
        Build the list of intensity exponents.

        For i = 1..k a pair (lower, 1/lower) is produced, both rounded to two
        decimals, followed by a final exponent of 1.0 (identity).

        Args:
            k (int): the complexity of transformation intensities; for
                k <= 0 only the identity exponent is returned
        Returns:
            List[float]: the exponents, denoted as \beta, of length 2K+1
        """
        exponents = []
        for i in range(1, k + 1):
            # Kept inside the loop so that nothing is evaluated (and nothing
            # can divide by zero) when the range is empty.
            ratio = 0.5 / (k + 1) * i
            lower = round(math.log(1 - ratio) / math.log(ratio), 2)
            upper = round(1 / lower, 2)
            exponents.append(lower)
            exponents.append(upper)
        exponents.append(1.00)
        return exponents

    def forward(self, batch_I, weights, offsets, lambda_color=None):
        r"""
        Args:
            batch_I (Tensor): batch of input images
                [batch_size x nc x I_height x I_width]
            weights (Tensor): per-sample weight of every exponent in
                ``self.power_list``; two trailing axes are appended here so it
                broadcasts against the stacked power images.
                GA_SPIN_Transformer passes [batch_size x (2K+1) x 1].
            offsets (Tensor or None): the offsets predicted by AIN, blended
                into the image when given; the blend is skipped when None
            lambda_color (Tensor or None): the learnable update gate \alpha
                in Equa. (5) as
                g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets};
                required whenever ``offsets`` is not None
        Returns:
            Tensor: transformed images by SPN as Equa. (4) in Ref. [1],
                rescaled to (-1, 1)
        """
        # Shift/scale the input by (x + 1) / 2; assumes the images were
        # normalized to [-1, 1] so this yields [0, 1] -- TODO confirm upstream.
        batch_I = (batch_I + 1) * 0.5
        if offsets is not None:
            # Gated blend of the image with the predicted offsets.
            batch_I = batch_I * (1 - lambda_color) + offsets * lambda_color
        batch_weight_params = paddle.unsqueeze(paddle.unsqueeze(weights, -1), -1)
        # One powered copy of the image per exponent, stacked on a new axis 1.
        batch_I_power = paddle.stack(
            [batch_I.pow(p) for p in self.power_list], axis=1)
        # Weighted sum over the exponent axis.
        batch_weight_sum = paddle.sum(batch_I_power * batch_weight_params, axis=1)
        batch_weight_sum = self.bn(batch_weight_sum)
        batch_weight_sum = self.sigmoid(batch_weight_sum)
        # Map the sigmoid output from (0, 1) to (-1, 1).
        batch_weight_sum = batch_weight_sum * 2 - 1
        return batch_weight_sum
class GA_SPIN_Transformer(nn.Layer):
    r"""
    Geometric-Absorbed SPIN Transformation (GA-SPIN) proposed in Ref. [1]
    Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021.
    """

    def __init__(self, in_channels=1,
                 I_r_size=(32, 100),
                 offsets=False,
                 norm_type='BN',
                 default_type=6,
                 loc_lr=1,
                 stn=True):
        """
        Args:
            in_channels (int): channel of input features,
                                set it to 1 if the grayscale images and 3 if RGB input
            I_r_size (tuple): size of rectified images (used in STN transformations)
            offsets (bool): set it to False if use SPN w.o. AIN,
                            and set it to True if use SPIN (both with SPN and AIN)
            norm_type (str): the normalization type of the module,
                            set it to 'BN' by default, 'IN' optionally
            default_type (int): the K chromatic space,
                                set it to 3/5/6 depend on the complexity of transformation intensities
            loc_lr (float): learning-rate setting passed to the ParamAttr of
                            the last localization FC layer (``stucture_fc2``)
            stn (bool): whether to use stn.
        Raises:
            NotImplementedError: if ``norm_type`` is neither 'BN' nor 'IN'.
        """
        super(GA_SPIN_Transformer, self).__init__()
        self.nc = in_channels
        self.spt = True
        self.offsets = offsets
        self.stn = stn  # set to True in GA-SPIN, while set it to False in SPIN
        self.I_r_size = I_r_size
        self.out_channels = in_channels
        if norm_type == 'BN':
            norm_layer = functools.partial(nn.BatchNorm2D, use_global_stats=True)
        elif norm_type == 'IN':
            # nn.InstanceNorm2D has no ``use_global_stats`` argument; passing
            # it made this branch fail with TypeError on the first
            # norm_layer(...) call.
            norm_layer = functools.partial(nn.InstanceNorm2D, weight_attr=False)
        else:
            raise NotImplementedError('normalization layer [%s] is not found' % norm_type)

        if self.spt:
            self.sp_net = SP_TransformerNetwork(in_channels,
                                                default_type)
            # Shared convolutional trunk; size comments assume a 32*100 input.
            self.spt_convnet = nn.Sequential(
                # 32*100
                nn.Conv2D(in_channels, 32, 3, 1, 1, bias_attr=False),
                norm_layer(32), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                # 16*50
                nn.Conv2D(32, 64, 3, 1, 1, bias_attr=False),
                norm_layer(64), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                # 8*25
                nn.Conv2D(64, 128, 3, 1, 1, bias_attr=False),
                norm_layer(128), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                # 4*12
            )
            # NOTE: the misspelt attribute names ``stucture_fc1/2`` are kept
            # on purpose so parameter names of existing checkpoints match.
            self.stucture_fc1 = nn.Sequential(
                nn.Conv2D(128, 256, 3, 1, 1, bias_attr=False),
                norm_layer(256), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                nn.Conv2D(256, 256, 3, 1, 1, bias_attr=False),
                norm_layer(256), nn.ReLU(),  # 2*6
                nn.MaxPool2D(kernel_size=2, stride=2),
                nn.Conv2D(256, 512, 3, 1, 1, bias_attr=False),
                norm_layer(512), nn.ReLU(),  # 1*3
                nn.AdaptiveAvgPool2D(1),
                nn.Flatten(1, -1),  # batch_size x 512
                # NOTE(review): the first positional argument of Normal is
                # presumably the mean, not the std -- confirm 0.001 is intended.
                nn.Linear(512, 256, weight_attr=nn.initializer.Normal(0.001)),
                nn.BatchNorm1D(256), nn.ReLU()
            )
            # Layout of the stucture_fc2 output vector:
            #   [2K+1 SPT weights][1 colour gate, if offsets][2*F control-point coords, if stn]
            self.out_weight = 2 * default_type + 1
            self.spt_length = 2 * default_type + 1
            if offsets:
                self.out_weight += 1
            if self.stn:
                self.F = 20
                self.out_weight += self.F * 2
                self.GridGenerator = GridGenerator(self.F * 2, self.F)

            # Init structure_fc2 in LocalizationNetwork: zero weights plus a
            # hand-crafted bias, so the initial prediction equals that bias.
            initial_bias = self.init_spin(default_type * 2)
            initial_bias = initial_bias.reshape(-1)
            param_attr = ParamAttr(
                learning_rate=loc_lr,
                initializer=nn.initializer.Assign(np.zeros([256, self.out_weight])))
            bias_attr = ParamAttr(
                learning_rate=loc_lr,
                initializer=nn.initializer.Assign(initial_bias))
            self.stucture_fc2 = nn.Linear(256, self.out_weight,
                                          weight_attr=param_attr,
                                          bias_attr=bias_attr)
            self.sigmoid = nn.Sigmoid()

            if offsets:
                self.offset_fc1 = nn.Sequential(nn.Conv2D(128, 16,
                                                          3, 1, 1,
                                                          bias_attr=False),
                                                norm_layer(16),
                                                nn.ReLU(),)
                self.offset_fc2 = nn.Conv2D(16, in_channels,
                                            3, 1, 1)
                self.pool = nn.MaxPool2D(2, 2)

    def init_spin(self, nz):
        r"""
        Build the initial bias of ``stucture_fc2``.

        Args:
            nz (int): number of paired \beta exponents, which means the value of K x 2
        Returns:
            np.ndarray: 1-D array made of ``nz`` zeros and one 5.0 for the SPT
                weights, one -5.0 if ``self.offsets`` is set, and, if
                ``self.stn`` is set, the flattened (x, y) coordinates of
                ``self.F`` initial control points.
        """
        init_id = [0.00] * nz + [5.00]
        if self.offsets:
            init_id += [-5.00]
        init = np.array(init_id)

        if self.stn:
            # Half of the control points go to the top row, half to the bottom.
            half = self.F // 2
            ctrl_pts_x = np.linspace(-1.0, 1.0, half)
            ctrl_pts_y_top = np.linspace(0.0, -1.0, num=half)
            ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=half)
            ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
            ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
            initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
            initial_bias = initial_bias.reshape(-1)
            init = np.concatenate([init, initial_bias], axis=0)
        return init

    def forward(self, x, return_weight=False):
        """
        Args:
            x (Tensor): input image batch
            return_weight (bool): set to False by default,
                        if set to True return the predicted offsets of AIN, denoted as x_{offsets}
                        (only honoured when the module was built with ``offsets=True``)
        Returns:
            Tensor: rectified image [batch_size x I_channel_num x H x W].
                Without STN the spatial size equals the input size; with STN
                the sampling grid is built for ``I_r_size``.
                When ``return_weight`` is True and offsets are enabled, the
                low-resolution sigmoid-activated offsets map is returned instead.
        """
        if self.spt:
            feat = self.spt_convnet(x)
            fc1 = self.stucture_fc1(feat)
            sp_weight_fusion = self.stucture_fc2(fc1)
            sp_weight_fusion = sp_weight_fusion.reshape([x.shape[0], self.out_weight, 1])
            # The first 2K+1 entries are always the SPT mixing weights.
            sp_weight = sp_weight_fusion[:, :self.spt_length, :]

            if self.offsets:  # SPIN w. AIN
                # The entry right after the SPT weights is the colour gate;
                # it is expanded to [batch_size x 1 x 1 x 1] for broadcasting.
                lambda_color = sp_weight_fusion[:, self.spt_length, 0]
                lambda_color = self.sigmoid(lambda_color).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
                offsets = self.pool(self.offset_fc2(self.offset_fc1(feat)))
                # Sanity check: a 32*100 input gives a 2*6 map at this point.
                assert offsets.shape[2] == 2
                assert offsets.shape[3] == 6
                offsets = self.sigmoid(offsets)
                if return_weight:
                    return offsets
                offsets = nn.functional.upsample(offsets, size=(x.shape[2], x.shape[3]), mode='bilinear')
                # Control points start after the colour gate.
                ctrl_start = self.spt_length + 1
            else:  # SPIN w.o. AIN
                lambda_color, offsets = None, None
                ctrl_start = self.spt_length

            if self.stn:
                # Remaining entries are F (x, y) control points for the grid.
                batch_C_prime = sp_weight_fusion[:, ctrl_start:, :].reshape([x.shape[0], self.F, 2])
                build_P_prime = self.GridGenerator(batch_C_prime, self.I_r_size)
                build_P_prime_reshape = build_P_prime.reshape([build_P_prime.shape[0],
                                                               self.I_r_size[0],
                                                               self.I_r_size[1],
                                                               2])

            # Chromatic transformation first, geometric rectification second.
            x = self.sp_net(x, sp_weight, offsets, lambda_color)
            if self.stn:
                x = F.grid_sample(x=x, grid=build_P_prime_reshape, padding_mode='border')
        return x
|