# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py
"""
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal

from .rec_att_head import AttentionLSTM

# Shared initializer instances; `ones_` is kept for importers of this module
# even though nothing in this file uses it.
kaiming_init_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)


class CNTHead(nn.Layer):
    """Counting head: maps a flattened visual feature map to per-class scores.

    Args:
        embed_size: Channel count of the incoming feature map; also the
            in/out width of the bias-free fusion projection.
        encode_length: Number of spatial positions (``h * w``) the incoming
            feature map must have, because the final linear layer is sized
            ``encode_length * embed_size``.
        out_channels: Width of the prediction produced by the head.
        **kwargs: Accepted for config compatibility and ignored.
    """

    def __init__(self,
                 embed_size=512,
                 encode_length=26,
                 out_channels=38,
                 **kwargs):
        super().__init__()

        self.out_channels = out_channels

        # Attribute names are part of the checkpoint (state-dict) layout;
        # do not rename them.
        self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False)
        self.Prediction_visual = nn.Linear(encode_length * embed_size,
                                           self.out_channels)

    def forward(self, visual_feature):
        """Predict counting scores from a 4-D visual feature map.

        Args:
            visual_feature: Tensor unpacked as ``(b, c, h, w)``. ``c`` must
                equal ``embed_size`` and ``h * w`` must equal
                ``encode_length`` for the linear layers to accept it.

        Returns:
            Tensor of shape ``[b, out_channels]``.
        """
        batch, channels, height, width = visual_feature.shape

        # [b, c, h, w] -> [b, h * w, c]: one feature vector per position.
        per_position = visual_feature.reshape(
            [batch, channels, height * width]).transpose([0, 2, 1])
        # With the default sizes this is batch * 26 * 512.
        fused = self.Wv_fusion(per_position)

        batch, positions, channels = fused.shape
        # Use the visual feature directly to calculate the text length:
        # flatten every position into one vector per sample.
        flattened = fused.reshape([batch, positions * channels])
        return self.Prediction_visual(flattened)


class RFLHead(nn.Layer):
    """Head combining an optional counting branch and an optional
    attention-based sequence branch.

    Args:
        in_channels: Channel count of the features fed to both branches.
        hidden_size: Hidden size forwarded to ``AttentionLSTM``.
        batch_max_legnth: Maximum text length (parameter name spelling is
            kept for config compatibility). The counting branch is built
            with ``encode_length = batch_max_legnth + 1``.
        out_channels: Number of output classes for both branches.
        use_cnt: Build and run the counting branch (``CNTHead``).
        use_seq: Build and run the sequence branch (``AttentionLSTM``).
        **kwargs: Forwarded unchanged to both branch constructors.

    Raises:
        AssertionError: If both ``use_cnt`` and ``use_seq`` are false.
    """

    def __init__(self,
                 in_channels=512,
                 hidden_size=256,
                 batch_max_legnth=25,
                 out_channels=38,
                 use_cnt=True,
                 use_seq=True,
                 **kwargs):
        super().__init__()
        assert use_cnt or use_seq, (
            "RFLHead requires at least one of use_cnt / use_seq to be enabled")

        self.use_cnt = use_cnt
        self.use_seq = use_seq
        self.batch_max_legnth = batch_max_legnth
        self.num_class = out_channels

        if self.use_cnt:
            self.cnt_head = CNTHead(
                embed_size=in_channels,
                encode_length=batch_max_legnth + 1,
                out_channels=out_channels,
                **kwargs)
        if self.use_seq:
            self.seq_head = AttentionLSTM(
                in_channels=in_channels,
                out_channels=out_channels,
                hidden_size=hidden_size,
                **kwargs)

        # Must run after the branches exist so their layers are visited too.
        self.apply(self.init_weights)

    def init_weights(self, m):
        """Initialise ``nn.Linear`` layers: Kaiming-normal weight, zero bias.

        Layers of any other type are left untouched.
        """
        if not isinstance(m, nn.Linear):
            return
        kaiming_init_(m.weight)
        # Linear layers built with ``bias_attr=False`` have no bias.
        if m.bias is not None:
            zeros_(m.bias)

    def forward(self, x, targets=None):
        """Run the enabled branches.

        Args:
            x: Pair ``(cnt_inputs, seq_inputs)``; the first element feeds the
                counting branch, the second the sequence branch.
            targets: Indexable whose first element is passed to the sequence
                branch in training mode. It must be provided whenever
                ``use_seq`` is true and the layer is in training mode; it is
                not read otherwise.

        Returns:
            ``(cnt_outputs, seq_outputs)`` when ``use_seq`` is true, where
            ``cnt_outputs`` is ``None`` if the counting branch is disabled;
            otherwise just ``cnt_outputs``.
        """
        cnt_inputs, seq_inputs = x

        cnt_outputs = self.cnt_head(cnt_inputs) if self.use_cnt else None
        if not self.use_seq:
            return cnt_outputs

        # Targets are only handed to the sequence branch while training;
        # in eval mode it receives None instead.
        seq_targets = targets[0] if self.training else None
        seq_outputs = self.seq_head(seq_inputs, seq_targets,
                                    self.batch_max_legnth)
        return cnt_outputs, seq_outputs