# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random

import cv2
import numpy as np
import paddle
import paddle.vision.transforms as transforms
import pyclipper
import Polygon as plg
import scipy.io as scio
from PIL import Image
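
# The transforms below appear to implement the training-time label generation
# for the CentripetalText (CT) detector in PaddleOCR. Each class follows the
# same convention: it receives a `data` dict (with keys such as 'image',
# 'polys' and 'texts'), modifies or extends it, and returns it, so instances
# can be chained one after another in a preprocessing list.
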

class RandomScale():
    """Randomly rescale the image so that its shorter side is roughly
    `short_size` times a factor drawn from 0.7-1.3, then round both sides
    up to the next multiple of 32."""

    def __init__(self, short_size=640, **kwargs):
        self.short_size = short_size

    def scale_aligned(self, img, scale):
        oh, ow = img.shape[0:2]
        h = int(oh * scale + 0.5)
        w = int(ow * scale + 0.5)
        # round the target size up to the next multiple of 32
        if h % 32 != 0:
            h = h + (32 - h % 32)
        if w % 32 != 0:
            w = w + (32 - w % 32)
        img = cv2.resize(img, dsize=(w, h))
        factor_h = h / oh
        factor_w = w / ow
        return img, factor_h, factor_w

    def __call__(self, data):
        img = data['image']
        h, w = img.shape[0:2]

        random_scale = np.array([0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3])
        scale = (np.random.choice(random_scale) * self.short_size) / min(h, w)
        img, factor_h, factor_w = self.scale_aligned(img, scale)

        data['scale_factor'] = (factor_w, factor_h)
        data['image'] = img
        return data
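
# Worked example (illustrative numbers, not from the source): for a 500x375
# image with short_size=640 and a drawn factor of 1.0, scale = 640 / 375
# ≈ 1.707. scale_aligned then gives h = 853 -> rounded up to 864, and
# w = 640 (already a multiple of 32), so scale_factor = (640/375, 864/500)
# ≈ (1.707, 1.728).
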

class MakeShrink():
    def __init__(self, kernel_scale=0.7, **kwargs):
        self.kernel_scale = kernel_scale

    def dist(self, a, b):
        # Euclidean distance between two points
        return np.linalg.norm((a - b), ord=2, axis=0)

    def perimeter(self, bbox):
        peri = 0.0
        for i in range(bbox.shape[0]):
            peri += self.dist(bbox[i], bbox[(i + 1) % bbox.shape[0]])
        return peri

    def shrink(self, bboxes, rate, max_shr=20):
        rate = rate * rate
        shrinked_bboxes = []
        for bbox in bboxes:
            area = plg.Polygon(bbox).area()
            peri = self.perimeter(bbox)

            try:
                pco = pyclipper.PyclipperOffset()
                pco.AddPath(bbox, pyclipper.JT_ROUND,
                            pyclipper.ET_CLOSEDPOLYGON)
                # offset that removes roughly (1 - rate) of the polygon area,
                # capped at max_shr pixels
                offset = min(
                    int(area * (1 - rate) / (peri + 0.001) + 0.5), max_shr)

                shrinked_bbox = pco.Execute(-offset)
                if len(shrinked_bbox) == 0:
                    shrinked_bboxes.append(bbox)
                    continue

                shrinked_bbox = np.array(shrinked_bbox[0])
                if shrinked_bbox.shape[0] <= 2:
                    shrinked_bboxes.append(bbox)
                    continue

                shrinked_bboxes.append(shrinked_bbox)
            except Exception:
                shrinked_bboxes.append(bbox)

        return shrinked_bboxes

    def __call__(self, data):
        img = data['image']
        bboxes = data['polys']
        words = data['texts']
        scale_factor = data['scale_factor']

        gt_instance = np.zeros(img.shape[0:2], dtype='uint8')  # h, w
        training_mask = np.ones(img.shape[0:2], dtype='uint8')
        training_mask_distance = np.ones(img.shape[0:2], dtype='uint8')

        # rescale each flat [x1, y1, x2, y2, ...] polygon with the factors
        # recorded by RandomScale and reshape it to (num_points, 2)
        for i in range(len(bboxes)):
            bboxes[i] = np.reshape(bboxes[i] * (
                [scale_factor[0], scale_factor[1]] * (bboxes[i].shape[0] // 2)),
                                   (bboxes[i].shape[0] // 2, 2)).astype('int32')

        for i in range(len(bboxes)):
            # different value for different bbox
            cv2.drawContours(gt_instance, [bboxes[i]], -1, i + 1, -1)

            # set training mask to 0
            cv2.drawContours(training_mask, [bboxes[i]], -1, 0, -1)

            # for inaccurate annotations, use training_mask_distance
            if words[i] == '###' or words[i] == '???':
                cv2.drawContours(training_mask_distance, [bboxes[i]], -1, 0, -1)

        # make shrink
        gt_kernel_instance = np.zeros(img.shape[0:2], dtype='uint8')
        kernel_bboxes = self.shrink(bboxes, self.kernel_scale)
        for i in range(len(bboxes)):
            cv2.drawContours(gt_kernel_instance, [kernel_bboxes[i]], -1, i + 1,
                             -1)

            # for training mask, kernel and background = 1, box region = 0
            if words[i] != '###' and words[i] != '???':
                cv2.drawContours(training_mask, [kernel_bboxes[i]], -1, 1, -1)

        gt_kernel = gt_kernel_instance.copy()
        # for gt_kernel, kernel = 1
        gt_kernel[gt_kernel > 0] = 1

        # erode twice; the thin ring between the two erosions serves as the
        # kernel-contour reference used by MakeCentripetalShift
        tmp1 = gt_kernel_instance.copy()
        erode_kernel = np.ones((3, 3), np.uint8)
        tmp1 = cv2.erode(tmp1, erode_kernel, iterations=1)
        tmp2 = tmp1.copy()
        tmp2 = cv2.erode(tmp2, erode_kernel, iterations=1)
        gt_kernel_inner = tmp1 - tmp2

        # gt_instance: text instance, bg=0, each word uses a distinct value
        # training_mask: text instance mask, word=0, kernel and bg=1
        # gt_kernel_instance: text kernel instance, bg=0, each word uses a distinct value
        # gt_kernel: text kernel, bg=0, every word uses the same value (1)
        # gt_kernel_inner: text kernel reference (thin contour ring)
        # training_mask_distance: word without annotation = 0, else 1
        data['image'] = [
            img, gt_instance, training_mask, gt_kernel_instance, gt_kernel,
            gt_kernel_inner, training_mask_distance
        ]
        return data
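
# A rough sanity check of the shrink offset (illustrative, not from the
# source): for an axis-aligned 100x100 square with kernel_scale = 0.7,
# area = 10000 and perimeter = 400, so
#   offset = min(int(10000 * (1 - 0.49) / 400.001 + 0.5), 20) = 13
# and pyclipper moves every edge inward by 13 px, leaving a ~74x74 kernel
# whose area ratio (~0.55) is close to kernel_scale**2 = 0.49.
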

class GroupRandomHorizontalFlip():
    def __init__(self, p=0.5, **kwargs):
        self.p = p

    def __call__(self, data):
        imgs = data['image']
        # flip the image and every label map together so they stay aligned
        if random.random() < self.p:
            for i in range(len(imgs)):
                imgs[i] = np.flip(imgs[i], axis=1).copy()
        data['image'] = imgs
        return data

class GroupRandomRotate():
    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        imgs = data['image']

        # rotate all maps by the same random angle in [-10, 10] degrees
        max_angle = 10
        angle = random.random() * 2 * max_angle - max_angle
        for i in range(len(imgs)):
            img = imgs[i]
            # note: img.shape[:2] is (rows, cols), so these names read swapped,
            # but the (h / 2, w / 2) center and (h, w) dsize below are consistent
            w, h = img.shape[:2]
            rotation_matrix = cv2.getRotationMatrix2D((h / 2, w / 2), angle, 1)
            img_rotation = cv2.warpAffine(
                img, rotation_matrix, (h, w), flags=cv2.INTER_NEAREST)
            imgs[i] = img_rotation
        data['image'] = imgs
        return data

class GroupRandomCropPadding():
    def __init__(self, target_size=(640, 640), **kwargs):
        self.target_size = target_size

    def __call__(self, data):
        imgs = data['image']

        h, w = imgs[0].shape[0:2]
        t_w, t_h = self.target_size
        p_w, p_h = self.target_size
        if w == t_w and h == t_h:
            return data

        t_h = t_h if t_h < h else h
        t_w = t_w if t_w < w else w

        if random.random() > 3.0 / 8.0 and np.max(imgs[1]) > 0:
            # make sure to crop the text region
            tl = np.min(np.where(imgs[1] > 0), axis=1) - (t_h, t_w)
            tl[tl < 0] = 0
            br = np.max(np.where(imgs[1] > 0), axis=1) - (t_h, t_w)
            br[br < 0] = 0
            br[0] = min(br[0], h - t_h)
            br[1] = min(br[1], w - t_w)

            i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0
            j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0
        else:
            i = random.randint(0, h - t_h) if h - t_h > 0 else 0
            j = random.randint(0, w - t_w) if w - t_w > 0 else 0

        n_imgs = []
        for idx in range(len(imgs)):
            if len(imgs[idx].shape) == 3:
                s3_length = int(imgs[idx].shape[-1])
                img = imgs[idx][i:i + t_h, j:j + t_w, :]
                img_p = cv2.copyMakeBorder(
                    img,
                    0,
                    p_h - t_h,
                    0,
                    p_w - t_w,
                    borderType=cv2.BORDER_CONSTANT,
                    value=tuple(0 for i in range(s3_length)))
            else:
                img = imgs[idx][i:i + t_h, j:j + t_w]
                img_p = cv2.copyMakeBorder(
                    img,
                    0,
                    p_h - t_h,
                    0,
                    p_w - t_w,
                    borderType=cv2.BORDER_CONSTANT,
                    value=(0, ))
            n_imgs.append(img_p)
        data['image'] = n_imgs
        return data
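
# Note: the same (i, j) window is cut from the image and from every label map
# produced by MakeShrink, and the right/bottom zero padding restores the fixed
# target_size, so all seven arrays in data['image'] stay pixel-aligned. Since
# the text-aware branch is taken when random.random() > 3/8, roughly 5 out of
# 8 crops are forced to overlap the annotated text region (when one exists).
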

class MakeCentripetalShift():
    def __init__(self, **kwargs):
        pass

    def jaccard(self, As, Bs):
        # despite the name, this is a nearest-neighbour assignment: for every
        # point in As it returns the index of the closest point in Bs
        A = As.shape[0]  # small
        B = Bs.shape[0]  # large

        dis = np.sqrt(
            np.sum((As[:, np.newaxis, :].repeat(
                B, axis=1) - Bs[np.newaxis, :, :].repeat(
                    A, axis=0))**2,
                   axis=-1))

        ind = np.argmin(dis, axis=-1)

        return ind

    def __call__(self, data):
        imgs = data['image']
        img, gt_instance, training_mask, gt_kernel_instance, gt_kernel, gt_kernel_inner, training_mask_distance = \
            imgs[0], imgs[1], imgs[2], imgs[3], imgs[4], imgs[5], imgs[6]

        max_instance = np.max(gt_instance)  # num bbox

        # make centripetal shift
        gt_distance = np.zeros((2, *img.shape[0:2]), dtype=np.float32)
        for i in range(1, max_instance + 1):
            # kernel reference
            ind = (gt_kernel_inner == i)

            if np.sum(ind) == 0:
                training_mask[gt_instance == i] = 0
                training_mask_distance[gt_instance == i] = 0
                continue

            kpoints = np.array(np.where(ind)).transpose(
                (1, 0))[:, ::-1].astype('float32')

            # pixels that belong to the text instance but not to its kernel
            ind = (gt_instance == i) * (gt_kernel_instance == 0)
            if np.sum(ind) == 0:
                continue
            pixels = np.where(ind)
            points = np.array(pixels).transpose(
                (1, 0))[:, ::-1].astype('float32')

            # for each text pixel, store the offset to its nearest
            # kernel-reference pixel, scaled by 0.1
            bbox_ind = self.jaccard(points, kpoints)
            offset_gt = kpoints[bbox_ind] - points

            gt_distance[:, pixels[0], pixels[1]] = offset_gt.T * 0.1

        img = Image.fromarray(img)
        img = img.convert('RGB')

        data["image"] = img
        data["gt_kernel"] = gt_kernel.astype("int64")
        data["training_mask"] = training_mask.astype("int64")
        data["gt_instance"] = gt_instance.astype("int64")
        data["gt_kernel_instance"] = gt_kernel_instance.astype("int64")
        data["training_mask_distance"] = training_mask_distance.astype("int64")
        data["gt_distance"] = gt_distance.astype("float32")

        return data
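
# Worked example (illustrative coordinates): a text pixel at (x=50, y=40)
# whose nearest kernel-reference pixel is (x=47, y=38) gets
# offset_gt = (-3, -2), so gt_distance[:, 40, 50] = (-0.3, -0.2) after the
# 0.1 scaling. Pixels of instances whose kernel reference vanished (e.g. the
# kernel was eroded or cropped away) are instead masked out via training_mask.
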

class ScaleAlignedShort():
    def __init__(self, short_size=640, **kwargs):
        self.short_size = short_size

    def __call__(self, data):
        img = data['image']
        org_img_shape = img.shape

        h, w = img.shape[0:2]
        scale = self.short_size * 1.0 / min(h, w)
        h = int(h * scale + 0.5)
        w = int(w * scale + 0.5)
        if h % 32 != 0:
            h = h + (32 - h % 32)
        if w % 32 != 0:
            w = w + (32 - w % 32)
        img = cv2.resize(img, dsize=(w, h))

        new_img_shape = img.shape
        # record the original and resized shapes (used later, e.g. to map
        # predictions back to the input resolution)
        img_shape = np.array(org_img_shape + new_img_shape)

        data['shape'] = img_shape
        data['image'] = img
        return data
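
# Minimal smoke test, assuming the module's own dependencies (cv2, numpy,
# pyclipper, Polygon3, PIL) are installed. The sample below is made up purely
# for illustration: a random image plus one rectangular word. It runs the
# training transforms in the usual order and prints the generated label shapes.
if __name__ == "__main__":
    dummy = {
        'image': np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8),
        'polys': [
            np.array(
                [100, 100, 300, 100, 300, 200, 100, 200], dtype=np.float32)
        ],
        'texts': ['sample'],
    }
    pipeline = [
        RandomScale(short_size=640),
        MakeShrink(kernel_scale=0.7),
        GroupRandomHorizontalFlip(p=0.5),
        GroupRandomRotate(),
        GroupRandomCropPadding(target_size=(640, 640)),
        MakeCentripetalShift(),
    ]
    for op in pipeline:
        dummy = op(dummy)
    print('image size (w, h):', dummy['image'].size)  # PIL image
    print('gt_kernel:', dummy['gt_kernel'].shape)
    print('gt_distance:', dummy['gt_distance'].shape)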