convert_label2html.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
convert table label to html
"""
  17. import json
  18. import argparse
  19. from tqdm import tqdm
  20. def save_pred_txt(key, val, tmp_file_path):
  21. with open(tmp_file_path, 'a+', encoding='utf-8') as f:
  22. f.write('{}\t{}\n'.format(key, val))
  23. def skip_char(text, sp_char_list):
  24. """
  25. skip empty cell
  26. @param text: text in cell
  27. @param sp_char_list: style char and special code
  28. @return:
  29. """
  30. for sp_char in sp_char_list:
  31. text = text.replace(sp_char, '')
  32. return text
  33. def gen_html(img):
  34. '''
  35. Formats HTML code from tokenized annotation of img
  36. '''
  37. html_code = img['html']['structure']['tokens'].copy()
  38. to_insert = [i for i, tag in enumerate(html_code) if tag in ('<td>', '>')]
  39. for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]):
  40. if cell['tokens']:
  41. text = ''.join(cell['tokens'])
  42. # skip empty text
  43. sp_char_list = ['<b>', '</b>', '\u2028', ' ', '<i>', '</i>']
  44. text_remove_style = skip_char(text, sp_char_list)
  45. if len(text_remove_style) == 0:
  46. continue
  47. html_code.insert(i + 1, text)
  48. html_code = ''.join(html_code)
  49. html_code = '<html><body><table>{}</table></body></html>'.format(html_code)
  50. return html_code
  51. def load_gt_data(gt_path):
  52. """
  53. load gt
  54. @param gt_path:
  55. @return:
  56. """
  57. data_list = {}
  58. with open(gt_path, 'rb') as f:
  59. lines = f.readlines()
  60. for line in tqdm(lines):
  61. data_line = line.decode('utf-8').strip("\n")
  62. info = json.loads(data_line)
  63. data_list[info['filename']] = info
  64. return data_list
  65. def convert(origin_gt_path, save_path):
  66. """
  67. gen html from label file
  68. @param origin_gt_path:
  69. @param save_path:
  70. @return:
  71. """
  72. data_dict = load_gt_data(origin_gt_path)
  73. for img_name, gt in tqdm(data_dict.items()):
  74. html = gen_html(gt)
  75. save_pred_txt(img_name, html, save_path)
  76. print('conver finish')
  77. def parse_args():
  78. parser = argparse.ArgumentParser(description="args for paddleserving")
  79. parser.add_argument(
  80. "--ori_gt_path", type=str, required=True, help="label gt path")
  81. parser.add_argument(
  82. "--save_path", type=str, required=True, help="path to save file")
  83. args = parser.parse_args()
  84. return args
  85. if __name__ == '__main__':
  86. args = parse_args()
  87. convert(args.ori_gt_path, args.save_path)