9.9 KB

  1. # This is where we handle translating css styles into openpyxl styles
  2. # and cascading those from parent to child in the dom.
  3. from openpyxl.cell import cell
  4. from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color
  5. from openpyxl.styles.fills import FILL_SOLID
  6. from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
  7. from openpyxl.styles.colors import BLACK
# Number-format pattern for dates written as month/day/four-digit year.
FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy'
  9. def colormap(color):
  10. """
  11. Convenience for looking up known colors
  12. """
  13. cmap = {'black': BLACK}
  14. return cmap.get(color, color)
  15. def style_string_to_dict(style):
  16. """
  17. Convert css style string to a python dictionary
  18. """
  19. def clean_split(string, delim):
  20. return (s.strip() for s in string.split(delim))
  21. styles = [clean_split(s, ":") for s in style.split(";") if ":" in s]
  22. return dict(styles)
  23. def get_side(style, name):
  24. return {'border_style': style.get('border-{}-style'.format(name)),
  25. 'color': colormap(style.get('border-{}-color'.format(name)))}
  26. known_styles = {}
  27. def style_dict_to_named_style(style_dict, number_format=None):
  28. """
  29. Change css style (stored in a python dictionary) to openpyxl NamedStyle
  30. """
  31. style_and_format_string = str({
  32. 'style_dict': style_dict,
  33. 'parent': style_dict.parent,
  34. 'number_format': number_format,
  35. })
  36. if style_and_format_string not in known_styles:
  37. # Font
  38. font = Font(bold=style_dict.get('font-weight') == 'bold',
  39. color=style_dict.get_color('color', None),
  40. size=style_dict.get('font-size'))
  41. # Alignment
  42. alignment = Alignment(horizontal=style_dict.get('text-align', 'general'),
  43. vertical=style_dict.get('vertical-align'),
  44. wrap_text=style_dict.get('white-space', 'nowrap') == 'normal')
  45. # Fill
  46. bg_color = style_dict.get_color('background-color')
  47. fg_color = style_dict.get_color('foreground-color', Color())
  48. fill_type = style_dict.get('fill-type')
  49. if bg_color and bg_color != 'transparent':
  50. fill = PatternFill(fill_type=fill_type or FILL_SOLID,
  51. start_color=bg_color,
  52. end_color=fg_color)
  53. else:
  54. fill = PatternFill()
  55. # Border
  56. border = Border(left=Side(**get_side(style_dict, 'left')),
  57. right=Side(**get_side(style_dict, 'right')),
  58. top=Side(**get_side(style_dict, 'top')),
  59. bottom=Side(**get_side(style_dict, 'bottom')),
  60. diagonal=Side(**get_side(style_dict, 'diagonal')),
  61. diagonal_direction=None,
  62. outline=Side(**get_side(style_dict, 'outline')),
  63. vertical=None,
  64. horizontal=None)
  65. name = 'Style {}'.format(len(known_styles) + 1)
  66. pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border,
  67. number_format=number_format)
  68. known_styles[style_and_format_string] = pyxl_style
  69. return known_styles[style_and_format_string]
  70. class StyleDict(dict):
  71. """
  72. It's like a dictionary, but it looks for items in the parent dictionary
  73. """
  74. def __init__(self, *args, **kwargs):
  75. self.parent = kwargs.pop('parent', None)
  76. super(StyleDict, self).__init__(*args, **kwargs)
  77. def __getitem__(self, item):
  78. if item in self:
  79. return super(StyleDict, self).__getitem__(item)
  80. elif self.parent:
  81. return self.parent[item]
  82. else:
  83. raise KeyError('{} not found'.format(item))
  84. def __hash__(self):
  85. return hash(tuple([(k, self.get(k)) for k in self._keys()]))
  86. # Yielding the keys avoids creating unnecessary data structures
  87. # and happily works with both python2 and python3 where the
  88. # .keys() method is a dictionary_view in python3 and a list in python2.
  89. def _keys(self):
  90. yielded = set()
  91. for k in self.keys():
  92. yielded.add(k)
  93. yield k
  94. if self.parent:
  95. for k in self.parent._keys():
  96. if k not in yielded:
  97. yielded.add(k)
  98. yield k
  99. def get(self, k, d=None):
  100. try:
  101. return self[k]
  102. except KeyError:
  103. return d
  104. def get_color(self, k, d=None):
  105. """
  106. Strip leading # off colors if necessary
  107. """
  108. color = self.get(k, d)
  109. if hasattr(color, 'startswith') and color.startswith('#'):
  110. color = color[1:]
  111. if len(color) == 3: # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that
  112. color = ''.join(2 * c for c in color)
  113. return color
  114. class Element(object):
  115. """
  116. Our base class for representing an html element along with a cascading style.
  117. The element is created along with a parent so that the StyleDict that we store
  118. can point to the parent's StyleDict.
  119. """
  120. def __init__(self, element, parent=None):
  121. self.element = element
  122. self.number_format = None
  123. parent_style = parent.style_dict if parent else None
  124. self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style)
  125. self._style_cache = None
  126. def style(self):
  127. """
  128. Turn the css styles for this element into an openpyxl NamedStyle.
  129. """
  130. if not self._style_cache:
  131. self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format)
  132. return self._style_cache
  133. def get_dimension(self, dimension_key):
  134. """
  135. Extracts the dimension from the style dict of the Element and returns it as a float.
  136. """
  137. dimension = self.style_dict.get(dimension_key)
  138. if dimension:
  139. if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']:
  140. dimension = dimension[:-2]
  141. dimension = float(dimension)
  142. return dimension
  143. class Table(Element):
  144. """
  145. The concrete implementations of Elements are semantically named for the types of elements we are interested in.
  146. This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to
  147. allowing Element to have an arbitrary number of children and dealing with an abstract element tree.
  148. """
  149. def __init__(self, table):
  150. """
  151. takes an html table object (from lxml)
  152. """
  153. super(Table, self).__init__(table)
  154. table_head = table.find('thead')
  155. self.head = TableHead(table_head, parent=self) if table_head is not None else None
  156. table_body = table.find('tbody')
  157. self.body = TableBody(table_body if table_body is not None else table, parent=self)
  158. class TableHead(Element):
  159. """
  160. This class maps to the `<th>` element of the html table.
  161. """
  162. def __init__(self, head, parent=None):
  163. super(TableHead, self).__init__(head, parent=parent)
  164. self.rows = [TableRow(tr, parent=self) for tr in head.findall('tr')]
  165. class TableBody(Element):
  166. """
  167. This class maps to the `<tbody>` element of the html table.
  168. """
  169. def __init__(self, body, parent=None):
  170. super(TableBody, self).__init__(body, parent=parent)
  171. self.rows = [TableRow(tr, parent=self) for tr in body.findall('tr')]
  172. class TableRow(Element):
  173. """
  174. This class maps to the `<tr>` element of the html table.
  175. """
  176. def __init__(self, tr, parent=None):
  177. super(TableRow, self).__init__(tr, parent=parent)
  178. self.cells = [TableCell(cell, parent=self) for cell in tr.findall('th') + tr.findall('td')]
  179. def element_to_string(el):
  180. return _element_to_string(el).strip()
  181. def _element_to_string(el):
  182. string = ''
  183. for x in el.iterchildren():
  184. string += '\n' + _element_to_string(x)
  185. text = el.text.strip() if el.text else ''
  186. tail = el.tail.strip() if el.tail else ''
  187. return text + string + '\n' + tail
  188. class TableCell(Element):
  189. """
  190. This class maps to the `<td>` element of the html table.
  191. """
  194. def __init__(self, cell, parent=None):
  195. super(TableCell, self).__init__(cell, parent=parent)
  196. self.value = element_to_string(cell)
  197. self.number_format = self.get_number_format()
  198. def data_type(self):
  199. cell_types = self.CELL_TYPES & set(self.element.get('class', '').split())
  200. if cell_types:
  201. if 'TYPE_FORMULA' in cell_types:
  202. # Make sure TYPE_FORMULA takes precedence over the other classes in the set.
  203. cell_type = 'TYPE_FORMULA'
  204. elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}:
  205. cell_type = 'TYPE_NUMERIC'
  206. else:
  207. cell_type = cell_types.pop()
  208. else:
  209. cell_type = 'TYPE_STRING'
  210. return getattr(cell, cell_type)
  211. def get_number_format(self):
  212. if 'TYPE_CURRENCY' in self.element.get('class', '').split():
  214. if 'TYPE_INTEGER' in self.element.get('class', '').split():
  215. return '#,##0'
  216. if 'TYPE_PERCENTAGE' in self.element.get('class', '').split():
  218. if 'TYPE_DATE' in self.element.get('class', '').split():
  220. if self.data_type() == cell.TYPE_NUMERIC:
  221. try:
  222. int(self.value)
  223. except ValueError:
  224. return '#,##0.##'
  225. else:
  226. return '#,##0'
  227. def format(self, cell):
  228. =
  229. data_type = self.data_type()
  230. if data_type:
  231. cell.data_type = data_type