Source code for xls2xlsx.htmlxls2xlsx

from bs4 import BeautifulSoup, UnicodeDammit, NavigableString, Comment, CData, ProcessingInstruction, Declaration, Doctype    # pip install beautifulsoup4
import quopri
from bs4 import GuessedAtParserWarning
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Border, Alignment, Font, Side, Color
from openpyxl.comments import Comment as OpenpyxlComment
from openpyxl.utils import get_column_letter, column_index_from_string
from openpyxl.drawing.image import Image
from openpyxl.styles import numbers
from openpyxl.styles.numbers import BUILTIN_FORMATS, BUILTIN_FORMATS_REVERSE
import requests
import copy
import io
import os
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import cssutils
import webcolors
import traceback
import logging
import email            # For mht files
import shutil
from functools import lru_cache
from dateutil.parser import parse as date_parse
from datetime import datetime, date, timedelta
from datetime import time as tm
from urllib.parse import urljoin
from time import sleep
from PIL import ImageFont
import math
from fontTools import ttLib          # pip install fonttools
import yaml
import sys
import calendar
import currency_symbols._constants as currency_symbols_constants

TRACE=False

cssutils.log.setLevel(logging.CRITICAL) # Remove 'Unknown Property name' messages
BUILTIN_FORMATS[14] = 'm/d/yyyy'    # See https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1534
del BUILTIN_FORMATS_REVERSE['mm-dd-yy']
BUILTIN_FORMATS_REVERSE['m/d/yyyy'] = 14
REQUESTS_HEADER = {'User-Agent': 'xls2xlsx/0.2.0 (https://pypi.org/project/xls2xlsx/; snoopyjc@gmail.com) htmlxls2xlsx.py/0.2.0'}

[docs]class FontUtils: LINE_HEIGHT_FACTOR = 1.25 # line_height in px = Font size in px * LINE_HEIGHT_FACTOR def __init__(self): self.skinny_chars = "1!|iIl.,;:' " self.upper_chars = "ABCDEFGHJKLMNOPQRSTUVXYZmw()[]$&*-+{}<>/?" self.fat_chars = 'MW@#%_' fontnames_file = os.path.join(os.path.dirname(__file__), 'fontnames.yaml') self.name_to_path = {} if os.path.isfile(fontnames_file): with open(fontnames_file, 'r') as fn: self.name_to_path = yaml.safe_load(fn) else: dirs = ['fonts'] # Code 'borrowed' from PIL: if sys.platform == "win32": windir = os.environ.get("WINDIR") if windir: dirs.append(os.path.join(windir, "fonts")) elif sys.platform in ("linux", "linux2"): lindirs = os.environ.get("XDG_DATA_DIRS", "") if not lindirs: lindirs = "/usr/share" dirs += [os.path.join(lindir, "fonts") for lindir in lindirs.split(":")] elif sys.platform == "darwin": dirs += ["/Library/Fonts", "/System/Library/Fonts", os.path.expanduser("~/Library/Fonts"), ] FONT_SPECIFIER_NAME_ID = 4 for d in dirs: if os.path.isdir(d): files = os.listdir(d) for f in files: path = os.path.join(d, f) try: tt = ttLib.TTFont(path) name = None for r in tt['name'].names: if r.nameID == FONT_SPECIFIER_NAME_ID: if b'\000' in r.string: try: name = str(r.string, 'utf-16-be') break except Exception: pass else: name = str(r.string, 'latin-1') break if name: nl = name.lower() if nl != name: self.name_to_path[nl] = '!' + name # '!' means alias self.name_to_path[name] = path else: print(f'No name for {path}') except Exception: pass with open(fontnames_file, 'w') as fn: yaml.dump(self.name_to_path, fn)
[docs] def get_real_font_name(self, name, bold=False, italic=False): """Given a font name which may not be in the proper case, return the real font name, or None if not found""" fn = name.lower() if bold: fn += ' bold' if italic: fn += ' italic' result = self.name_to_path.get(fn) if result is not None: if result[0] == '!': # Alias return result[1:] # Real name else: return fn # We found it in lower case
[docs] def get_font_path(self, name, bold=False, italic=False): """Given a font name (which may not be in the proper case), return the path to the font file or None if not found""" real_name = self.get_real_font_name(name, bold, italic) if real_name is None: return None return self.name_to_path.get(real_name)
[docs] @lru_cache(maxsize=32) def get_font(self, name, size=10, bold=False, italic=False): # Pillow creates images that are 72 pixels per inch by default, and a point is 1/72 of an inch, so # we must convert the size to pixels in order to get the proper size. size = self.pt_to_px(size) size = math.ceil(size) font_path = self.get_font_path(name, bold, italic) if not font_path: return None return ImageFont.truetype(font_path, size)
[docs] @staticmethod def str_to_filename(s, ext): # pragma nocover """Convert this string to a valid filename (used for debugging only)""" result = re.sub(r'[.<>:"/\\|?*\[\]\s]', '_', s) + ext if len(result) > 100: result = result[:100] + ext return result
[docs] def get_font_size(self, font, s): """Get the width and height of text 's' in the given font. 's' is a single line of text (without newlines)""" height = self.pt_to_px(font.sz) * self.LINE_HEIGHT_FACTOR if not font or not s: return (0, height) tt_font = self.get_font(font.name, font.sz, font.b, font.i) if tt_font: #return font.getsize_multiline(s, spacing=font.size/3) width, _ = tt_font.getsize(s) #width = self.pt_to_px(width) if TRACE: os.makedirs('trace', exist_ok=True) from PIL import ImageDraw from PIL import Image as PILImage img = PILImage.new('RGB', (math.ceil(width), math.ceil(height)), color='white') draw = ImageDraw.Draw(img) draw.text((0, 0), s, fill='black', font=tt_font) img.save(os.path.join('trace', self.str_to_filename(f'{font.name}_{font.sz}_{s}', '.png'))) else: # Estimate the font size if we can't find the true answer width = 0 for c in s: if c in self.skinny_chars: width += 0.6 elif c in self.fat_chars: if font.b and font.name == 'Calibri': width += 2.1 else: width += 1.9 elif c in self.upper_chars: width += 1.4 else: width += 1 if font.b and font.name != 'Calibri': width *= 1.1 # 10% wider for non-Calibri fonts in bold (like Arial) width += 1 # Give it some margin width *= (font.sz/11) width *= 7 # Convert chars to px return (width, height)
[docs] def lines_needed(self, img_width, s, font): """How many lines are needed to render this text 's' using img_width pixels?""" if '\n' in s: lines = s.split('\n') result = 0 for line in lines: result += self.lines_needed(img_width, line, font) if TRACE: print(f'lines_needed({img_width}, {s}, {font.sz}) = {result} (1)') return result number_of_lines = 0 # count how many lines are needed to break the string into multi-lines that fit img_width line = '' for token in s.split(): if line: line += ' ' + token else: line = token w = self.get_font_size(font, line)[0] if w > img_width: number_of_lines += int(w // img_width) line = token elif w == img_width: number_of_lines += 1 line = '' if line: number_of_lines += 1 if number_of_lines == 0: # E.g. if you send the empty string number_of_lines = 1 if TRACE: print(f'lines_needed({img_width}, {s}, {font.sz}) = {number_of_lines} (2)') return number_of_lines
[docs] @staticmethod def pt_to_px(pt): if pt is None: return pt return pt / 0.75
[docs] @staticmethod def px_to_pt(px): if px is None: return px return px * 0.75
[docs]class CSSStyle: RETRIES=6 DEFAULT_POINT_SIZE = 10 SPREADSHEET_WIDTH_PX = 1900 # e.g. to determine what width="5%" means SPREADSHEET_HEIGHT_PX = 800 MAX_CELL_HEIGHT_PT = 409 # Excel limit on cell height MAX_CELL_WIDTH_UNITS = 255 # Excel limit on cell width MIN_CELL_HEIGHT_PT = FontUtils.px_to_pt(15) MIN_CELL_WIDTH_PX = 15 DEFAULT_CELL_WIDTH_PX = 64 def __init__(self): self.stylemap = {} self.pt_list = ('7.5pt', '10pt', '12pt', '13.5pt', '18pt', '24pt', '36pt') self.inherited_properties = { 'border-collapse', 'border-spacing', 'caption-side', 'color', 'cursor', 'direction', 'empty-cells', 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', 'font-size-adjust', 'font-stretch', 'font', 'letter-spacing', 'line-height', 'list-style-image', 'list-style-position', 'list-style-type', 'list-style', 'orphans', 'quotes', 'tab-size', 'text-align', 'text-align-last', 'text-decoration-color', 'text-indent', 'text-justify', 'text-shadow', 'text-transform', 'visibility', 'white-space', 'widows', 'word-break', 'word-spacing', 'word-wrap' } # We use Microsoft's special mso-style-parent to inherit the style of the <td> on the next element, which # we send thru depending on what tag it is. This is used to style the entire cell, since we don't support # a per-element style (openpyxl doesn't support rich text) self.default_styles = """ .htmlxls2xlsx {background: inherit; background-color: inherit; border: inherit; border-color: inherit; border-width: inherit; border-bottom-color: inherit; border-left-color: inherit; border-right-color: inherit; border-top-color: inherit; border-top: inherit; border-right: inherit; border-bottom: inherit; border-left: inherit; border-top-width: inherit; border-right-width: inherit; border-bottom-width: inherit; border-left-width: inherit; border-top-style: inherit; border-right-style: inherit; border-bottom-style: inherit; border-left-style: inherit; border-top-color: inherit; border-right-color: inherit; border-bottom-color: inherit; border-left-color: inherit; height: inherit; layout-flow: inherit; max-height: inherit; max-width: inherit; min-height: inherit; min-width: inherit; mso-ignore: inherit; mso-char-indent-count: inherit; mso-number-format: inherit; mso-rotate: inherit; mso-text-control: inherit; padding: inherit; padding-top: inherit; padding-right: inherit; padding-bottom: inherit; padding-left: inherit; text-decoration: inherit; vertical-align: inherit; width: inherit; writing-mode: inherit; } .msocomtxt {display: none; } .msocomanch {display: none; } .msocomhide {display: none; } a {mso-style-parent:htmlxls2xlsx; color: #0563C1; text-decoration: underline;} b {mso-style-parent:htmlxls2xlsx; font-weight: bold;} big {mso-style-parent:htmlxls2xlsx; font-size: 1.33em;} center {mso-style-parent:htmlxls2xlsx; text-align: center;} code {mso-style-parent:htmlxls2xlsx; font-family: monospace;} div {mso-style-parent:htmlxls2xlsx;} em {mso-style-parent:htmlxls2xlsx; font-style: italic;} font {mso-style-parent:htmlxls2xlsx; } h1 {mso-style-parent:htmlxls2xlsx; display: block; font-size: 2em; margin-top: 0.67em; margin-bottom: 0.67em; margin-left: 0; margin-right: 0; font-weight: bold;} h2 {mso-style-parent:htmlxls2xlsx; display: block; font-size: 1.5em; margin-top: 0.83em; margin-bottom: 0.83em; margin-left: 0; margin-right: 0; font-weight: bold;} h3 {mso-style-parent:htmlxls2xlsx; display: block; font-size: 1.17em; margin-top: 1em; margin-bottom: 1em; margin-left: 0; margin-right: 0; font-weight: bold;} h4 {mso-style-parent:htmlxls2xlsx; display: block; margin-top: 1.33em; margin-bottom: 1.33em; margin-left: 0; margin-right: 0; font-weight: bold; } h5 {mso-style-parent:htmlxls2xlsx; display: block; font-size: .83em; margin-top: 1.67em; margin-bottom: 1.67em; margin-left: 0; margin-right: 0; font-weight: bold; } h6 {mso-style-parent:htmlxls2xlsx; display: block; font-size: .67em; margin-top: 2.33em; margin-bottom: 2.33em; margin-left: 0; margin-right: 0; font-weight: bold; } hr {mso-style-parent:htmlxls2xlsx; border-bottom: 0.5pt solid windowtext; } i {mso-style-parent:htmlxls2xlsx; font-style: italic;} p {mso-style-parent:htmlxls2xlsx; display: block; margin-top: 1em; margin-bottom: 1em; margin-left: 0; margin-right: 0;} pre {mso-style-parent:htmlxls2xlsx; display: block; font-family: monospace; white-space: pre; margin: 1em 0;} u {mso-style-parent:htmlxls2xlsx; text-decoration: underline;} s {mso-style-parent:htmlxls2xlsx; text-decoration: line-through;} small {mso-style-parent:htmlxls2xlsx; font-size: 0.75em;} span {mso-style-parent:htmlxls2xlsx;} strike {mso-style-parent:htmlxls2xlsx; text-decoration: line-through;} strong {mso-style-parent:htmlxls2xlsx; font-weight: bold;} table {display: table; border-collapse: separate; border-spacing: 2px; border-color: gray; vertical-align: middle; font-family: sans-serif;} tbody {display: table-row-group; vertical-align: middle; border-color: inherit;} td {display: table-cell; vertical-align: inherit; white-space: normal; font-size: 1em;} th {display: table-cell; vertical-align: inherit; font-size: 1em; font-weight: bold; text-align: center; white-space: nowrap;} thead {display: table-header-group; vertical-align: middle; border-color: inherit;} tr {display: table-row; vertical-align: inherit; border-color: inherit;} """ self.font_map = {'calibri': 'Calibri', 'arial': 'Arial', 'serif': 'Times New Roman', 'monospace': 'Courier New', 'sans-serif': 'Calibri', 'cursive': 'Comic Sans MS'} self.number_format_replacements = {'Fixed': r'0.00', 'Number': r'0.00', 'Short Date': r'm\/dd\/yyyy', 'Long Date': r'dddd\, mmmm d\, yyyy', 'Short Time': r'h\:mm AM/PM', 'Long Time': r'h\:mm\:ss AM/PM', 'Percent': r'0.00%', 'Scientific': r'0.00E+00'} self.pattern_map = {'none': None, 'reverse-diag-stripe': 'darkDown', 'thin-reverse-diag-stripe': 'lightDown', 'gray-75': 'darkGray', 'gray-25': 'lightGray', 'thick-diag-cross': 'darkTrellis', 'thin-diag-cross': 'lightTrellis', 'diag-stripe': 'darkUp', 'thin-diag-stripe': 'lightUp', 'gray-0625': 'gray0625', 'gray-125': 'gray125', 'diag-cross': 'darkGrid', 'thin-horz-cross': 'lightGrid', 'vert-stripe': 'darkVertical', 'thin-vert-stripe': 'lightVertical', 'horz-stripe': 'darkHorizontal', 'thin-horz-stripe': 'lightHorizontal', 'gray-50': 'mediumGray', 'solid': 'solid'} self.default_css = {} self.add_style_sheet(self.default_styles) self.default_css = self.stylemap.copy() if TRACE: print(f'default_css = {self.default_css}') def __str__(self): return '\n'.join([f'{style}\n{self.stylemap[style]}' for style in self.stylemap])
[docs] def map_font(self, font): """Map the given font to something that Excel probably has""" fl = font.lower() if fl in self.font_map: return self.font_map[fl] return font
@staticmethod def _fixup_styles(style): if 'mso-number-format' not in style: return style # Fixup zip-code style which parseStyle changes from 00000 to 0 style = re.sub(r'mso-number-format\s*:\s*(0[^;]*)', r'mso-number-format:"\1"', style) # Fixup mso-number-format style where they forget to add a space after an escape # (per css documentation, spaces after an escape sequence are eaten) return re.sub(r'\\0022 ', r'\\0022 ', style)
[docs] def add_style_sheet(self, style): if not style: return css = cssutils.parseString(self._fixup_styles(style), validate=False) for rule in css: if rule.type == rule.STYLE_RULE: if ',' in rule.selectorText: # e1,e2 selects all e1 elements and all e2 element elements = rule.selectorText.split(',') d = self._style_to_dict(rule.style) for element in elements: e = element.strip() self.stylemap[e] = self.update_style(self.stylemap.get(e), d, e) elif '>' in rule.selectorText or '[' in rule.selectorText: e = rule.selectorText.strip().replace(' ', '') if '>' in e: tag = e.split('>')[-1] else: tag = None self.stylemap[e] = self.update_style(self.stylemap.get(e), self._style_to_dict(rule.style), tag) else: rst = rule.selectorText.strip() rst = rst.replace('tbody', '').replace('thead', '') # We don't send these thru while ' ' in rst: rst = rst.replace(' ', ' ') if rst: # If it's just 'tbody', now it's empty tag = rst.split()[-1] self.stylemap[rst] = self.update_style(self.stylemap.get(rst), self._style_to_dict(rule.style), tag)
def _style_to_dict(self, style): result = {} for item in style: if item.name == 'mso-style-parent': # Special Microsoft style inheritance if f'.{item.value}' in self.stylemap: parent = self.stylemap[f'.{item.value}'] result = self.update_style(parent, result, parent=False) else: result[item.name] = item.value return result def _font_size(self, size): """Convert the size from a <font size="N"> tag to a point size""" result = '12pt' try: default = 3 if size[0] == '+': size = default + int(size[1:]) elif size[0] == '-': size = default - int(size[1:]) else: size = int(size) size = min(max(size, 1), 7) result = self.pt_list[size-1] except Exception: pass return result
[docs] def update_style(self, style, new_style, new_tag=None, parent=False): """Like normal dict.update() but handle relative size fonts (% and em), inherit, and initial. The new_tag specifies the tag name associated with the new_style and is used to process values of "initial". If parent is True (default), then the "style" element is the parent style of "new_style", which determines if properties are inherited or just merged. """ if not new_style: return style.copy() result = {} if parent: for item, value in style.items(): if item in self.inherited_properties: result[item] = value elif style: result = style.copy() for item, value in new_style.items(): if value == 'inherit': if item in result: continue elif style and item in style: result[item] = style[item] continue if self.default_css: value = 'initial' elif parent: continue if value == 'initial': if new_tag in self.default_css and item in self.default_css[new_tag]: value = self.default_css[new_tag][item] else: continue if item == 'font-size' and (value.endswith('em') or value.endswith('%')) and parent: fs = CSSStyle.get_value(value) units = CSSStyle.get_units(value) if units == '%': fs /= 100 if 'font-size' in result: pt = CSSStyle.get_pt(result['font-size']) else: pt = CSSStyle.DEFAULT_POINT_SIZE if pt*fs != 0.0: result['font-size'] = f'{pt*fs}pt' else: result[item] = value return result
[docs] def apply_style(self, tags): """Apply the appropriate style to the given element(s). The tags is a list of tuples of (tag_name, tag_attrs), given in the proper nested order. Handled: * .class #id element element.class element,element (handled above) element element (direct descendants only) element>element [attribute] [attribute=value] Not handled: .class1.class2 .class1 .class2 element+element element~element """ master_style = {} element_list = [] for tag_name, attrs in tags: style = {} # Style for this tag element_list.append(tag_name) if tag_name == 'font': color = attrs.get("color") face = attrs.get("face") size = attrs.get("size") styl = '' if color: styl += f'color: {color};' if face: styl += f'font-family: {face};' if size: styl += f'font-size: {self._font_size(size)};' css = cssutils.parseStyle(styl) style = self.update_style(style, self._style_to_dict(css), tag_name) if '*' in self.stylemap: style = self.update_style(style, self.stylemap['*'], tag_name) if tag_name in self.stylemap: style = self.update_style(style, self.stylemap[tag_name], tag_name) for i in range(len(element_list)-1): # table tr td; tr td elems = ' '.join(element_list[i:]) # e1 e2 ... if elems in self.stylemap: style = self.update_style(style, self.stylemap[elems], tag_name) elems = '>'.join(element_list[i:]) # e1>e2 ... if elems in self.stylemap: style = self.update_style(style, self.stylemap[elems], tag_name) class_ = attrs.get('class') if class_ is not None: if not isinstance(class_, list): class_ = [class_] for c in class_: if f'{tag_name}.{c}' in self.stylemap: style = self.update_style(style, self.stylemap[f'{tag_name}.{c}'], tag_name) if f'.{c}' in self.stylemap: style = self.update_style(style, self.stylemap[f'.{c}'], tag_name) id_ = attrs.get('id') if id_ is not None and f'#{id_}' in self.stylemap: style = self.update_style(style, self.stylemap[f'#{id_}'], tag_name) for attr in attrs: if f'[{attr}]' in self.stylemap: style = self.update_style(style, self.stylemap[f'[{attr}]'], tag_name) if f'[{attr}={attrs[attr]}]' in self.stylemap: style = self.update_style(style, self.stylemap[f'[{attr}={attrs[attr]}]'], tag_name) if attr == 'align': style['text-align'] = attrs[attr] elif attr == 'valign': style['vertical-align'] = attrs[attr] styl = attrs.get('style') if styl is not None: css = cssutils.parseStyle(self._fixup_styles(styl)) style = self.update_style(style, self._style_to_dict(css), tag_name) master_style = self.update_style(master_style, style, tag_name, parent=True) for k, v in list(master_style.items()): # If any inherit or initial come thru to the end, zap them! if v == 'inherit' or v == 'initial': del master_style[k] return master_style
[docs] def parse_style(self, styl): """Parse a style string (e.g. from style="...") to a dict""" css = cssutils.parseStyle(self._fixup_styles(styl)) return self._style_to_dict(css)
[docs] @staticmethod def format_style(style): """Convert a parsed style dict back to a style string""" result = [] for k, v in style.items(): result.append(f'{k}:{v}') return ';'.join(result)
[docs] @staticmethod def to_xlsx_color(color): result = Color() # Basic black try: if color[0] != '#': color = webcolors.name_to_hex(color) if len(color) == 4 and color[0] == '#': # #345 => #334455 color = color[0] + color[1] + color[1] + color[2] + color[2] + color[3] + color[3] result = Color(color[1:]) except Exception: pass return result
[docs] @staticmethod def get_value(item, default=0.0): """For an item like 0.5pt, get the float value = 0.5""" m = re.match(r'([+-]?(?:\d+(?:[.]\d*)?)|[.]\d+)', item) if m: return float(m.group(1)) return default
[docs] @staticmethod def get_units(item, default=None): """For an item like 0.5pt, return the units = 'pt'""" m = re.match(r'([+-]?(?:\d+(?:[.]\d*)?)|[.]\d+)([^\d]+)$', item) if m: return m.group(2) return default
[docs] @staticmethod def get_pt(item, default=0.0, default_units='px', spreadsheet_pt=SPREADSHEET_WIDTH_PX*0.75): """For an item like 0.5pt or 2px, get the float value in units of pt""" item = str(item) result = CSSStyle.get_value(item, default) units = CSSStyle.get_units(item, default_units) scale = 1.0 scales = {'px': 0.75, 'em': 11.955168, 'in': 72, 'mm': 2.808, 'cm': 28.08, 'Q': 0.702, 'pc': 12, '%': spreadsheet_pt // 100} scale = scales.get(units, scale) return result * scale
[docs] @staticmethod def get_px(item, default=0.0, spreadsheet_px=SPREADSHEET_WIDTH_PX): """For an item like 0.5pt or 2px, get the float value in units of px""" result = CSSStyle.get_pt(item, default, spreadsheet_pt=spreadsheet_px*0.75) result /= 0.75 # convert pt to px return result
[docs] @staticmethod def px_to_units(px): """Convert pixels to excel column width units (determined empirically)""" return px / 7
[docs] @staticmethod def units_to_px(units): """Convert Excel column width units to pixels""" return units * 7
[docs] def style_to_xlsx(self, style, font=None, fill=None, border=None, alignment=None, number_format=None): """Convert this style to the appropriate attributes for openpyxl. returns a tuple with (font, fill, border, alignment)""" if font: font = copy.deepcopy(font) else: font = Font() if fill: fill = copy.deepcopy(fill) else: fill = PatternFill() if border: border = copy.deepcopy(border) else: border = Border() if alignment: alignment = copy.deepcopy(alignment) else: alignment = Alignment() if not number_format: number_format = 'General' def get_side(value): val_split = value.split() if 'none' in value or 'hidden' in value: border_style = None elif 'solid' in value: border_style = 'medium' size = self.get_value(val_split[0], 1.0) if size <= 0.25: border_style = 'hair' elif size <= 0.5: border_style = 'thin' elif size >= 1.5: border_style = 'thick' elif 'double' in value: border_style = 'double' elif 'dotted' in value: border_style = 'dotted' elif 'dashed' in value: border_style = 'dashed' color = val_split[-1] side = Side(border_style=border_style, color=self.to_xlsx_color(color)) return side for item, value in style.items(): try: if item == 'color': font.color = self.to_xlsx_color(value) elif item == 'border': # 0.5pt solid windowtext side = get_side(value) border = Border(left=side, right=side, top=side, bottom=side) elif item == 'border-left-style' or item == 'border-left': # 5px dotted red border.left = get_side(value) elif item == 'border-right-style' or item == 'border-right': # 5px dotted red border.right = get_side(value) elif item == 'border-top-style' or item == 'border-top': # 5px dotted red border.top = get_side(value) elif item == 'border-bottom-style' or item == 'border-bottom': # 5px dotted red border.bottom = get_side(value) elif item == 'font-size': sz = self.get_pt(value) if sz: font.sz = sz elif item == 'font-weight': # normal; bold; 400; 700 if value == 'bold' or self.get_value(value) > 400: font.b = True elif item == 'font-style': # normal; italic; oblique if value == 'italic' or value == 'oblique': font.i = True elif item == 'font-family': # Arial; Arial, sans-serif; "Comic Sans MS", sans-serif font.name = self.map_font(value.split(',')[0].replace('"', '').replace("'", '').strip()) vl = value.lower() if 'sans' in vl or 'calibri' in vl or 'arial' in vl or 'swiss' in vl or \ 'helvetica' in vl or 'impact' in vl or 'tahoma' in vl or 'veranda' in vl: font.family = 2.0 elif 'serif' in vl or 'roman' in vl or 'baskerville' in vl or 'bookman' in vl or \ 'modern' in vl or 'cambria' in vl or 'century' in vl or 'georgia' in vl: font.family = 1.0 elif 'mono' in vl or 'fixed' in vl or 'courier' in vl or 'consol' in vl or \ 'modem' in vl or 'typewriter' in vl or 'terminal' in vl: font.family = 3.0 elif 'script' in vl or 'cursive' in vl: font.family = 4.0 elif 'decorative' in vl or 'old english' in vl or 'fraktur' in vl or \ 'antiqua' in vl: font.family = 5.0 elif item == 'text-align': # center alignment.horizontal = value elif item == 'text-decoration': # none; underline if 'underline' in value: if font.u != 'double': font.u = 'single' if 'line-through' in value: font.strike = True elif item == 'text-underline-style': # single; double if value == 'double': font.u = 'double' elif item == 'vertical-align': # bottom if value == 'middle': value = 'center' if value in {'distributed', 'justify', 'bottom', 'top', 'center'}: alignment.vertical = value elif item == 'white-space': # nowrap if value != 'nowrap': alignment.wrap_text = True elif item == 'background': #this that these those blue value = value.split()[-1] if fill.patternType is None: fill.patternType = 'solid' fill.fgColor = self.to_xlsx_color(value) fill.bgColor = self.to_xlsx_color(value) #fill = PatternFill(patternType='solid', fgColor=self.to_xlsx_color(value), bgColor=self.to_xlsx_color(value)) elif item == 'background-color': #f2f2f2 #fill = PatternFill(patternType='solid', fgColor=self.to_xlsx_color(value), bgColor=self.to_xlsx_color(value)) if fill.patternType is None: fill.patternType = 'solid' fill.fgColor = self.to_xlsx_color(value) fill.bgColor = self.to_xlsx_color(value) elif item == 'mso-pattern': # black gray-50 if value != 'auto': value = value.split() fill.fgColor = self.to_xlsx_color(value[0]) pattern = 'solid' if len(value) == 2: pattern = value[1] if pattern in self.pattern_map: fill.patternType = self.pattern_map[pattern] elif item == 'mso-rotate': if isinstance(value, int) or value.isdigit() or value[0:1] == '-' and value[1:].isdigit(): rot = int(value) if 1 <= rot <= 180: alignment.textRotation = rot elif -90 <= rot <= -1: alignment.textRotation = 90 - rot elif item == 'layout-flow': if 'vertical' in value: alignment.textRotation = 255 elif item == 'writing-mode': if value == 'tb-rl': alignment.textRotation = 255 elif item == 'mso-text-control': # mso-text-control:shrinktofit if value == 'shrinktofit': alignment.shrink_to_fit = True elif item == 'mso-char-indent-count': alignment.indent = value elif item == 'mso-number-format': # mso-number-format:"Short Date" if TRACE: print('mso-number-format=', end='') for c in value: print(f'{c}({ord(c):02x})', end='') print('') value = value.strip() if value and value[0] in '"\'' and value[-1] in '"\'': value = value[1:-1] number_format = value.strip() # Examples: # _-* #,##0.00" €"_-;-* #,##0.00" €"_-;_-* -??" €"_-;_-@_- # _-* #,##0.00 [$€-407]_-;-* #,##0.00 [$€-407]_-;_-* "-"?? [$€-407]_-;_-@_- number_format = self.number_format_replacements.get(number_format, number_format). \ replace('\\\\', '\\') #replace(r'\\', '').replace(r'\[', '[').replace(r'\]', ']').replace(r'\.', '.'). \ #replace(r'\#', '#').replace(r'\/', '/').replace(r'\,', ',') number_format = re.sub(r'\\([.#/,;_"\'\[\]\(\)])', r'\1', number_format) if number_format == r'\@': number_format = '@' if TRACE: print(' number_format=', end='') for c in number_format: print(f'{c}({ord(c):02x})', end='') print('') except Exception as e: if TRACE: print(f'style_to_xlsx: exception {e} on {item}: {value}') traceback.print_exc() return (font, fill, border, alignment, number_format)
[docs] @staticmethod def fixup_excel_width(wid): # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/293 if wid >= 1.29: result = wid + 0.71 else: result = wid * 1.8 if result == 0.0: return result return min(CSSStyle.MAX_CELL_WIDTH_UNITS, max(CSSStyle.px_to_units(CSSStyle.MIN_CELL_WIDTH_PX), result))
[docs] @staticmethod def join(base, fn): """Join a base URL with a filename or a base local path with a filename""" if '://' in base or '://' in fn: # URL result = urljoin(base + '/', fn) else: result = os.path.join(base, fn) if TRACE: print(f'join({base}, {fn}) = {result}') return result
[docs] @staticmethod def read(f, mode='', quiet=False, retries=RETRIES): """Read from either a URL or a filename or file-like object. If mode is 'b', then read in binary. If f is a file-like object and you want it read in unicode with the proper encoding, then open it using 'rb' mode and don't pass a mode to this method. If quiet is True, then return None rather than raising an exception on errors""" def bytes_to_str(b, filename=None): is_html = True if b.startswith(b'MIME-Version:'): is_html = False if filename and filename.endswith('.css'): is_html = False d = UnicodeDammit(b, is_html=is_html) result = d.unicode_markup return result if isinstance(f, str): if '://' in f: # URL for r in range(retries): try: resp = requests.get(f, headers=REQUESTS_HEADER) resp.raise_for_status() if mode == 'b': return resp.content #return resp.text return bytes_to_str(resp.content, f) except Exception: if r == retries-1: if quiet: return None raise sleep(2) try: with open(f, 'rb') as t: contents = t.read() if mode == 'b': return contents return bytes_to_str(contents, f) except Exception: if quiet: return None raise elif isinstance(f, bytes): if mode == 'b': return f return bytes_to_str(f) else: if 'b' in f.mode and mode != 'b': return bytes_to_str(f.read()) return f.read()
[docs] @staticmethod def css_escape_unicode(s): def _sub(m): ch = ord(m.group(1)) return f'\\{ch:x} ' return re.sub(r'([^\x00-\x7F])', _sub, s)
[docs] @staticmethod def html_escape_unicode(s): def _sub(m): ch = ord(m.group(1)) return f'&#x{ch:x};' return re.sub(r'([^\x00-\x7F])', _sub, s)
[docs]class HTMLXLS2XLSX: """Convert an xls file with html contents into and xlsx file""" def __init__(self, f, dirname='.'): """f is a url, filename, file object, or html string""" self.dirname = dirname if isinstance(f, str): fl = f.lower() if '<table' in fl or '<frame' in fl: self.text = f else: self.dirname = os.path.split(f)[0] self.text = CSSStyle.read(f) else: self.text = CSSStyle.read(f) self.made_dir = None csm = currency_symbols_constants.CURRENCY_SYMBOLS_MAP def has_symbol(symbol): """Return True iff this symbol is not just a sequence of normal ascii letters, so things like 'KM' will return False but '$b' gives True""" m = re.match(r'^[A-Za-z]+$', symbol) if m: return False return True currency_symbols_regex = '|'.join([re.escape(symbol) for symbol in csm.values() if has_symbol(symbol)]) self.RE_INT = re.compile(r'^\s*(?P<sign>[+-]?)(?P<curr_left>\s*(?:' + currency_symbols_regex + r')\s*)?(?P<int>\d+)(?P<curr_right>\s*(?:' + currency_symbols_regex + r'))?\s*$') self.RE_FLOAT = re.compile(r'^\s*(?P<sign>[+-]?)(?P<curr_left>\s*(?:' + currency_symbols_regex + r')\s*)?(?P<float>(?:\d+(\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)(?P<curr_right>\s*(?:' + currency_symbols_regex + r'))?\s*$') self.RE_YEARFIRST = re.compile(r'(?:yy.*m.*d)|(?:yy.*d.*m)', re.I) self.RE_DAYFIRST = re.compile(r'd[d\\/ -]+m', re.I) self.EARLIEST_DATE = 1900 # Excel limitation if self.text.startswith('MIME-Version:'): # Unpack mht file into it's specified tempdir msg = email.message_from_string(self.text) first_contents = None filebase = self.dirname.replace('://', '/') for part in msg.walk(): if part.get_content_maintype() == 'multipart': continue filename = part.get_filename() if not filename: # Content-Location: file:///C:/D1A61EE8/width.htm filename = part['Content-Location'].replace('file://', '') filename = re.sub(r'/[C-Z]:/', './', filename).replace(':', '') if not filename.startswith('.'): # No root filenames if filename.startswith('/') or filename.startswith('\\'): filename = '.' + filename else: filename = './' + filename dirname = os.path.join(filebase, os.path.split(filename)[0]) try: split_path = re.split(r'[/\\]', dirname) subpaths = [os.path.sep.join(split_path[:i]) for i in range(1, len(split_path)+1)] for subpath in subpaths: if not os.path.exists(subpath): os.makedirs(dirname) if self.made_dir is None: self.made_dir = subpath if TRACE: print(f'self.made_dir = {self.made_dir}') break except FileExistsError: pass with open(os.path.join(filebase, filename), 'wb') as fp: cte = part.get('content-transfer-encoding', '').lower() if cte == 'quoted-printable': payload = part.get_payload(decode=False) try: bpayload = payload.encode('ascii') except UnicodeError: # Excel saves "quoted-printable" files that contain non-ascii Unicode # characters, so we need to appropriately encode them in either # css encoding or html encoding style if filename.endswith('css'): payload = CSSStyle.css_escape_unicode(payload) else: payload = CSSStyle.html_escape_unicode(payload) bpayload = payload.encode('ascii') payload = quopri.decodestring(bpayload) else: payload = part.get_payload(decode=True) if not first_contents: first_contents = payload first_filename = filename fp.write(payload) if first_contents: # The first file is the "workbook" that contains a frameset and frames for each sheet. #self.text = str(first_contents, 'utf-8') self.text = UnicodeDammit(first_contents, is_html=True).unicode_markup self.dirname = os.path.split(os.path.join(filebase, first_filename))[0] warnings.filterwarnings("ignore", category=UserWarning, module='bs4') warnings.filterwarnings("ignore", category=UserWarning, module='cssutils') warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=GuessedAtParserWarning) # the newlines are seen as bogus text elements, so get rid of them: (except this messes up <pre>) #self.text = ' '.join(self.text.split()) #self.text = re.sub(r'''<span\s+style=["']mso-spacerun:yes["']>(\s+)</span>''', lambda m: '&nbsp;' * len(m.group(1)), self.text) self.text = re.sub(r'>[ \n\t\r\f\v]+<', '><', self.text) #self.text = self.text.replace('> <', '><') stl = self.text.lower() if '<table' not in stl and '<frame' not in stl: if isinstance(f, str): raise ValueError(f'No <table> tags found in {f} - maybe this is Excel 5.0/95 format? If so, try using XLS2XLSX instead.') else: raise ValueError('No <table> tags found - maybe this is Excel 5.0/95 format? If so, try using XLS2XLSX instead.') self.url_soup = BeautifulSoup(self.text)
[docs] def to_xlsx(self, filename=None, workbook=None, worksheet=None, sheet_name=None): """Convert to xlsx using openpyxl. If filename is not None, then the result is written to that file, and the filename is returned, else the workbook is returned. If workbook is passed, then the worksheet is written to the given workbook""" if workbook: wb = workbook ws = worksheet else: wb = Workbook() # Creates one worksheet ws = wb.active # Handle a modern "save as htm" file from excel (including converted mht files) # <frame src="Styles_files/tabstrip.htm" name="frTabs" marginwidth=0 marginheight=0> frames_html = self.url_soup.find_all('frame') if frames_html: tabstrip = frames_html[-1] src = tabstrip['src'] tabstrip_hx2x = self.__class__(CSSStyle.join(self.dirname, src)) # <a href="sheet001.htm" target="frSheet"><font face="Arial" color="#000000">Sheet1</font></a> a_html = tabstrip_hx2x.url_soup.find_all('a') bn = os.path.split(CSSStyle.join(self.dirname, src))[0] for a in a_html: href = a['href'] sn = a.get_text().strip() fn = CSSStyle.join(bn, href) a_hx2x = self.__class__(fn) a_hx2x.to_xlsx(workbook=wb, worksheet=ws, sheet_name=sn) ws = None if self.made_dir: # If we made the directory from a mht file, then remove it when we're done with it try: shutil.rmtree(self.made_dir) if TRACE: print(f'removed {self.made_dir}') except Exception as e: if TRACE: print(f'Exception removing {self.made_dir}: {e}') if filename: wb.save(filename=filename) return filename return wb css_style = CSSStyle() font_utils = FontUtils() styles_html = self.url_soup.find_all(['link', 'style']) for s in styles_html: if s.name == 'link': rel = s.get('rel') if not rel or (isinstance(rel, list) and rel[0].lower() != 'stylesheet') or \ (isinstance(rel, str) and rel.lower() != 'stylesheet'): continue try: fn = s.get('href') cs = CSSStyle.read(CSSStyle.join(self.dirname, fn), quiet=True) if TRACE: print(f'Writing TRACE_{fn}') with open(f'TRACE_{fn}', 'w', encoding='utf-8') as tr: tr.write(cs) css_style.add_style_sheet(cs) except Exception: pass else: css_style.add_style_sheet(str(s.encode_contents(), 'utf-8')) # Put elements that are not in a table in a table so we can handle them properly below new_row_elems = {'p', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'div', 'pre', 'ol', 'ul', 'dl', 'hr', 'blockquote', 'footer'} if TRACE: print('Looking for non-table content') body = self.url_soup.find('body') if not body: body = self.url_soup.new_tag('body') while self.url_soup.contents: elem = self.url_soup.contents[0] body.append(elem.extract()) if 'html' in self.url_soup: self.html.append(body) else: self.url_soup = body table = None cell = None col_widths = {} # Measured in units, key is column letter col_widths_no_wrap = {} # Measured in units, key is column letter col_max_widths = {} while True: for elem in body.contents: if elem == table: continue if isinstance(elem, (CData, Comment, ProcessingInstruction, Declaration, Doctype)): continue if isinstance(elem, NavigableString) and not str(elem.string).strip(): continue # Ignore whitespace between elements if elem.name == 'table': # If we have an extra table row at the bottom like this one, then use it to set the column # widths, then zap it. """ <![if supportMisalignedColumns]> <tr height=0 style='display:none'> <td width=87 style='width:65pt'></td> <td width=0></td> </tr> <![endif]>""" for e in elem.contents: if isinstance(e, Declaration) and e.string == 'if supportMisalignedColumns': r = e.find_next_sibling() if r.name == 'tr' and 'style' in r.attrs and r.attrs['style'] == 'display:none': cc = 0 for c in r.contents: if c.name != 'td': continue cc += 1 if 'width' not in c.attrs: continue cl = get_column_letter(cc) width = CSSStyle.px_to_units(CSSStyle.get_px(c.attrs['width'])) col_widths[cl] = width col_max_widths[cl] = width col_widths_no_wrap[cl] = width r.extract() # Zap it! break table = None # We need to make a new one if needed continue if elem.name in ('link', 'style', 'script'): continue if elem.name == 'div' and 'style' in elem.attrs and 'mso-element:comment-list' in elem.attrs['style']: continue # Ignore the cell comments which are stuck at the end of the file in a div if TRACE: print(f'Processing {elem}') if not table: if TRACE: print('Creating table') table = self.url_soup.new_tag('table') table.attrs = body.attrs # Copy bgcolor, style, etc elem.insert_after(table) if elem.name in new_row_elems: cell = None if not cell: if TRACE: print('Creating new row and cell') row = self.url_soup.new_tag('tr') cell = self.url_soup.new_tag('td') row.append(cell) table.append(row) if TRACE: print('Adding element to cell') extracted = elem.extract() cell.append(extracted) break # Start over if we made a change else: break if TRACE: print(f'Styles = \n{css_style}') def str_size(s, font, alignment, max_width, max_height, keep_newlines=False, fully_merged=False): """Get approximate width, height of a string, expressed in pixels.""" is_bold = font.b font_size = font.sz if not font_size: font_size = 11 wrap_text = alignment.wrap_text rotation = alignment.textRotation height = FontUtils.pt_to_px(font_size) * FontUtils.LINE_HEIGHT_FACTOR def almost_vertical(rotation): # 180 means -90 and 255 means vertical with each letter rotated also return 75 <= rotation <= 90 or 165 <= rotation <= 180 or rotation == 255 if alignment.shrink_to_fit: width, _ = font_utils.get_font_size(font, 'a') elif wrap_text: lines = s.split('\n') # Compute the width of the widest line mx = 0 for line in lines: w, h = str_size(line, font, Alignment(wrap_text=False, textRotation=rotation), max_width, max_height) mx = max(mx, w) mx = min(mx, max_width) my = font_utils.lines_needed(mx, s, font) * height # Attempt to optimize the cell size width = mx height1 = height height = my if not fully_merged: # If we don't span the whole width, try to optimize the size if mx/my >= 2: # Speed this up by pre-computing the value based on the area of the rectangle area = width * height # The area of the desired result with a 2:1 aspect ratio is mx * my = (2*my)*my = 2*my**2 # so we solve for my. my = math.sqrt(area / 2.0) if my < max_height: mx = 2 * my width = mx my = font_utils.lines_needed(mx, s, font) * height1 height = my else: my = height while my < max_height and mx/my >= 2: # Go for a 2:1 aspect ratio if possible width = mx height = my mx = mx * 0.9 my = font_utils.lines_needed(mx, s, font) * height1 if TRACE: print(f'str_size: trying {mx, my}') elif keep_newlines: height = 0 width = 0 for line in s.split('\n'): w, h = font_utils.get_font_size(font, line) width = max(width, w) height += h else: width, height = font_utils.get_font_size(font, s.replace('\n', '')) if width > max_width: # This line will be split splits = math.ceil(width / max_width) width = max_width height *= splits if height > max_height: height = max_height if TRACE: print(f'str_size({s}, is_bold={is_bold}, font_size={font_size}, font_name={font.name}, rotation={rotation}, wrap_text={wrap_text}, max_width={max_width}, max_height={max_height}, keep_newlines={keep_newlines}) = {width, height}') if almost_vertical(rotation): return (height, width) return (width, height) tables_html = self.url_soup.find_all("table") if sheet_name is None: m = re.search(r'<x:Name>([^<]+?)</x:Name>', self.text) if m: sheet_name = m.group(1) else: sheet_name = 'Sheet1' if ws: ws.title = sheet_name else: ws = wb.create_sheet(sheet_name) ws.sheet_view.showGridLines = False def type_it(value, number_format=None): if not value: return value v = value if isinstance(v, float) or isinstance(v, date) or isinstance(v, timedelta): return v if isinstance(v, int): if number_format and ('[h' in number_format or '[m' in number_format or '[s' in number_format): v = str(v) else: return v if isinstance(v, str): v = v.replace(',', '').strip() if v[-1:] == '%': try: return round(float(v[:-1]) / 100.0, 15) except Exception: pass if number_format and ('[h' in number_format or '[m' in number_format or '[s' in number_format): # timedelta grabit = [] def grab_escapes(number_format): nonlocal grabit def sub_grabit(m): i = len(grabit) grabit.append(re.escape(m.group(1))) return f'{{{i}}}' nf = re.sub(r'\\(.)', sub_grabit, number_format) nf = re.sub(r'"([^"]*)"', sub_grabit, nf) return nf def restore_escapes(nf): nonlocal grabit if len(grabit): nf = nf.format(*grabit) # Put escaped chars back in return nf pattern = grab_escapes(number_format.split(';')[0]) pattern = re.sub(r'\[hh?\]', r'(?P<HOURS>\\d+)', pattern) pattern = re.sub(r'\[mm?\]', r'(?P<TOTAL_MINUTES>\\d+)', pattern) pattern = re.sub(r'\[ss?\]', r'(?P<TOTAL_SECONDS>\\d+)', pattern) pattern = re.sub(r'mm?', r'(?P<MINUTES>\\d+)', pattern) pattern = re.sub(r'ss?', r'(?P<SECONDS>\\d+)', pattern) pattern = re.sub(r'[.]00*', r'(?P<FRACTION>\\d+)', pattern) pattern = restore_escapes(pattern) m = re.match(pattern, value) if m: def get(what, default=0): nonlocal m try: return int('0' + m.group(what)) except IndexError: return default hours = get('HOURS') total_minutes = get('TOTAL_MINUTES', None) minutes = get('MINUTES') total_seconds = get('TOTAL_SECONDS', None) seconds = get('SECONDS') fraction = float('0.' + str(get('FRACTION'))) if TRACE: print(f'type_it({value}, {number_format}) found timedelta: {hours}h{minutes}m{seconds}s {total_minutes}total_m {total_seconds}total_s') if total_seconds is not None: return timedelta(seconds=total_seconds+fraction) if total_minutes is not None: return timedelta(minutes=total_minutes,seconds=seconds+fraction) return timedelta(seconds=(hours*60+minutes)*60+seconds+fraction) elif TRACE: print(f"type_it({value}, {number_format}) timedelta didn't match {pattern.pattern}") m = re.match(self.RE_INT, v) # Match an integer or float value, possibly having a currency symbol on the left or right (but not both!) if m and bool(m.group('curr_left')) + bool(m.group('curr_right')) != 2: return int(m.group('sign') + m.group('int')) m = re.match(self.RE_FLOAT, v) if m and bool(m.group('curr_left')) + bool(m.group('curr_right')) != 2: return round(float(m.group('sign') + m.group('float')), 15) try: dayfirst = False yearfirst = False yearfirst = bool(number_format and re.search(self.RE_YEARFIRST, number_format)) dayfirst = bool(number_format and re.search(self.RE_DAYFIRST, number_format)) dt = date_parse(value, dayfirst=dayfirst, yearfirst=yearfirst, default=datetime(1, 1, 1)) if dt.date() == date(1, 1, 1): return dt.time() if dt.year == 1: dt = dt.replace(year, date.today().year) if dt.year >= self.EARLIEST_DATE: if dt.time() == tm(0): return dt.date() return dt except Exception: pass if isinstance(value, str): return value.strip() return value def guess_date_format(dt, str_dt): """Take a guess at the date format based on a sample in str_dt""" nonlocal css_style if isinstance(dt, datetime): number_format = css_style.number_format_replacements['Short Date'] + ' ' + \ css_style.number_format_replacements['Long Time'] year = dt.year month = dt.month day = dt.day hour = dt.hour hour_12 = dt.hour % 12 if hour_12 == 0: hour_12 = 12 minute = dt.minute second = dt.second elif isinstance(dt, date): number_format = css_style.number_format_replacements['Short Date'] year = dt.year month = dt.month day = dt.day elif isinstance(value, tm): number_format = css_style.number_format_replacements['Long Time'] year = 0 month = 0 day = 0 hour = dt.hour hour_12 = dt.hour % 12 if hour_12 == 0: hour_12 = 12 minute = dt.minute second = dt.second else: return 'General' # Tokenize str_dt: tokens = [] nf = '' token_seen = set() (TOK_HOUR, TOK_HOUR0, TOK_HOUR_12, TOK_HOUR0_12, TOK_MIN, TOK_SEC, TOK_DAY, TOK_DAY0, TOK_MONTH, TOK_MONTH0, TOK_YEAR, TOK_YEAR2, TOK_SHORT_DAY, TOK_LONG_DAY, TOK_SHORT_MONTH, TOK_LONG_MONTH, TOK_SEP, TOK_ampm, TOK_AMPM) = range(19) token_equiv = [set((TOK_HOUR, TOK_HOUR0, TOK_HOUR_12, TOK_HOUR0_12)), set((TOK_DAY, TOK_DAY0)), set((TOK_MONTH, TOK_MONTH0)), set((TOK_SHORT_DAY, TOK_LONG_DAY)), set((TOK_SHORT_MONTH, TOK_LONG_MONTH)), set((TOK_ampm, TOK_AMPM))] token_letters = {TOK_HOUR:'h', TOK_HOUR0:'hh', TOK_HOUR_12:'h', TOK_HOUR0_12:'hh', TOK_MIN:'mm', TOK_SEC:'ss', TOK_DAY:'d', TOK_DAY0:'dd', TOK_MONTH:'m', TOK_MONTH0:'mm', TOK_YEAR:'yyyy', TOK_YEAR2:'yy', TOK_SHORT_DAY:'ddd', TOK_LONG_DAY:'dddd', TOK_SHORT_MONTH:'mmm', TOK_LONG_MONTH:'mmmm', TOK_ampm:'a/p\\m', TOK_AMPM:'AM/PM'} str_dtl = str_dt.lower() has_am_pm = 'am' in str_dtl or 'pm' in str_dtl toks = re.split(r'(\W+|(?:(?<![A-Za-z_])(?:am\b|pm\b|AM\b|PM\b))|(?:(?<=[0-9])[A-Za-z_]+)|(?:(?<=[A-Za-z_])[0-9]+))', str_dt) matches = 0 for i, tok in enumerate(toks): if not tok: continue # Look ahead to mark this as a time if we see a ':' or am/pm or space then am/pm look_ahead = '' for j in range(i+1, len(toks)): look_ahead += toks[j] if len(look_ahead) >= 2: break look_ahead_time = (look_ahead[0:1] == ':' or \ look_ahead[0:2].lower() in ('am', 'pm') or \ look_ahead[0:3].lower() in (' am', ' pm')) if tok.isdigit(): t_type = TOK_SEP if tok == str(year): t_type = TOK_YEAR elif TOK_MONTH not in token_seen and tok == str(month) and not look_ahead_time: t_type = TOK_MONTH elif TOK_MONTH0 not in token_seen and month < 10 and tok == '0' + str(month) and \ not look_ahead_time: t_type = TOK_MONTH0 elif TOK_DAY not in token_seen and tok == str(day) and not look_ahead_time: t_type = TOK_DAY elif TOK_DAY0 not in token_seen and day < 10 and tok == '0' + str(day) and not look_ahead_time: t_type = TOK_DAY0 elif tok == str(year%100) and not look_ahead_time: t_type = TOK_YEAR2 elif TOK_HOUR not in token_seen and tok == str(hour) and not has_am_pm: t_type = TOK_HOUR elif TOK_HOUR0 not in token_seen and hour < 10 and tok == '0' + str(hour) and not has_am_pm: t_type = TOK_HOUR0 elif TOK_HOUR_12 not in token_seen and tok == str(hour_12) and has_am_pm: t_type = TOK_HOUR_12 elif TOK_HOUR0_12 not in token_seen and hour_12 < 10 and tok == '0' + str(hour_12) and has_am_pm: t_type = TOK_HOUR0_12 elif TOK_MIN not in token_seen and tok == f'{minute:02d}': t_type = TOK_MIN elif TOK_SEC not in token_seen and tok == f'{second:02d}': t_type = TOK_SEC elif tok in calendar.day_abbr: t_type = TOK_SHORT_DAY elif tok in calendar.day_name: t_type = TOK_LONG_DAY elif tok in calendar.month_abbr: t_type = TOK_SHORT_MONTH elif tok in calendar.month_name: t_type = TOK_LONG_MONTH elif tok in ('am', 'pm'): t_type = TOK_ampm elif tok in ('AM', 'PM'): t_type = TOK_AMPM else: t_type = TOK_SEP #tok = re.sub(r'([hHmMsSdDyYaApP])', r'\\\1', tok) # Escape pattern letters tok = re.sub(r'([A-Za-z0-9])', r'\\\1', tok) # Escape all letters and numbers t_val = token_letters.get(t_type, tok) if t_val is not tok: matches += 1 # Gotta see 2 good ones to call "success" tokens.append((t_type, t_val)) token_seen.add(t_type) for te in token_equiv: if t_type in te: for t in te: token_seen.add(t) break for t_type, t_val in tokens: nf += t_val if matches >= 2: number_format = nf if TRACE: print(f'guess_date_format({dt}, {str_dt}) = {number_format} (toks={toks})') return number_format def get_heights(elem, style): """Return the height, min_height, max_height of a given element with the given style""" height = None # height could be 0 for hidden rows min_height = None max_height = CSSStyle.MAX_CELL_HEIGHT_PT if 'min-height' in style: min_height = CSSStyle.get_pt(style['min-height']) if 'max-height' in style: max_height = CSSStyle.get_pt(style['max-height']) if 'height' in style: height = CSSStyle.get_pt(style['height']) elif 'min-height' in style: height = CSSStyle.get_pt(style['min-height']) elif 'height' in row.attrs: height = CSSStyle.get_pt(row['height'], spreadsheet_pt=CSSStyle.SPREADSHEET_HEIGHT_PX*0.75) return (height, min_height, max_height) global_n_cols = 0 for table in tables_html: parent = table.parent while parent: if parent.name == 'td': break parent = parent.parent if parent and parent.name == 'td': continue for row in table.find_all("tr"): if row.parent != table and row.parent.parent != table: # Don't look at nested rows (2 checks because of thead/tbody) continue col_tags = row.find_all(["td", "th"]) if len(col_tags) > 0: cols = 0 for col in col_tags: if col.parent != row: continue colspan = col.get("colspan") if colspan is None: cs = 1 else: cs = int(colspan) cols += cs if cols > global_n_cols: global_n_cols = cols if TRACE: print(f'Row {row} set global_n_cols to {global_n_cols}') if TRACE: print(f'global_n_cols = {global_n_cols}') def style_without(style, what='mso-ignore'): result = {} for k, v in style.items(): if k == what: continue result[k] = v return CSSStyle.format_style(result) # Handle mso-xlrowspan:N, which means to pretend there are more rows than specified by <tr> # <tr height=3D40 style=3D'height:30.0pt;mso-xlrowspan:2'> # We insert new rows and split the height up amongst them. xlrowspans = self.url_soup.find_all('tr', style=lambda s: s and 'mso-xlrowspan' in s) for elem in xlrowspans: style = css_style.parse_style(elem.get('style')) rowspan = int(style['mso-xlrowspan']) height = None if 'height' in style: height = CSSStyle.get_pt(style['height']) else: height = elem.get('height') if height is not None: height = int(height) // rowspan elem['height'] = height style['height'] = f'{height}pt' elem['style'] = style_without(style, what='mso-xlrowspan') for r in range(rowspan-1): new_tr = copy.copy(elem) # This actually calls a custom __copy__ in bs4 elem.insert_after(new_tr) # Handle mso-ignore:colspan; mso-ignore:style; mso-ignore:color; mso-ignore:colspan-rowspan # by re-writing the html. Examples: # <td colspan=7 rowspan=15 height=300 width=560 style='mso-ignore:colspan-rowspan;height:225.0pt;width:420pt'>CHART</td> # <td height=20 class=xl79 colspan=2 style='height:15.0pt;mso-ignore:colspan'>Shrink to fit text</td> # <td height=20 colspan=2 style='height:15.0pt;mso-ignore:colspan'></td> # <font color="#0000FF" style='mso-ignore:color'>47712</font> def element_index(elem): parent = elem.parent for i, e in enumerate(parent.contents): if e == elem: return i return None ignores = self.url_soup.find_all(style=lambda s: s and 'mso-ignore' in s) for elem in ignores: style = css_style.parse_style(elem.get('style')) what = style.get('mso-ignore') if what == 'colspan' or what == 'colspan-rowspan': colspan = int(elem.get("colspan")) width = None if 'width' in style: width = CSSStyle.get_px(style['width']) else: width = elem.get("width") if width is not None: width = int(width) // colspan elem['width'] = width style['width'] = f'{width}px' # Leave it in so we can use it later not to set the column width: elem['style'] = style_without(style) if what == 'colspan-rowspan': elem['style'] = elem['style'] + ';mso-ignore:rowspan' del elem['colspan'] for c in range(colspan-1): new_td = self.url_soup.new_tag(elem.name) new_td.attrs = elem.attrs.copy() elem.insert_after(new_td) else: # color or style or whatever try: elem['style'] = style_without(style) del elem[what] except Exception: pass ignores = self.url_soup.find_all(['td', 'th'], style=lambda s: s and 'mso-ignore:rowspan' in s) for elem in ignores: style = css_style.parse_style(elem.get('style')) rowspan = int(elem.get('rowspan')) height = None if 'height' in style: height = CSSStyle.get_pt(style['height']) else: height = elem.get('height') if height is not None: height = int(height) // rowspan elem['height'] = height style['height'] = f'{height}pt' elem['style'] = style_without(style) del elem['rowspan'] ndx = element_index(elem) row = elem.parent for r in range(rowspan-1): new_td = self.url_soup.new_tag(elem.name) new_td.attrs = elem.attrs.copy() row = row.find_next_sibling('tr') if ndx == 0: e = row.contents[0] e.insert_before(new_td) else: e = row.contents[ndx-1] e.insert_after(new_td) # Code based on the work of John Ricco (johnricco226@gmail.com), Apr 4, 2017: # https://johnricco.github.io/2017/04/04/python-html/ # Parse each table row_counter = 0 row_heights = {} # Measured in pt row_heights_per_column_no_wrap = {} row_heights_per_column = {} has_merged_cell = False fully_merged_rows = set() extra_images = [] # If we have more than 1 image per cell, we put them here until the end has_image = set() # Cells (like A1) that have images for table in tables_html: parent = table.parent while parent: if parent.name == 'td': # Nested tables are handled (implicitly) below break parent = parent.parent if parent and parent.name == 'td': continue pf = None bgcolor = table.get('bgcolor') if bgcolor: color = CSSStyle.to_xlsx_color(bgcolor) pf = PatternFill(patternType='solid', fgColor=color, bgColor=color) bor = None b = type_it(table.get('border')) if b: if b == 1: border_style = 'thin' else: border_style = 'medium' side = Side(border_style=border_style) bor = Border(left=side, right=side, top=side, bottom=side) # Create list to store rowspan values skip_index = [0 for i in range(0, global_n_cols)] if TRACE: print(f'skip_index = {skip_index}') # Start by iterating over each row in this table... for row in table.find_all("tr"): if row.parent != table and row.parent.parent != table: # Don't look at nested rows (2 checks because of thead/tbody) continue rw = row_counter+1 # Skip row if it's blank columns = row.find_all(["td", "th"]) lc = len(columns) if lc == 0: continue else: row_heights_per_column[rw] = {} row_heights_per_column_no_wrap[rw] = {} tuples = [('table', table.attrs), ('tr', row.attrs)] style = css_style.apply_style(tuples) height, min_height, max_height = get_heights(row, style) if height is not None: if rw in row_heights: row_heights[rw] = max(row_heights[rw], height) else: row_heights[rw] = height if min_height and row_heights[rw] < min_height: row_heights[rw] = min_height if max_height and row_heights[rw] > max_height: row_heights[rw] = max_height # Get all cells containing data in this row col_dim = [] row_dim = [] col_dim_counter = -1 row_dim_counter = -1 col_counter = -1 this_skip_index = copy.deepcopy(skip_index) if len(columns) < global_n_cols: # Excel is one big table, so if we have multple tables in the html, # we must merge them into one, and if the rows have a different # # of columns, we set the colspan on the last column to make up the # difference. col_count = 0 for si in skip_index: if si > 0: col_count += 1 for col in columns: if col.parent != row: continue colspan = col.get("colspan") if colspan is None: cs = 1 else: cs = int(colspan) col_count += cs if col_count != global_n_cols: colspan = columns[-1].get("colspan") if colspan is None: cs = 1 else: cs = int(colspan) if TRACE: print(f'setting colspan for last col on row {rw} to {cs+global_n_cols-col_count}') columns[-1]['colspan'] = cs + (global_n_cols - col_count) images = [] for col in columns: if col.parent != row: continue tuples = [('table', table.attrs), ('tr', row.attrs), (col.name, col.attrs)] alg = None if 'align' in col.attrs: alg = Alignment(horizontal=col.attrs['align']) if 'valign' in col.attrs: if value == 'middle': value = 'center' if value in {'distributed', 'justify', 'bottom', 'top', 'center'}: if alg: alg.vertical = value else: alg = Alignment(vertical=value) # If this cell contains one formatting tag, then apply that tag to the entire contents of # the cell, since openpyxl doesn't support rich text. If that tag also contains one # formatting tag, then keep going, so <td><font size="5"><span style="font-family: monospace">...</span></font></td> will apply # both styles to the entire cell. c_col = col while c_col.contents and len(c_col.contents) == 1 and c_col.contents[0].name in \ ('big', 'small', 'center', 'div', 'span', 'a', 'b', 'i', 'u', 'em', 's', 'strike', 'code', 'pre', 'strong', 'font', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr'): cont = c_col.contents[0] tuples.append((cont.name, cont.attrs)) c_col = cont style = css_style.apply_style(tuples) font, fill, border, alignment, number_format = css_style.style_to_xlsx(style, fill=pf, border=bor, alignment=alg) font_size = font.sz if not font_size: font_size = 11 padding_left = (font_size / 10) # in px padding_right = padding_left padding_top = padding_left * 0.75 # in pt = 1px padding_bottom = padding_top if style: padding_left = CSSStyle.get_px(style.get('padding-left', '1px')) padding_right = CSSStyle.get_px(style.get('padding-right', '1px')) padding_top = CSSStyle.get_pt(style.get('padding-top', '1pt')) padding_bottom = CSSStyle.get_pt(style.get('padding-bottom', '2pt')) if 'padding' in style: # 4 in 1 paddings = style['padding'].split() if len(paddings) >= 4: padding_top = CSSStyle.get_pt(paddings[0]) padding_right = CSSStyle.get_px(paddings[1]) padding_bottom = CSSStyle.get_pt(paddings[2]) padding_left = CSSStyle.get_px(paddings[2]) elif len(paddings) == 3: padding_top = CSSStyle.get_pt(paddings[0]) padding_right = CSSStyle.get_px(paddings[1]) padding_left = padding_right padding_bottom = CSSStyle.get_pt(paddings[2]) elif len(paddings) == 2: padding_top = CSSStyle.get_pt(paddings[0]) padding_bottom = padding_top padding_right = CSSStyle.get_px(paddings[1]) padding_left = padding_right elif len(paddings) == 1: padding_top = CSSStyle.get_pt(paddings[0]) padding_bottom = padding_top padding_right = CSSStyle.get_px(paddings[0]) padding_left = padding_top padding_left = CSSStyle.px_to_units(padding_left) padding_right = CSSStyle.px_to_units(padding_right) font_name = font_utils.get_real_font_name(font.name) if font_name: font.name = font_name # Determine cell dimensions colspan = col.get("colspan") if colspan is None: col_dim.append(1) else: col_dim.append(int(colspan)) col_dim_counter += 1 rowspan = col.get("rowspan") if rowspan is None: row_dim.append(1) else: row_dim.append(int(rowspan)) row_dim_counter += 1 # Adjust column counter if col_counter == -1: col_counter = 0 else: col_counter = col_counter + col_dim[col_dim_counter - 1] while skip_index[col_counter] > 0: col_counter += 1 cc = col_counter+1 cl = get_column_letter(cc) if TRACE and style: # pragma nocover print(f'style={style} for {cl}{rw} with {col.get_text()}') print(f'font={font}, fill={fill}, border={border} ({border.left}, {border.top}, {border.right}, {border.bottom}), alignment={alignment}, number_format={number_format}') # Get cell contents #cell_data = col.get_text() # It's not quite this easy!! white_space = style.get('white-space', 'nowrap' if col.name=='th' else 'normal') cell_data = '' hyperlink = None comment_author = None comment_contents = None contents = list(col.contents) i = -1 while True: i += 1 if i >= len(contents): break cont = contents[i] if isinstance(cont, (CData, Comment, ProcessingInstruction, Declaration, Doctype)): continue elif isinstance(cont, NavigableString): if white_space == 'pre': cell_data += str(cont.string) else: cont_s = str(cont.string) cont_s = cont_s.replace('\r', '') cont_s = re.sub(r'\n[ \n\r\t\f\v]*', ' ', cont_s) # Don't match non-breaking space with \s if cell_data.endswith('\n') or cell_data.endswith(' '): cont_s = cont_s.lstrip() # Doesn't remove non-breaking spaces cell_data += cont_s.replace('\xA0', ' ') # Non-breaking space to space else: css = css_style.apply_style([(cont.name, cont.attrs)]) if css.get('display') == 'none': continue elif css.get('visibility') == 'hidden': text = cont.get_text() cell_data += ' ' * len(text) continue if cont.name == 'br': cell_data += '\n' elif cont.name == 'p': if cell_data: cell_data += '\n' elif cont.name == 'a': if isinstance(hyperlink, str): hyperlink = False # We can only handle 1 per cell elif hyperlink is None: hyperlink = cont.get('href') if hyperlink and hyperlink[0] == '#': hyperlink = None # Ignore relative references elif cont.name == 'span' and 'onmouseover' in cont.attrs: # msoCommentShow('_com_1','_anchor_1') m = re.match(r"msoCommentShow\('([^']+)'", cont.attrs['onmouseover']) try: id_ = m.group(1) comment_div = self.url_soup.find(id=id_).div.div comment_author = comment_div.contents[0].get_text() comment_contents = comment_div.contents[1].get_text() except Exception: pass elif cont.name == 'img': src = cont.get('src') if src: try: content = CSSStyle.read(CSSStyle.join(self.dirname, src), mode='b', retries=2) image = Image(io.BytesIO(content)) images.append(image) # If this img is in a nested table (e.g. from a chart), then don't update the heights and widths parent = cont.parent while parent: if parent.name == 'table': break elif parent.name == 'span' and \ 'position:absolute' in parent.attrs.get('style', ''): # If this is positioned, then it doesn't change the cell size parent = None break parent = parent.parent if parent == table: height = cont.get('height') if height: height = CSSStyle.get_pt(height) + padding_top + padding_bottom if rw in row_heights: row_heights[rw] = max(row_heights[rw], height) else: row_heights[rw] = height width = cont.get('width') if width: width = CSSStyle.px_to_units(CSSStyle.get_px(width)) col_widths[cl] = max(col_widths.get(cl, 0), width+padding_left+padding_right) col_widths_no_wrap[cl] = max(col_widths_no_wrap.get(cl, 0), width+padding_left+padding_right) except Exception as e: if TRACE: print(f'Exception getting image for {src}: {e}') # Depth first traversal: insert all sub-nodes in the list after this one contents = contents[0:i+1] + list(cont.contents) + contents[i+1:] # Insert data into cell if TRACE: print(f'ws.cell({rw}, {cc}).value = {type_it(cell_data, number_format)}') try: value = cell_data if number_format != '@' and number_format != 'Text': value = type_it(cell_data, number_format) ws.cell(rw, cc).value = value if number_format == 'General' and isinstance(value, (date, tm)): number_format = guess_date_format(value, cell_data) elif isinstance(cell_data, str): if '\n' in cell_data and not alignment.wrap_text: alignment = copy.deepcopy(alignment) alignment.wrap_text = True white_space = 'normal' if cell_data[-1:] == '%' and number_format == 'General': number_format = numbers.FORMAT_PERCENTAGE ws.cell(rw, cc).font = font ws.cell(rw, cc).fill = fill ws.cell(rw, cc).border = border ws.cell(rw, cc).alignment = alignment ws.cell(rw, cc).number_format = number_format if hyperlink: ws.cell(rw, cc).hyperlink = hyperlink #ws.cell(rw, cc).style = 'Hyperlink' # Added a default style instead if images: # Even though the cell may have multiple images in it, we only add one # to each cell, and the rest go in subsequent cells, handled later image = images.pop(0) # Pop off the first one image.anchor = f'{cl}{rw}' has_image.add(image.anchor) ws.add_image(image) width = image.width for image in images: image_data = (rw, cc, width, image) extra_images.append(image_data) # Save for later width += image.width images = [] if comment_contents and comment_author: ws.cell(rw, cc).comment = OpenpyxlComment(comment_author + comment_contents, comment_author[:-1]) # Remove the ':' from author except AttributeError as e: # pragma nocover if TRACE: # Shouldn't happen anymore, but don't crash if it does print(f'AttributeError {e} (ignored)') cd = col_dim[-1] rd = row_dim[-1] # mso-ignore now handled above #if style and 'colspan' in style.get('mso-ignore', ''): #cd = 1 #if style and 'rowspan' in style.get('mso-ignore', ''): #rd = 1 rx = rw if cd != 1 or rd != 1: has_merged_cell = True if TRACE: print(f'ws.merge_cells(start_row={rw}, start_column={cc}, end_row={row_counter+rd}, end_column={col_counter+cd})') end_row = row_counter+rd ws.merge_cells(start_row=rw, start_column=cc, end_row=end_row, end_column=col_counter+cd) if col_counter+cd >= global_n_cols: if cc == 1: fully_merged_rows.add(rw) row_counter += rd-1 if cd != 1: cl = f'{cl}:{get_column_letter(col_counter+cd)}' if rd != 1: rx = f'{rw}:{end_row}' height, min_height, max_height = get_heights(col, style) if height is not None: row_heights[rx] = height row_heights_per_column[rx] = {} row_heights_per_column_no_wrap[rx] = {} width = None min_width_px = None max_width_px = CSSStyle.units_to_px(CSSStyle.MAX_CELL_WIDTH_UNITS) if 'min-width' in style: min_width_px = CSSStyle.get_px(style['min-width']) if 'max-width' in style: max_width_px = CSSStyle.get_px(style['max-width']) if 'width' in style: width = CSSStyle.px_to_units(CSSStyle.get_px(style['width'])) elif 'min-width' in style: width = CSSStyle.px_to_units(CSSStyle.get_px(style['min-width'])) elif 'width' in col.attrs: w = col['width'] width = CSSStyle.px_to_units(CSSStyle.get_px(w)) # mso-ignore:colspan or mso-ignore:colspan-rowspan in a cell with contents # means it's OK for the contents to run over into the next column ignore_colspan = 'mso-ignore' in style and 'colspan' in style['mso-ignore'] if width is not None and not ignore_colspan: col_widths[cl] = max(col_widths.get(cl, 0), width) col_widths_no_wrap[cl] = max(col_widths_no_wrap.get(cl, 0), width) if col_widths.get(cl) != 0.0 and not ignore_colspan: # 0 means it was hidden, and that's usually set on the first <td> for that column # Grab the heights for later so we can decide to use either the wrapped height or the non-wrapped height = row_heights.get(rx, -1) if height != 0: # 0 means this row is hidden, so just skip it max_height_px = FontUtils.pt_to_px(max_height) wi, he = str_size(str(cell_data).strip(), font, alignment, max_width_px, max_height_px, fully_merged=rw in fully_merged_rows) row_heights_per_column[rx][cl] = max(height, FontUtils.px_to_pt(he)+padding_top+padding_bottom) wiu = CSSStyle.px_to_units(wi) col_widths[cl] = max(col_widths.get(cl, 0), wiu+padding_left+padding_right) if white_space == 'nowrap': col_widths_no_wrap[cl] = col_widths[cl] row_heights_per_column_no_wrap[rx][cl] = row_heights_per_column[rx].get(cl, 0) else: al = copy.deepcopy(alignment) al.wrap_text = False wi, he = str_size(str(cell_data).strip(), font, al, max_width_px, max_height_px, keep_newlines=True) row_heights_per_column_no_wrap[rx][cl] = max(height, FontUtils.px_to_pt(he)+padding_top+padding_bottom) wiu = CSSStyle.px_to_units(wi) col_widths_no_wrap[cl] = max(col_widths_no_wrap.get(cl, 0), wiu+padding_left+padding_right) if max_width_px and cl in col_widths: max_width = CSSStyle.px_to_units(max_width_px) col_widths[cl] = min(col_widths.get(cl), max_width) col_widths_no_wrap[cl] = min(col_widths_no_wrap.get(cl), max_width) if cl not in col_max_widths: col_max_widths[cl] = max_width # Record column skipping index if row_dim[row_dim_counter] > 1: rd = row_dim[row_dim_counter] for i in range(cd): if TRACE: print(f'this_skip_index[{col_counter+i}] = row_dim[{row_dim_counter}] = {rd}') this_skip_index[col_counter+i] = row_dim[row_dim_counter] # Adjust row counter row_counter += 1 # Adjust column skipping index skip_index = [i - 1 if i > 0 else i for i in this_skip_index] if TRACE: print(f'skip_index = {skip_index}') def handle_merged_column_widths(col_widths): for col, wid in list(col_widths.items()): cs = col.split(':') if len(cs) != 2: continue c1 = column_index_from_string(cs[0]) c2 = column_index_from_string(cs[1]) # We have 3 cases for the columns covered by this range: # (1) None of the column widths are specified. In this case divide up this width evenly amongst the columns. # (2) Some of the column widths are specified. In this case, divide up the remaining width evenly amongst the remaining columns. # (3) All of the column widths are specified. In this case, compute the ratio between the total widths of # the columns covered, and use that to bump up (not down) each width appropriately. no_width_cols = set() total_width = 0 for c in range(c1, c2+1): l = get_column_letter(c) if l in col_widths: total_width += col_widths[l] else: no_width_cols.add(c) remaining_width = wid - total_width if remaining_width > 0: ncols = len(no_width_cols) if ncols: # Case 1 or Case 2 width_per = remaining_width / ncols for c in no_width_cols: col_widths[get_column_letter(c)] = width_per elif total_width != 0: # Case 3 ratio = wid / total_width for c in range(c1, c2+1): col_widths[get_column_letter(c)] *= ratio del col_widths[col] def handle_merged_row_heights(row_heights): for row, hei in list(row_heights.items()): if isinstance(hei, dict): # This is a row_heights_per_column # If we have a height specified for a set of merged columns, then just apply it to each column separately for cc, h in list(hei.items()): cs = cc.split(':') if len(cs) == 2: c1 = column_index_from_string(cs[0]) c2 = column_index_from_string(cs[1]) for c in range(c1, c2+1): l = get_column_letter(c) if l in hei: hei[l] = max(hei[l], h) else: hei[l] = h del hei[cc] # At this point, each hei item is for a single column if isinstance(row, str): for cc, h in hei.items(): rh = {} for r, d in row_heights.items(): if cc in d: rh[r] = d[cc] handle_merged_row_heights(rh) for r in rh: if r not in row_heights: row_heights[r] = {} row_heights[r][cc] = rh[r] del row_heights[row] continue if not isinstance(row, str): continue rs = row.split(':') if len(rs) != 2: continue r1 = int(rs[0]) r2 = int(rs[1]) # We have 3 cases for the rows covered by this range: # (1) None of the row heights are specified. In this case divide up this height evenly amongst the rows. # (2) Some of the row heights are specified. In this case, divide up the remaining height evenly amongst the remaining rows. # (3) All of the row heights are specified. In this case, compute the ratio between the total height of # the rows covered, and use that to bump up (not down) each height appropriately. no_height_rows = set() total_height = 0 for r in range(r1, r2+1): if r in row_heights: total_height += row_heights[r] else: no_height_rows.add(r) remaining_height = hei - total_height if remaining_height > 0: nrows = len(no_height_rows) if nrows: # Case 1 or Case 2 height_per = remaining_height / nrows for r in no_height_rows: row_heights[r] = height_per elif total_height != 0: # Case 3 ratio = hei / total_height for r in range(r1, r2+1): row_heights[r] *= ratio del row_heights[row] if has_merged_cell: if TRACE: print("Values before handling merged cells:") print(f'col_widths = {col_widths}') print(f'col_widths_no_wrap = {col_widths_no_wrap}') print(f'row_heights = {row_heights}') print(f'row_heights_per_column = {row_heights_per_column}') print(f'row_heights_per_column_no_wrap = {row_heights_per_column_no_wrap}') handle_merged_column_widths(col_widths) handle_merged_column_widths(col_widths_no_wrap) handle_merged_row_heights(row_heights) handle_merged_row_heights(row_heights_per_column) handle_merged_row_heights(row_heights_per_column_no_wrap) if TRACE: print("Values after handling merged cells:") # Try not to word-wrap as many columns as possible, lopping them off at the end if they are too wide width = 0 for wid in col_widths_no_wrap.values(): width += wid if TRACE: print(f'col_widths = {col_widths}') print(f'col_widths_no_wrap = {col_widths_no_wrap}') print(f'row_heights = {row_heights}') print(f'row_heights_per_column = {row_heights_per_column}') print(f'row_heights_per_column_no_wrap = {row_heights_per_column_no_wrap}') print(f'fully_merged_rows = {fully_merged_rows}') # FIXME: if the height of a cell is greater than the MAX_CELL_HEIGHT_PT, then try # widening that cell in order to fix the text into it better. Check on the latin # text in test case "width.xls". max_spreadsheet_width = CSSStyle.px_to_units(CSSStyle.SPREADSHEET_WIDTH_PX) while width > max_spreadsheet_width: for col in reversed(list(col_widths_no_wrap)): wid = col_widths_no_wrap[col] if wid > col_widths[col]: width -= (wid - col_widths[col]) if TRACE: print(f'Sheet too wide: Removing no_wrap from col {col}') col_widths_no_wrap[col] = col_widths[col] for row in row_heights_per_column_no_wrap: if row in fully_merged_rows: continue if col in row_heights_per_column[row]: row_heights_per_column_no_wrap[row][col] = row_heights_per_column[row][col] break else: break for col, wid in col_widths_no_wrap.items(): wid = CSSStyle.fixup_excel_width(wid) if TRACE: print(f'ws.column_dimensions[{col}].width = {wid}') if col in col_max_widths: max_width = CSSStyle.fixup_excel_width(col_max_widths[col]) if wid > max_width: wid = max_width if wid == 0.0: ws.column_dimensions[col].width = 0.0 ws.column_dimensions[col].hidden = True else: ws.column_dimensions[col].width = wid for row, cols in row_heights_per_column_no_wrap.items(): height = 0 if row in fully_merged_rows: if TRACE: print(f'Using wrapped height for fully_merged row {row}') cols = row_heights_per_column[row] # Wrap it if it's super-wide for h in cols.values(): height = max(height, h) if height != 0: row_heights[row] = max(row_heights.get(row, 0), height) for row, height in row_heights.items(): if TRACE: print(f'ws.row_dimensions[{row}].height = {height}') if height == 0.0: ws.row_dimensions[row].height = 0.0 ws.row_dimensions[row].hidden = True else: ws.row_dimensions[row].height = min(CSSStyle.MAX_CELL_HEIGHT_PT, max(CSSStyle.MIN_CELL_HEIGHT_PT, height)) # If we had multiple images per cell, place them now that we know the column widths for image_data in extra_images: row, col, wid, img = image_data offset = 0 cl = get_column_letter(col) anchor = f'{cl}{row}' for c in range(col, global_n_cols+1): cl = get_column_letter(c) w = CSSStyle.units_to_px(col_widths_no_wrap.get(cl, CSSStyle.px_to_units(CSSStyle.DEFAULT_CELL_WIDTH_PX))) offset += w anchor = f'{cl}{row}' if anchor not in has_image and offset >= wid: break img.anchor = anchor ws.add_image(img) if filename: wb.save(filename=filename) return filename return wb