From 17a9b29bde1741b50f39e8e469798a7474eac919 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Sat, 22 Jun 2024 14:54:45 -0400 Subject: [PATCH] Use += with lists to avoid unnecessary temporary list creation This also makes long lines easier to read and measure using a line-based profiler. --- se/commands/build_ids.py | 2 +- se/formatting.py | 2 +- se/se_epub.py | 14 +++++----- se/se_epub_lint.py | 60 +++++++++++++++++++++------------------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/se/commands/build_ids.py b/se/commands/build_ids.py index 2726e92d..d8a418bf 100644 --- a/se/commands/build_ids.py +++ b/se/commands/build_ids.py @@ -69,7 +69,7 @@ def build_ids(plain_output: bool) -> int: id_counter = id_counter + 1 # Now, get a list of what we expect all eligible IDs to be. - replacements = replacements + se.formatting.find_unexpected_ids(dom) + replacements += se.formatting.find_unexpected_ids(dom) # Write our wiped file, we'll update it later with open(filename, "w", encoding="utf-8") as file: diff --git a/se/formatting.py b/se/formatting.py index 06b8888f..b53fa700 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -1533,7 +1533,7 @@ def _get_flattened_children(node: EasyXmlElement, allow_header: bool) -> List[Ea if child.tag not in sectioning_elements and not is_endnote and not is_glossdef: result.append(child) - result = result + _get_flattened_children(child, allow_header) + result += _get_flattened_children(child, allow_header) return result diff --git a/se/se_epub.py b/se/se_epub.py index a34780a3..4390a3cf 100644 --- a/se/se_epub.py +++ b/se/se_epub.py @@ -553,7 +553,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None for filepath in css_filenames: file_css = self.get_file(filepath) - namespaces = namespaces + regex.findall(r"@namespace.+?;", file_css) + namespaces += regex.findall(r"@namespace.+?;", file_css) file_css = regex.sub(r"\s*@(charset|namespace).+?;\s*", "\n", file_css).strip() @@ -1248,15 +1248,15 @@ def generate_spine(self) -> se.easy_xml.EasyXmlElement: halftitlepage, frontmatter = self.__add_to_spine([], frontmatter, "halftitlepage") # Add any remaining frontmatter - spine = spine + natsorted([file_path.name for file_path in frontmatter]) + spine += natsorted([file_path.name for file_path in frontmatter]) # The half title page is always the last front matter - spine = spine + halftitlepage + spine += halftitlepage # Add bodymatter spine, bodymatter = self.__add_to_spine(spine, bodymatter, "prologue") - spine = spine + natsorted([file_path.name for file_path in bodymatter]) + spine += natsorted([file_path.name for file_path in bodymatter]) # Add backmatter spine, backmatter = self.__add_to_spine(spine, backmatter, "afterword") @@ -1270,11 +1270,11 @@ def generate_spine(self) -> se.easy_xml.EasyXmlElement: copyright_page, backmatter = self.__add_to_spine([], backmatter, "copyright-page") # Add any remaining backmatter - spine = spine + natsorted([file_path.name for file_path in backmatter]) + spine += natsorted([file_path.name for file_path in backmatter]) # Colophon and copyright page are always last - spine = spine + colophon - spine = spine + copyright_page + spine += colophon + spine += copyright_page # Now build the spine output spine_xml = "\n" diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 22309d4e..53a924c9 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -693,7 +693,8 @@ def _lint_metadata_checks(self) -> list: messages.append(LintMessage("m-015", f"Metadata long description is not valid XHTML. LXML says: {ex}", se.MESSAGE_TYPE_ERROR, self.metadata_file_path)) # Check for apostrophes outside links in long description - matches = regex.findall(r"’s", long_description) + regex.findall(r"s’", long_description) + matches = regex.findall(r"’s", long_description) + matches += regex.findall(r"s’", long_description) if matches: messages.append(LintMessage("m-044", "Possessive [text]’[/] or [text]’s[/] outside of [xhtml][/] element in long description.", se.MESSAGE_TYPE_ERROR, self.metadata_file_path, matches)) @@ -897,7 +898,7 @@ def _lint_metadata_checks(self) -> list: # Check for common typos in description for node in self.metadata_dom.xpath("/package/metadata/dc:description") + self.metadata_dom.xpath("/package/metadata/meta[@property='se:long-description']"): matches = regex.findall(r"(?[/].", se.MESSAGE_TYPE_ERROR, filename, nodes)) @@ -2284,10 +2285,10 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, matches = [match for match in matches if "" not in match and "" not in match] # xpath to check for opening quote in p, without a next child p that starts with an opening quote or an opening bracket (for editorial insertions within paragraphs of quotation); or that consists of only an ellipses (like an elided part of a longer quotation) # Matching

s can't have a poem/verse ancestor as formatting is often special for those. - matches = matches + [regex.findall(r"“[^”]+

", node.to_string())[0] for node in dom.xpath("/html/body//p[re:test(., '“[^‘”]+$')][not(ancestor::*[re:test(@epub:type, 'z3998:(verse|poem|song|hymn|lyrics)')])][(following-sibling::*[1])[name()='p'][not(re:test(normalize-space(.), '^[“\\[]') or re:test(normalize-space(.), '^…$'))]]")] + matches += [regex.findall(r"“[^”]+

", node.to_string())[0] for node in dom.xpath("/html/body//p[re:test(., '“[^‘”]+$')][not(ancestor::*[re:test(@epub:type, 'z3998:(verse|poem|song|hymn|lyrics)')])][(following-sibling::*[1])[name()='p'][not(re:test(normalize-space(.), '^[“\\[]') or re:test(normalize-space(.), '^…$'))]]")] # Additionally, match short

tags (< 100 chars) that lack closing quote, and whose direct siblings do have closing quotes (to exclude runs of same-speaker dialog), and that is not within a blockquote, verse, or letter - matches = matches + [regex.findall(r"“[^”]+

", node.to_string())[0] for node in dom.xpath("/html/body//p[re:test(., '“[^‘”]+$') and not(re:test(., '[…:]$')) and string-length(normalize-space(.)) <=100][(following-sibling::*[1])[not(re:test(., '“[^”]+$'))] and (preceding-sibling::*[1])[not(re:test(., '“[^”]+$'))]][not(ancestor::*[re:test(@epub:type, 'z3998:(verse|poem|song|hymn|lyrics)')]) and not(ancestor::blockquote) and not (ancestor::*[contains(@epub:type, 'z3998:letter')])][(following-sibling::*[1])[name()='p'][re:test(normalize-space(.), '^[“\\[]') and not(contains(., 'continued'))]]")] + matches += [regex.findall(r"“[^”]+

", node.to_string())[0] for node in dom.xpath("/html/body//p[re:test(., '“[^‘”]+$') and not(re:test(., '[…:]$')) and string-length(normalize-space(.)) <=100][(following-sibling::*[1])[not(re:test(., '“[^”]+$'))] and (preceding-sibling::*[1])[not(re:test(., '“[^”]+$'))]][not(ancestor::*[re:test(@epub:type, 'z3998:(verse|poem|song|hymn|lyrics)')]) and not(ancestor::blockquote) and not (ancestor::*[contains(@epub:type, 'z3998:letter')])][(following-sibling::*[1])[name()='p'][re:test(normalize-space(.), '^[“\\[]') and not(contains(., 'continued'))]]")] if matches: messages.append(LintMessage("t-003", "[text]“[/] missing matching [text]”[/]. Note: When dialog from the same speaker spans multiple [xhtml]

[/] elements, it’s correct grammar to omit closing [text]”[/] until the last [xhtml]

[/] of dialog.", se.MESSAGE_TYPE_WARNING, filename, matches)) @@ -2313,7 +2314,9 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, # Check for repeated punctuation, but first remove `&` so we don't match `&,` # Remove tds with repeated ” as they are probably ditto marks - matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)?", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents) + matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + matches += regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)?", "", file_contents)) + matches += regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents) if matches: messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches)) @@ -2385,12 +2388,12 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, matches = [node.to_string() for node in dom.xpath("(//b | //i)[contains(@epub:type, 'se:name') and not(contains(@epub:type, 'z3998:stage-direction'))][(text()[last()])[re:test(., '[\\.,!\\?;:]$')]]")] # Match b or i elements that are not stage directions, and that end in a comma followed by a lowercase letter - matches = matches + [node.to_string() for node in dom.xpath("(//b | //i)[not(contains(@epub:type, 'z3998:stage-direction'))][(text()[last()])[re:test(., ',$')] and following-sibling::node()[re:test(., '^\\s*[a-z]')] ]")] + matches += [node.to_string() for node in dom.xpath("(//b | //i)[not(contains(@epub:type, 'z3998:stage-direction'))][(text()[last()])[re:test(., ',$')] and following-sibling::node()[re:test(., '^\\s*[a-z]')] ]")] # ...and also check for ending punctuation inside em tags, if it looks like a *part* of a clause # instead of a whole clause. If the is preceded by an em dash or quotes, or if there's punctuation # and a space before it, then it's presumed to be a whole clause. - matches = matches + [match.strip() for match in regex.findall(r"(?]|[!\.\?…;:]\s)(?:\w+?\s*)+[\.,\!\?;]", file_contents) if match.islower()] + matches += [match.strip() for match in regex.findall(r"(?]|[!\.\?…;:]\s)(?:\w+?\s*)+[\.,\!\?;]", file_contents) if match.islower()] if matches: messages.append(LintMessage("t-017", "Ending punctuation inside formatting like bold, small caps, or italics. Ending punctuation is only allowed within formatting if the phrase is an independent clause.", se.MESSAGE_TYPE_WARNING, filename, list(set(matches)))) @@ -2404,7 +2407,8 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, # Outer wrapping match is so that .findall returns the entire match and not the subgroup # The first regex also matches the first few characters before the first double quote; we use those for more sophisticated # checks below, to give fewer false positives like `with its downy red hairs and its “doigts de faune.”` - matches = regex.findall(r"((?:.{1,2}\s)?“<(i|em)[^>]*?>[^<]+?[\!\?\.])", file_contents) + regex.findall(r"([\.\!\?] <(i|em)[^>]*?>[^<]+?[\!\?\.])", file_contents) + matches = regex.findall(r"((?:.{1,2}\s)?“<(i|em)[^>]*?>[^<]+?[\!\?\.])", file_contents) + matches += regex.findall(r"([\.\!\?] <(i|em)[^>]*?>[^<]+?[\!\?\.])", file_contents) # But, if we've matched a name of something, don't include that as an error. For example, `He said, “The Decameron.”` # We also exclude the match from the list if: @@ -2496,7 +2500,8 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, messages.append(LintMessage("s-004", "[xhtml]img[/] element missing [attr]alt[/] attribute.", se.MESSAGE_TYPE_ERROR, filename, img_no_alt)) # Check for low-hanging misquoted fruit - matches = regex.findall(r"[\p{Letter}]+[“‘]", file_contents) + regex.findall(r"[^>]+‘[\p{Lowercase_Letter}]+", file_contents) + matches = regex.findall(r"[\p{Letter}]+[“‘]", file_contents) + matches += regex.findall(r"[^>]+‘[\p{Lowercase_Letter}]+", file_contents) if matches: messages.append(LintMessage("t-028", "Possible mis-curled quotation mark.", se.MESSAGE_TYPE_WARNING, filename, matches)) @@ -2884,7 +2889,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c if special_file != "titlepage": # Don't check the titlepage because it has a standard format and may raise false positives typos = regex.findall(r"(?= 2 ebook_flags["has_other_sources"] = other_source_count > 0 - messages = messages + _lint_metadata_checks(self) + messages += _lint_metadata_checks(self) # Check for double spacing (done here so double_spaced_files doesn't have to be passed to function) if self.metadata_dom.xpath(f"/package/metadata/*[re:test(., '[{se.NO_BREAK_SPACE}{se.HAIR_SPACE} ]{{2,}}')]"): double_spaced_files.append(self.metadata_file_path) # Check for malformed URLs - messages = messages + _get_malformed_urls(self.metadata_dom, self.metadata_file_path) + messages += _get_malformed_urls(self.metadata_dom, self.metadata_file_path) # Make sure some static files are unchanged if self.is_se_ebook: @@ -3472,7 +3477,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if filename.suffix in BINARY_EXTENSIONS or filename.name == "core.css": if filename.suffix in (".jpg", ".jpeg", ".tif", ".tiff", ".png"): - messages = messages + _lint_image_checks(self, filename) + messages += _lint_image_checks(self, filename) continue # Read the file and start doing some serious checks! @@ -3495,7 +3500,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if filename.suffix == ".svg": svg_dom = self.get_dom(filename) - messages = messages + _lint_svg_checks(self, filename, file_contents, svg_dom, root) + messages += _lint_svg_checks(self, filename, file_contents, svg_dom, root) if self.cover_path and filename.name == self.cover_path.name: # For later comparison with titlepage cover_svg_title = svg_dom.xpath("/svg/title/text()", True).replace("The cover for ", "") # can appear on any element in SVG, but we only want to check the root one @@ -3537,10 +3542,10 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N css_filename = (filename.parent / node.get_attr("href")).resolve() dom.apply_css(self.get_file(css_filename), str(css_filename)) - messages = messages + _get_malformed_urls(dom, filename) + messages += _get_malformed_urls(dom, filename) # Extract ID attributes for later checks - id_attrs = id_attrs + dom.xpath("//*[name() != 'section' and name() != 'article' and name() != 'figure' and name() != 'nav']/@id") + id_attrs += dom.xpath("//*[name() != 'section' and name() != 'article' and name() != 'figure' and name() != 'nav']/@id") # Add to the short story count for later checks short_story_count += len(dom.xpath("/html/body//article[contains(@epub:type, 'se:short-story')]")) @@ -3654,23 +3659,22 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N special_file = None if special_file in SPECIAL_FILES: - messages = messages + _lint_special_file_checks(self, filename, dom, file_contents, ebook_flags, special_file) + messages += _lint_special_file_checks(self, filename, dom, file_contents, ebook_flags, special_file) - missing_styles = missing_styles + _update_missing_styles(filename, dom, local_css) + missing_styles += _update_missing_styles(filename, dom, local_css) - messages = messages + _lint_xhtml_css_checks(filename, dom, local_css_path) + messages += _lint_xhtml_css_checks(filename, dom, local_css_path) - messages = messages + _lint_xhtml_metadata_checks(self, filename, dom) + messages += _lint_xhtml_metadata_checks(self, filename, dom) - messages = messages + _lint_xhtml_syntax_checks(self, filename, dom, file_contents, ebook_flags, language, section_tree) + messages += _lint_xhtml_syntax_checks(self, filename, dom, file_contents, ebook_flags, language, section_tree) (typography_messages, missing_files) = _lint_xhtml_typography_checks(filename, dom, file_contents, special_file, ebook_flags, missing_files, self) - if typography_messages: - messages = messages + typography_messages + messages += typography_messages - messages = messages + _lint_xhtml_xhtml_checks(filename, dom, file_contents, local_css_path) + messages += _lint_xhtml_xhtml_checks(filename, dom, file_contents, local_css_path) - messages = messages + _lint_xhtml_typo_checks(filename, dom, file_contents, special_file) + messages += _lint_xhtml_typo_checks(filename, dom, file_contents, special_file) if self.cover_path and cover_svg_title != titlepage_svg_title: messages.append(LintMessage("s-028", f"[path][link=file://{self.cover_path}]{self.cover_path.name}[/][/] and [path][link=file://{self.path / 'images/titlepage.svg'}]titlepage.svg[/][/] [xhtml]<title>[/] elements don’t match.", se.MESSAGE_TYPE_ERROR, self.cover_path)) @@ -3810,7 +3814,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if f"[epub|type~=\"{value}\"]" not in self.local_css: missing_styles.append(element.to_tag_string()) - messages = messages + _lint_image_metadata_checks(self, ebook_flags["has_images"]) + messages += _lint_image_metadata_checks(self, ebook_flags["has_images"]) if missing_styles: messages.append(LintMessage("c-006", f"Semantic found, but missing corresponding style in [path][link=file://{local_css_path}]local.css[/][/].", se.MESSAGE_TYPE_ERROR, local_css_path, sorted(set(missing_styles))))