From 335292f5c5155c0c95913f34020fd42d747f7963 Mon Sep 17 00:00:00 2001 From: Alberto Pettarin Date: Sun, 29 Nov 2015 13:11:04 +0100 Subject: [PATCH] Fixed bug outputting to Kobo. New options. --- README.md | 52 +-- README.rst | 53 +-- VERSION | 2 +- bin/penelope | 2 +- penelope/__init__.py | 2 +- penelope/__main__.py | 2 +- penelope/collation_default.py | 8 +- penelope/collation_german.py | 19 +- penelope/command_line.py | 83 +++-- penelope/dictionary.py | 94 +++++- penelope/dictionary_ebook.py | 523 ++++++++++++++++++++++++++++++ penelope/format_bookeen.py | 45 +-- penelope/format_csv.py | 2 +- penelope/format_epub.py | 398 +++-------------------- penelope/format_kobo.py | 83 ++--- penelope/format_mobi.py | 163 +++------- penelope/format_stardict.py | 47 +-- penelope/format_xml.py | 2 +- penelope/input_parser_identity.py | 2 +- penelope/input_parser_webster.py | 2 +- penelope/prefix_default.py | 35 ++ penelope/prefix_kobo.py | 60 ++++ penelope/utilities.py | 6 +- setup.py | 4 +- 24 files changed, 1037 insertions(+), 652 deletions(-) create mode 100644 penelope/dictionary_ebook.py create mode 100644 penelope/prefix_default.py create mode 100644 penelope/prefix_kobo.py diff --git a/README.md b/README.md index ee1b0ae..e70b13a 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ **Penelope** is a multi-tool for creating, editing and converting dictionaries, especially for eReader devices. -* Version: 3.0.1 -* Date: 2015-11-22 +* Version: 3.1.0 +* Date: 2015-11-29 * Developer: [Alberto Pettarin](http://www.albertopettarin.it/) * License: the MIT License (MIT) * Contact: [click here](http://www.albertopettarin.it/contact.html) @@ -78,7 +78,7 @@ You might need to install `dictzip` (StarDict output) and `kindlegen` (MOBI out $ python -m penelope ``` -This procedure will not any dependencies, see below. +This procedure will not install any dependencies: you will need to do that manually, see below. 
### Dependencies @@ -97,9 +97,9 @@ This procedure will not any dependencies, see below. $ [sudo] pip install marisa-trie ``` - or [`MARISA`](https://code.google.com/p/marisa-trie/) executables available in your `$PATH` or specified with `--marisa-bin-path` + or [MARISA](https://code.google.com/p/marisa-trie/) executables available in your `$PATH` or specified with `--marisa-bin-path` -* to write MOBI Kindle dictionaries: the [`kindlegen`](https://www.amazon.com/gp/feature.html?docId=1000765211) executable, available in your `$PATH` or specified with `--kindlegen-path` +* to write MOBI Kindle dictionaries: the [kindlegen](https://www.amazon.com/gp/feature.html?docId=1000765211) executable, available in your `$PATH` or specified with `--kindlegen-path` * to read/write XML dictionaries: the Python module `lxml`: @@ -154,6 +154,8 @@ optional arguments: --title TITLE title string --website WEBSITE website string --year YEAR year string + --apply-css APPLY_CSS + apply the given CSS file (epub and mobi output only) --bookeen-collation-function BOOKEEN_COLLATION_FUNCTION use the specified collation function --bookeen-install-file @@ -165,29 +167,36 @@ optional arguments: --csv-ls CSV_LS CSV line separator (default: '\n') --dictzip-path DICTZIP_PATH path to dictzip executable - --epub-escape-strings - escape HTML strings (default: False) - --epub-group-prefix-length EPUB_GROUP_PREFIX_LENGTH - group headwords by prefix of given length (default: 3) - --epub-merge-group-size EPUB_MERGE_GROUP_SIZE - merge headword groups with less than this number of - headwords (default: 128) - --epub-output-definitions - output definitions in addition to the headwords - (default: False) + --epub-no-compress do not create the compressed container (epub output + only, default: False) + --escape-strings escape HTML strings (default: False) --flatten-synonyms flatten synonyms, creating a new entry with headword=synonym and using the definition of the original headword (default: False) + 
--group-by-prefix-function GROUP_BY_PREFIX_FUNCTION + compute the prefix of headwords using the given prefix + function file + --group-by-prefix-length GROUP_BY_PREFIX_LENGTH + group headwords by prefix of given length (default: 2) + --group-by-prefix-merge-across-first + merge headword groups even when the first character + changes (default: False) + --group-by-prefix-merge-min-size GROUP_BY_PREFIX_MERGE_MIN_SIZE + merge headword groups until the given minimum number + of headwords is reached (default: 0, meaning no merge + will take place) + --ignore-case ignore headword case, all headwords will be lowercased + (default: False) + --ignore-synonyms ignore synonyms, not reading/writing them if present + (default: False) + --include-index-page include an index page (epub and mobi output only, + default: False) --input-file-encoding INPUT_FILE_ENCODING use the specified encoding for reading the raw contents of input file(s) (default: 'utf-8') --input-parser INPUT_PARSER use the specified parser function after reading the raw contents of input file(s) - --ignore-case ignore headword case, all headwords will be lowercased - (default: False) - --ignore-synonyms ignore synonyms, not reading/writing them if present - (default: False) --kindlegen-path KINDLEGEN_PATH path to kindlegen executable --marisa-bin-path MARISA_BIN_PATH @@ -201,6 +210,8 @@ optional arguments: | ') --mobi-no-kindlegen do not run kindlegen, keep .opf and .html files (default: False) + --no-definitions do not output definitions for EPUB and MOBI formats + (default: False) --sd-ignore-sametypesequence ignore the value of sametypesequence in StarDict .ifo files (default: False) @@ -253,7 +264,6 @@ examples: $ penelope -i dict.xml -j xml -f en -t it -p mobi -o output.epub --epub-output-definitions As above, but also output definitions - ``` You can find ISO 639-1 language codes [here](http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). @@ -338,6 +348,8 @@ were released under the GNU GPL 3 License. 
* Reading EPUB (3) dictionaries is not supported; the writing part needs polishing/refactoring * Reading PRC/MOBI (Kindle) dictionaries is not supported * There are some limitations on StarDict files that can be read (see comments in `format_stardict.py`) +* Documentation is not complete +* Unit tests are missing ## Acknowledgments diff --git a/README.rst b/README.rst index 1217bb5..0e733d7 100644 --- a/README.rst +++ b/README.rst @@ -4,8 +4,8 @@ Penelope **Penelope** is a multi-tool for creating, editing and converting dictionaries, especially for eReader devices. -- Version: 3.0.1 -- Date: 2015-11-22 +- Version: 3.1.0 +- Date: 2015-11-29 - Developer: `Alberto Pettarin `__ - License: the MIT License (MIT) - Contact: `click here `__ @@ -96,7 +96,8 @@ From source code $ python -m penelope -This procedure will not any dependencies, see below. +This procedure will not install any dependencies: you will need to do +that manually, see below. Dependencies ~~~~~~~~~~~~ @@ -116,11 +117,11 @@ Dependencies $ [sudo] pip install marisa-trie -or ```MARISA`` `__ executables +or `MARISA `__ executables available in your ``$PATH`` or specified with ``--marisa-bin-path`` - to write MOBI Kindle dictionaries: the - ```kindlegen`` `__ + `kindlegen `__ executable, available in your ``$PATH`` or specified with ``--kindlegen-path`` @@ -178,6 +179,8 @@ Usage --title TITLE title string --website WEBSITE website string --year YEAR year string + --apply-css APPLY_CSS + apply the given CSS file (epub and mobi output only) --bookeen-collation-function BOOKEEN_COLLATION_FUNCTION use the specified collation function --bookeen-install-file @@ -189,29 +192,36 @@ Usage --csv-ls CSV_LS CSV line separator (default: '\n') --dictzip-path DICTZIP_PATH path to dictzip executable - --epub-escape-strings - escape HTML strings (default: False) - --epub-group-prefix-length EPUB_GROUP_PREFIX_LENGTH - group headwords by prefix of given length (default: 3) - --epub-merge-group-size EPUB_MERGE_GROUP_SIZE - merge 
headword groups with less than this number of - headwords (default: 128) - --epub-output-definitions - output definitions in addition to the headwords - (default: False) + --epub-no-compress do not create the compressed container (epub output + only, default: False) + --escape-strings escape HTML strings (default: False) --flatten-synonyms flatten synonyms, creating a new entry with headword=synonym and using the definition of the original headword (default: False) + --group-by-prefix-function GROUP_BY_PREFIX_FUNCTION + compute the prefix of headwords using the given prefix + function file + --group-by-prefix-length GROUP_BY_PREFIX_LENGTH + group headwords by prefix of given length (default: 2) + --group-by-prefix-merge-across-first + merge headword groups even when the first character + changes (default: False) + --group-by-prefix-merge-min-size GROUP_BY_PREFIX_MERGE_MIN_SIZE + merge headword groups until the given minimum number + of headwords is reached (default: 0, meaning no merge + will take place) + --ignore-case ignore headword case, all headwords will be lowercased + (default: False) + --ignore-synonyms ignore synonyms, not reading/writing them if present + (default: False) + --include-index-page include an index page (epub and mobi output only, + default: False) --input-file-encoding INPUT_FILE_ENCODING use the specified encoding for reading the raw contents of input file(s) (default: 'utf-8') --input-parser INPUT_PARSER use the specified parser function after reading the raw contents of input file(s) - --ignore-case ignore headword case, all headwords will be lowercased - (default: False) - --ignore-synonyms ignore synonyms, not reading/writing them if present - (default: False) --kindlegen-path KINDLEGEN_PATH path to kindlegen executable --marisa-bin-path MARISA_BIN_PATH @@ -225,6 +235,8 @@ Usage | ') --mobi-no-kindlegen do not run kindlegen, keep .opf and .html files (default: False) + --no-definitions do not output definitions for EPUB and MOBI 
formats + (default: False) --sd-ignore-sametypesequence ignore the value of sametypesequence in StarDict .ifo files (default: False) @@ -277,7 +289,6 @@ Usage $ penelope -i dict.xml -j xml -f en -t it -p mobi -o output.epub --epub-output-definitions As above, but also output definitions - You can find ISO 639-1 language codes `here `__. @@ -384,6 +395,8 @@ Limitations and Missing Features - Reading PRC/MOBI (Kindle) dictionaries is not supported - There are some limitations on StarDict files that can be read (see comments in ``format_stardict.py``) +- Documentation is not complete +- Unit tests are missing Acknowledgments --------------- diff --git a/VERSION b/VERSION index cb2b00e..fd2a018 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.1 +3.1.0 diff --git a/bin/penelope b/bin/penelope index b810d23..7421518 100755 --- a/bin/penelope +++ b/bin/penelope @@ -14,7 +14,7 @@ from penelope import main as package_main __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/__init__.py b/penelope/__init__.py index e54bbe4..d4a9326 100644 --- a/penelope/__init__.py +++ b/penelope/__init__.py @@ -32,7 +32,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/__main__.py b/penelope/__main__.py index bbda8b9..596e443 100644 --- a/penelope/__main__.py +++ b/penelope/__main__.py @@ -31,7 +31,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git 
a/penelope/collation_default.py b/penelope/collation_default.py index a5d7059..ae4b587 100644 --- a/penelope/collation_default.py +++ b/penelope/collation_default.py @@ -2,19 +2,21 @@ # -*- coding: utf-8 -*- """ -This is the default collation function (IcuNoCase) for bookeen output format. +This is the default collation function (IcuNoCase). """ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" def collate_function(string1, string2): """ - Implement IcuNoCase collation. + Implement default IcuNoCase collation, + by simply lowercasing the UTF-8 encoded versions + of the two strings. :param string1: first string :type string1: unicode diff --git a/penelope/collation_german.py b/penelope/collation_german.py index 3c5e2c5..d647f7b 100644 --- a/penelope/collation_german.py +++ b/penelope/collation_german.py @@ -2,19 +2,27 @@ # -*- coding: utf-8 -*- """ -This is a sample collation function (IcuNoCase) for German. +This is a collation function (IcuNoCase) for German. """ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" +REPLACEMENTS = [ + [u"ä", u"a"], + [u"ö", u"o"], + [u"ü", u"u"], + [u"ß", u"ss"] +] + def collate_function(string1, string2): """ Implement IcuNoCase collation for German. + (I do not remember where the procedure comes from.) 
:param string1: first string :type string1: unicode @@ -26,10 +34,9 @@ def collate_function(string1, string2): b2 = string2.lower() c1 = b1 c2 = b2 - for f in [[u"ä", u"a"], [u"ö", u"o"], [u"ü", u"u"], [u"ß", u"ss"]]: - b1 = b1.replace(f[0], f[1]) - b2 = b2.replace(f[0], f[1]) - + for repl in REPLACEMENTS: + b1 = b1.replace(repl[0], repl[1]) + b2 = b2.replace(repl[0], repl[1]) if b1.encode("utf-16") == b2.encode("utf-16"): if c1.encode("utf-16") == c2.encode("utf-16"): return 0 diff --git a/penelope/command_line.py b/penelope/command_line.py index 6e46d47..f2fa6f6 100644 --- a/penelope/command_line.py +++ b/penelope/command_line.py @@ -15,7 +15,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" @@ -154,6 +154,12 @@ "action": "store" }, + { + "short": None, + "long": "--apply-css", + "help": "apply the given CSS file (epub and mobi output only)", + "action": "store" + }, { "short": None, "long": "--bookeen-collation-function", @@ -192,44 +198,44 @@ }, { "short": None, - "long": "--epub-escape-strings", - "help": "escape HTML strings (default: False)", + "long": "--epub-no-compress", + "help": "do not create the compressed container (epub output only, default: False)", "action": "store_true" }, { "short": None, - "long": "--epub-group-prefix-length", - "help": "group headwords by prefix of given length (default: 3)", - "action": "store" + "long": "--escape-strings", + "help": "escape HTML strings (default: False)", + "action": "store_true" }, { "short": None, - "long": "--epub-merge-group-size", - "help": "merge headword groups with less than this number of headwords (default: 128)", - "action": "store" + "long": "--flatten-synonyms", + "help": "flatten synonyms, creating a new entry with headword=synonym and using the definition of the original headword (default: 
False)", + "action": "store_true" }, { "short": None, - "long": "--epub-output-definitions", - "help": "output definitions in addition to the headwords (default: False)", - "action": "store_true" + "long": "--group-by-prefix-function", + "help": "compute the prefix of headwords using the given prefix function file", + "action": "store" }, { "short": None, - "long": "--flatten-synonyms", - "help": "flatten synonyms, creating a new entry with headword=synonym and using the definition of the original headword (default: False)", - "action": "store_true" + "long": "--group-by-prefix-length", + "help": "group headwords by prefix of given length (default: 2)", + "action": "store" }, { "short": None, - "long": "--input-file-encoding", - "help": "use the specified encoding for reading the raw contents of input file(s) (default: 'utf-8')", - "action": "store" + "long": "--group-by-prefix-merge-across-first", + "help": "merge headword groups even when the first character changes (default: False)", + "action": "store_true" }, { "short": None, - "long": "--input-parser", - "help": "use the specified parser function after reading the raw contents of input file(s)", + "long": "--group-by-prefix-merge-min-size", + "help": "merge headword groups until the given minimum number of headwords is reached (default: 0, meaning no merge will take place)", "action": "store" }, { @@ -244,6 +250,24 @@ "help": "ignore synonyms, not reading/writing them if present (default: False)", "action": "store_true" }, + { + "short": None, + "long": "--include-index-page", + "help": "include an index page (epub and mobi output only, default: False)", + "action": "store_true" + }, + { + "short": None, + "long": "--input-file-encoding", + "help": "use the specified encoding for reading the raw contents of input file(s) (default: 'utf-8')", + "action": "store" + }, + { + "short": None, + "long": "--input-parser", + "help": "use the specified parser function after reading the raw contents of input file(s)", + 
"action": "store" + }, { "short": None, "long": "--kindlegen-path", @@ -280,6 +304,12 @@ "help": "do not run kindlegen, keep .opf and .html files (default: False)", "action": "store_true" }, + { + "short": None, + "long": "--no-definitions", + "help": "do not output definitions for EPUB and MOBI formats (default: False)", + "action": "store_true" + }, { "short": None, "long": "--sd-ignore-sametypesequence", @@ -442,6 +472,7 @@ def set_default_values(args): def set_default_value(key, value): if not args.__contains__(key): args.__dict__[key] = value + set_default_value("apply_css", None) set_default_value("bookeen_collation_function", None) set_default_value("bookeen_install_file", False) set_default_value("csv_fs", ",") @@ -449,13 +480,16 @@ def set_default_value(key, value): set_default_value("csv_ls", "\n") set_default_value("debug", False) set_default_value("dictzip_path", None) - set_default_value("epub_escape_strings", False) - set_default_value("epub_group_prefix_length", 3) - set_default_value("epub_merge_group_size", 100) - set_default_value("epub_output_definitions", False) + set_default_value("epub_no_compress", False) + set_default_value("escape_strings", False) set_default_value("flatten_synonyms", False) + set_default_value("group_by_prefix_length", 2) + set_default_value("group_by_prefix_function", None) + set_default_value("group_by_prefix_merge_across_first", False) + set_default_value("group_by_prefix_merge_min_size", 0) set_default_value("ignore_case", False) set_default_value("ignore_synonyms", False) + set_default_value("include_index_page", False) set_default_value("input_file_encoding", "utf-8") set_default_value("input_parser", None) set_default_value("keep", False) @@ -465,6 +499,7 @@ def set_default_value(key, value): set_default_value("merge_definitions", False) set_default_value("merge_separator", " | ") set_default_value("mobi_no_kindlegen", False) + set_default_value("no_definitions", False) 
set_default_value("sd_ignore_sametypesequence", False) set_default_value("sd_no_dictzip", False) set_default_value("sort_after", False) diff --git a/penelope/dictionary.py b/penelope/dictionary.py index 81c9320..386e9da 100644 --- a/penelope/dictionary.py +++ b/penelope/dictionary.py @@ -16,15 +16,17 @@ from __future__ import absolute_import from io import open +import imp import os +from penelope.prefix_default import get_prefix as get_prefix_default from penelope.utilities import get_uuid from penelope.utilities import print_error __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" @@ -425,5 +427,95 @@ def default_merge_function(headword, definitions): # not needed, since we called self.clear() #self.sort(False, False, False, False) + def group( + self, + prefix_function=None, + prefix_function_path=None, + prefix_length=2, + merge_min_size=0, + merge_across_first=False + ): + """ + Group headwords by prefix, returning a dictionary containing + the prefixes as keys (possibly, with a "SPECIAL" key) and + the dictionary entries as elements of the list associated with a key. 
+ + :param prefix_function_path: the path to a source file containing + a get_prefix function, mapping a headword + and the prefix_length to the prefix; + if None, a default function will be used + :type prefix_function_path: path + :param prefix_function: the function, mapping a headword + and the prefix_length to the prefix; + if None, a default function will be used + :type prefix_function: function + :param prefix_length: the length of the prefixes + :type prefix_length: int + :param merge_min_size: merge headword groups until the given minimum + number of headwords is reached; if 0, does not merge + :type merge_min_size: int + :param merge_across_first: if True, merge groups even when + the first character changes + :type merge_across_first: bool + :rtype: (list, list, dict) + """ + def return_triple(groups): + """ + Return a (list_special, list, dict), + where the list contains the sorted keys of dict, + and list_special contains the list of SPECIAL entries. + """ + spec = None + if u"SPECIAL" in groups: + spec = groups[u"SPECIAL"] + del groups[u"SPECIAL"] + keys = sorted(groups.keys()) + return (spec, keys, groups) + + # load the prefix function + get_prefix = get_prefix_default + if prefix_function is not None: + get_prefix = prefix_function + elif prefix_function_path is not None: + try: + get_prefix = imp.load_source("", prefix_function_path).get_prefix + except: + pass + + # create groups + raw_groups = {} + for index in self.entries_index_sorted: + entry = self.entries[index] + prefix = get_prefix(entry.headword, prefix_length) + if not prefix in raw_groups: + raw_groups[prefix] = [] + raw_groups[prefix].append(self.entries[index]) + + # if no merge is requested, return + if merge_min_size == 0: + return return_triple(raw_groups) + + # merge small groups + merged_groups = {} + if u"SPECIAL" in raw_groups: + # special is never merged + merged_groups[u"SPECIAL"] = raw_groups[u"SPECIAL"] + del raw_groups[u"SPECIAL"] + keys = sorted(raw_groups.keys()) + 
accumulator_key = keys[0] + accumulator = raw_groups[accumulator_key] + for key in keys[1:]: + if ( + (len(accumulator) >= merge_min_size) or + ((not merge_across_first) and (key[0] != accumulator_key[0])) + ): + merged_groups[accumulator_key] = accumulator + accumulator_key = key + accumulator = raw_groups[accumulator_key] + else: + accumulator += raw_groups[key] + merged_groups[accumulator_key] = accumulator + return return_triple(merged_groups) + diff --git a/penelope/dictionary_ebook.py b/penelope/dictionary_ebook.py new file mode 100644 index 0000000..8c9ca3a --- /dev/null +++ b/penelope/dictionary_ebook.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python +# coding=utf-8 + +""" +DictionaryEbook represents a dictionary ebook +in EPUB 2 and MOBI format. +""" + +from __future__ import absolute_import +from __future__ import print_function +from io import open +import os +import zipfile + +from penelope.utilities import create_temp_directory +from penelope.utilities import delete_directory + +__author__ = "Alberto Pettarin" +__copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" +__license__ = "MIT" +__version__ = "3.1.0" +__email__ = "alberto@albertopettarin.it" +__status__ = "Production" + +class DictionaryEbook(): + """ + A class representing a generic ebook containing a dictionary. + + It can be used to output a MOBI or an EPUB 2 container. + + The ebook must have an OPF, and one or more group XHTML files. + + Optionally, it can have a cover image, an NCX TOC, an index XHTML file. + + The actual file templates are provided by the caller. 
+ """ + + EPUB2 = u"epub2" + + #EPUB3 = u"epub3" + + MOBI = u"mobi" + + GROUP_START_INDEX = 2 + + MIMETYPE_CONTENTS = u"application/epub+zip" + + CONTAINER_XML_CONTENTS = u""" + + + + +""" + + EPUB_CSS_CONTENTS = u"""@charset "UTF-8"; +body { + margin: 10px 25px 10px 25px; +} +h1 { + font-size: 200%; +} +h2 { + font-size: 150%; +} +p { + margin-left: 0em; + margin-right: 0em; + margin-top: 0em; + margin-bottom: 0em; + line-height: 2em; + text-align: justify; +} +a, a:focus, a:active, a:visited { + color: black; + text-decoration: none; +} +body.indexPage {} +h1.indexTitle {} +p.indexGroups { + font-size: 150%; +} +span.indexGroup {} +body.groupPage {} +h1.groupTitle {} +div.groupNavigation {} +span.groupHeadword {} +div.groupEntry { + margin-top: 0; + margin-bottom: 1em; +} +h2.groupHeadword { + margin-left: 5%; +} +p.groupDefinition { + margin-left: 10%; + margin-right: 10%; +} +""" + + MOBI_CSS_CONTENTS = u""""@charset "UTF-8";""" + + INDEX_XHTML_TEMPLATE = u""" + + + + %s + + + +

%s

+

+%s +

+ +""" + INDEX_XHTML_LINK_TEMPLATE = u""" %s""" + + INDEX_XHTML_LINK_JOINER = u" •\n" + + EPUB_GROUP_XHTML_TEMPLATE = u""" + + + + %s + + + +

%s

+ +%s + +""" + EPUB_GROUP_XHTML_INDEX_LINK = u""" [ Index ]""" + + EPUB_GROUP_XHTML_WORD_TEMPLATE = u""" %s""" + + EPUB_GROUP_XHTML_WORD_JOINER = u" •\n" + + EPUB_GROUP_XHTML_WORD_DEFINITION_TEMPLATE = u"""
+

%s

+

%s

+
""" + + EPUB_GROUP_XHTML_WORD_DEFINITION_JOINER = u"\n" + + MOBI_GROUP_XHTML_TEMPLATE = u""" + + + + %s + + + +

%s

+ +%s + +""" + + MOBI_GROUP_XHTML_INDEX_LINK = u""" [ Index ]""" + + MOBI_GROUP_XHTML_WORD_TEMPLATE = u""" %s""" + + MOBI_GROUP_XHTML_WORD_JOINER = u" •\n" + + MOBI_GROUP_XHTML_WORD_DEFINITION_TEMPLATE = u"""
+ +

%s

+

%s

+
+
""" + + MOBI_GROUP_XHTML_WORD_DEFINITION_JOINER = u"\n" + + EPUB2_OPF_TEMPLATE = u""" + + + %s + %s + %s + %s + %s + %s-01-01 +%s + + +%s + + +%s + +""" + + MOBI_OPF_TEMPLATE = u""" + + + + %s + %s + %s + %s + %s + Dictionaries + + + + %s + %s + %s + + + +%s + + +%s + + + +""" + + OPF_MANIFEST_ITEM_TEMPLATE = u""" """ + + OPF_SPINE_ITEMREF_TEMPLATE = u""" """ + + NCX_TEMPLATE = u""" + + + + + + + + + + %s + + +%s + +""" + + NCX_NAVPOINT_TEMPLATE = u""" + + %s + + + """ + + def __init__(self, ebook_format, args): + self.ebook_format = ebook_format + self.args = args + self.root_directory_path = None + self.cover = None + self.files = [] + self.manifest_files = [] + self.groups = [] + + def get_tmp_path(self): + if self.root_directory_path is not None: + return self.root_directory_path + return u"" + + def delete(self): + if self.root_directory_path is not None: + delete_directory(self.root_directory_path) + + def add_file(self, relative_path, contents, mode=zipfile.ZIP_DEFLATED): + file_path = os.path.join(self.root_directory_path, relative_path) + file_obj = open(file_path, "wb") + try: + # Python 2 + if isinstance(contents, unicode): + contents = contents.encode("utf-8") + except NameError: + # Python 3 + if isinstance(contents, str): + contents = contents.encode("utf-8") + except: + # should not occur + pass + file_obj.write(contents) + file_obj.close() + self.files.append({"path": relative_path, "mode": mode}) + + def write_cover(self, cover_path_absolute): + if cover_path_absolute is not None: + try: + basename = os.path.basename(cover_path_absolute) + cover_obj = open(cover_path_absolute, "rb") + cover = cover_obj.read() + cover_obj.close() + b = basename.lower() + mimetype = "image/jpeg" + if b.endswith(".png"): + mimetype = "image/png" + elif b.endswith(".gif"): + mimetype = "image/gif" + self.add_file_manifest(u"OEBPS/%s" % basename, basename, cover, mimetype) + self.cover = basename + except: + pass + + def write_css(self, custom_css_path_absolute): + if 
self.ebook_format == self.MOBI: + css = self.MOBI_CSS_CONTENTS + else: + css = self.EPUB_CSS_CONTENTS + if custom_css_path_absolute is not None: + try: + css_obj = open(custom_css_path_absolute, "rb") + css = css_obj.read() + css_obj.close() + except: + pass + self.add_file_manifest(u"OEBPS/style.css", u"style.css", css, "text/css") + + def add_file_manifest(self, relative_path, id, contents, mimetype): + self.add_file(relative_path, contents) + self.manifest_files.append({"path": relative_path, "id": id, "mimetype": mimetype}) + + def get_group_xhtml_file_name_from_index(self, index): + if (index < self.GROUP_START_INDEX) or (index >= len(self.groups) + self.GROUP_START_INDEX): + return u"#groupPage" + return u"g%06d.xhtml" % index + + def add_group(self, key, entries): + self.groups.append({"key": key, "entries": entries}) + + def write_groups(self): + if self.ebook_format == self.MOBI: + group_template = self.MOBI_GROUP_XHTML_TEMPLATE + if self.args.include_index_page: + index_link = self.MOBI_GROUP_XHTML_INDEX_LINK + else: + index_link = u"" + word_template = self.MOBI_GROUP_XHTML_WORD_TEMPLATE + word_joiner = self.MOBI_GROUP_XHTML_WORD_JOINER + word_definition_template = self.MOBI_GROUP_XHTML_WORD_DEFINITION_TEMPLATE + word_definition_joiner = self.MOBI_GROUP_XHTML_WORD_DEFINITION_JOINER + else: + group_template = self.EPUB_GROUP_XHTML_TEMPLATE + if self.args.include_index_page: + index_link = self.EPUB_GROUP_XHTML_INDEX_LINK + else: + index_link = u"" + word_template = self.EPUB_GROUP_XHTML_WORD_TEMPLATE + word_joiner = self.EPUB_GROUP_XHTML_WORD_JOINER + word_definition_template = self.EPUB_GROUP_XHTML_WORD_DEFINITION_TEMPLATE + word_definition_joiner = self.EPUB_GROUP_XHTML_WORD_DEFINITION_JOINER + + index = self.GROUP_START_INDEX + for group in self.groups: + group_label = self.get_group_label(group) + group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) + previous_link = self.get_group_xhtml_file_name_from_index(index - 1) + next_link = 
self.get_group_xhtml_file_name_from_index(index + 1) + group_contents = [] + if self.args.no_definitions: + for entry in group["entries"]: + headword = self.escape_if_needed(entry.headword) + group_contents.append(word_template % (headword)) + group_contents = word_joiner.join(group_contents) + else: + for entry in group["entries"]: + headword = self.escape_if_needed(entry.headword) + definition = self.escape_if_needed(entry.definition) + group_contents.append(word_definition_template % (headword, definition)) + group_contents = word_definition_joiner.join(group_contents) + group_contents = group_template % (group_label, group_label, previous_link, index_link, next_link, group_contents) + self.add_file_manifest(u"OEBPS/%s" % group_xhtml_path, group_xhtml_path, group_contents, u"application/xhtml+xml") + index += 1 + + def escape_if_needed(self, string): + def html_escape(s): + x = s + x = x.replace("&", "&") + x = x.replace('"', """) + x = x.replace("'", "'") + x = x.replace(">", ">") + x = x.replace("<", "<") + return x + if self.args.escape_strings: + return html_escape(string) + return string + + def get_group_label(self, group): + group_label = group["key"] + if group_label != u"SPECIAL": + group_label = "%s–%s" % (group["entries"][0].headword, group["entries"][-1].headword) + return group_label + + def write_index(self): + links = [] + index = self.GROUP_START_INDEX + for group in self.groups: + group_label = self.get_group_label(group) + group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) + group_link = self.INDEX_XHTML_LINK_TEMPLATE % (group_xhtml_path, group_label) + links.append(group_link) + index += 1 + links = self.INDEX_XHTML_LINK_JOINER.join(links) + contents = self.INDEX_XHTML_TEMPLATE % (self.args.title, self.args.title, links) + self.add_file_manifest(u"OEBPS/index.xhtml", u"index.xhtml", contents, u"application/xhtml+xml") + + def write_opf(self): + manifest_contents = [] + spine_contents = [] + for mi in self.manifest_files: + 
manifest_contents.append(self.OPF_MANIFEST_ITEM_TEMPLATE % (mi["id"], mi["id"], mi["mimetype"])) + if mi["mimetype"] == u"application/xhtml+xml": + spine_contents.append(self.OPF_SPINE_ITEMREF_TEMPLATE % (mi["id"])) + manifest_contents = u"\n".join(manifest_contents) + spine_contents = u"\n".join(spine_contents) + cover = u"" + if self.ebook_format == self.MOBI: + if self.cover is not None: + cover = self.cover + opf_contents = self.MOBI_OPF_TEMPLATE % ( + self.args.title, + self.args.language_from, + self.args.identifier, + self.args.author, + self.args.copyright, + self.args.language_from, + self.args.language_to, + cover, + manifest_contents, + spine_contents + ) + else: + if self.cover is not None: + cover = u""" """ % self.cover + opf_contents = self.EPUB2_OPF_TEMPLATE % ( + self.args.identifier, + self.args.language_from, + self.args.title, + self.args.author, + self.args.copyright, + self.args.year, + cover, + manifest_contents, + spine_contents + ) + self.add_file("OEBPS/content.opf", opf_contents) + + def write_ncx(self): + ncx_items = [] + index = 1 + if self.args.include_index_page: + ncx_items.append(self.NCX_NAVPOINT_TEMPLATE % (index, index, "Index", "index.xhtml")) + index += 1 + for group in self.groups: + group_label = self.get_group_label(group) + group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) + ncx_items.append(self.NCX_NAVPOINT_TEMPLATE % (index, index, group_label, group_xhtml_path)) + index += 1 + ncx_items = u"\n".join(ncx_items) + ncx_contents = self.NCX_TEMPLATE % (self.args.identifier, self.args.title, ncx_items) + self.add_file_manifest(u"OEBPS/toc.ncx", u"toc.ncx", ncx_contents, u"application/x-dtbncx+xml") + + def write(self, file_path_absolute, compress=True): + # get cover path + cover_path_absolute = self.args.cover_path + if cover_path_absolute is not None: + cover_path_absolute = os.path.abspath(cover_path_absolute) + + # get custom css path + custom_css_path_absolute = self.args.apply_css + if 
custom_css_path_absolute is not None: + custom_css_path_absolute = os.path.abspath(custom_css_path_absolute) + + # create new tmp directory and cd there + self.root_directory_path = create_temp_directory() + cwd = os.getcwd() + os.chdir(self.root_directory_path) + os.makedirs(u"META-INF") + os.makedirs(u"OEBPS") + + # add mimetype and container.xml + if self.ebook_format in [self.EPUB2]: # add EPUB3 here + self.add_file(u"mimetype", self.MIMETYPE_CONTENTS, mode=zipfile.ZIP_STORED) + self.add_file(u"META-INF/container.xml", self.CONTAINER_XML_CONTENTS) + + # add cover + self.write_cover(cover_path_absolute) + + # write CSS + self.write_css(custom_css_path_absolute) + + # write index + if self.args.include_index_page: + self.write_index() + + # write groups + self.write_groups() + + # write ncx + if self.ebook_format in [self.EPUB2]: # add EPUB3 here + self.write_ncx() + + # write opf + self.write_opf() + + # compress + if compress: + output_file_obj = zipfile.ZipFile(file_path_absolute, "w", compression=zipfile.ZIP_DEFLATED) + for file_to_compress in self.files: + output_file_obj.write(file_to_compress["path"], compress_type=file_to_compress["mode"]) + output_file_obj.close() + + # return to previous cwd + os.chdir(cwd) + + diff --git a/penelope/format_bookeen.py b/penelope/format_bookeen.py index 536534a..8db3eac 100644 --- a/penelope/format_bookeen.py +++ b/penelope/format_bookeen.py @@ -12,6 +12,7 @@ import sqlite3 import zipfile +from penelope.collation_default import collate_function as collate_function_default from penelope.utilities import print_debug from penelope.utilities import print_error from penelope.utilities import print_info @@ -22,14 +23,13 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" CHUNK_FILE_PREFIX = "c_" CHUNK_SIZE = 262144 # 2^18 
-COLLATION_DEFAULT = os.path.join(os.path.split(__file__)[0], "collation_default.py") -EMPTY_FILE_PATH = os.path.join(os.path.split(__file__)[0], "res/empty.idx") +EMPTY_FILE_PATH = os.path.join(os.path.split(os.path.abspath(__file__))[0], "res/empty.idx") HEADER = "]>" def read(dictionary, args, input_file_string): @@ -166,9 +166,19 @@ def write(dictionary, args, output_file_path): # result to be returned result = None + # get absolute path + output_file_path_absolute = os.path.abspath(output_file_path) + + # get absolute path for collation function file + bookeen_collation_function_path = None + if args.bookeen_collation_function is not None: + bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function) + # create tmp directory + cwd = os.getcwd() tmp_path = create_temp_directory() print_debug("Working in temp dir '%s'" % (tmp_path), args.debug) + os.chdir(tmp_path) # get the basename base = os.path.basename(output_file_path) @@ -176,22 +186,22 @@ def write(dictionary, args, output_file_path): base = base[:-4] # copy empty.idx into tmp_path - idx_file_path = os.path.join(tmp_path, base + u".dict.idx") - dict_file_path = os.path.join(tmp_path, base + u".dict") + idx_file_path = base + u".dict.idx" + dict_file_path = base + u".dict" copy_file(EMPTY_FILE_PATH, idx_file_path) # open index sql_connection = sqlite3.connect(idx_file_path) # install collation in the index - collation = imp.load_source("", COLLATION_DEFAULT) - if args.bookeen_collation_function is not None: + collation_function = collate_function_default + if bookeen_collation_function_path is not None: try: - collation = imp.load_source("", args.bookeen_collation_function) - print_debug("Using collation function from '%s'" % (args.bookeen_collation_function), args.debug) + collation_function = imp.load_source("", bookeen_collation_function_path).collate_function + print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug) except: - 
print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (args.bookeen_collation_function)) - sql_connection.create_collation("IcuNoCase", collation.collate_function) + print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path)) + sql_connection.create_collation("IcuNoCase", collation_function) sql_connection.text_factory = str # get a cursor and delete any data from the index file @@ -204,7 +214,7 @@ def write(dictionary, args, output_file_path): files_to_compress = [] current_offset = 0 chunk_index = 1 - chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)) + chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index) files_to_compress.append(chunk_file_path) chunk_file_obj = open(chunk_file_path, "wb") for entry_index in dictionary.entries_index_sorted: @@ -226,7 +236,7 @@ def write(dictionary, args, output_file_path): if current_offset > CHUNK_SIZE: chunk_file_obj.close() chunk_index += 1 - chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)) + chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index) files_to_compress.append(chunk_file_path) chunk_file_obj = open(chunk_file_path, "wb") current_offset = 0 @@ -235,14 +245,11 @@ def write(dictionary, args, output_file_path): # compress print_debug("Compressing c_* files...", args.debug) - cwd = os.getcwd() - os.chdir(tmp_path) file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED) for file_to_compress in files_to_compress: file_to_compress = os.path.basename(file_to_compress) file_zip_obj.write(file_to_compress) file_zip_obj.close() - os.chdir(cwd) print_debug("Compressing c_* files... 
done", args.debug) # update index metadata @@ -269,18 +276,15 @@ def write(dictionary, args, output_file_path): sql_connection.close() # create .install file or copy .dict.idx and .dict into requested output directory - parent_output_directory = os.path.split(output_file_path)[0] + parent_output_directory = os.path.split(output_file_path_absolute)[0] if args.bookeen_install_file: print_debug("Creating .install file...", args.debug) - cwd = os.getcwd() - os.chdir(tmp_path) file_zip_path = os.path.join(parent_output_directory, base + u".install") file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED) for file_to_compress in [dict_file_path, idx_file_path]: file_to_compress = os.path.basename(file_to_compress) file_zip_obj.write(file_to_compress) file_zip_obj.close() - os.chdir(cwd) result = [file_zip_path] print_debug("Creating .install file... done", args.debug) else: @@ -293,6 +297,7 @@ def write(dictionary, args, output_file_path): print_debug("Copying .dict.idx and .dict files... 
done", args.debug) # delete tmp directory + os.chdir(cwd) if args.keep: print_info("Not deleting temp dir '%s'" % (tmp_path)) else: diff --git a/penelope/format_csv.py b/penelope/format_csv.py index 630aeb9..04dff88 100644 --- a/penelope/format_csv.py +++ b/penelope/format_csv.py @@ -14,7 +14,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/format_epub.py b/penelope/format_epub.py index ac94a8f..40fdfce 100644 --- a/penelope/format_epub.py +++ b/penelope/format_epub.py @@ -12,395 +12,77 @@ import os import zipfile +from penelope.dictionary_ebook import DictionaryEbook +from penelope.utilities import create_temp_directory +from penelope.utilities import delete_directory from penelope.utilities import print_debug from penelope.utilities import print_error from penelope.utilities import print_info -from penelope.utilities import create_temp_directory -from penelope.utilities import copy_file -from penelope.utilities import delete_directory -from penelope.utilities import rename_file __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" -CONTAINER_TEMPLATE = u""" - - - - -""" - -OPF_TEMPLATE = u""" - - - %s - %s - %s - %s - %s - %s-01-01 - - - - -%s - - -%s - -""" - -MANIFEST_ITEM_TEMPLATE = u""" """ - -SPINE_ITEM_TEMPLATE = u""" """ - -NCX_TEMPLATE = u""" - - - - - - - - - - %s - - -%s - -""" - -NCX_NAVPOINT_TEMPLATE = u""" - - %s - - - """ - -CSS_TEMPLATE = u"""@charset "UTF-8"; -body { - margin: 10px 25px 10px 25px; -} -h1 { - font-size: 200%; -} -p { - margin-left: 0em; - margin-right: 0em; - margin-top: 0em; - margin-bottom: 0em; - line-height: 2em; - 
text-align: justify; -} -a, a:focus, a:active, a:visited { - color: black; - text-decoration: none; -} -/* -span { - margin: 0px 10px 0px 10px; - padding: 2px 2px 2px 2px; - border: solid 1px black; -} -body.index { - margin: 10px 50px 10px 50px; -} -body.letter { - margin: 10px 50px 10px 50px; -} -*/ -p.index { - font-size: 150%; -} -p.letter { - font-size: 150%; -} - -div p { - margin-left: 25px; - margin-rigth: 25px; -} - -div { - margin-top: 10px; - margin-bottom: 10px; -}""" - -INDEX_XHTML_TEMPLATE = u""" - - - - %s - - - -

%s

-

-%s -

- -""" - -INDEX_XHTML_LINK_TEMPLATE = u""" %s""" - -GROUP_XHTML_TEMPLATE = u""" - - - - %s - - - -

%s

-

- [ Previous ] - [ Index ] - [ Next ] -

-%s - -""" - -GROUP_XHTML_WORD_TEMPLATE = u""" %s""" - -GROUP_XHTML_WORD_DEFINITION_TEMPLATE = u"""
-

%s

-

%s

-
""" - def read(dictionary, args, input_file_paths): print_error("Read function not implemented for EPUB dictionaries") return None def write(dictionary, args, output_file_path): - def get_prefix(headword, length): - lowercased = headword.lower() - if ord(lowercased[0]) < 97: - return u"SPECIAL" - if len(lowercased) < length: - return lowercased - return lowercased[0:length] - - def html_escape(s): - x = s - x = x.replace("&", "&") - x = x.replace('"', """) - x = x.replace("'", "'") - x = x.replace(">", ">") - x = x.replace("<", "<") - return x - # result to be returned result = None - # create tmp directory - cwd = os.getcwd() - tmp_path = create_temp_directory() - os.chdir(tmp_path) - - # get the basename - files_to_compress = [] - base = os.path.basename(output_file_path) - if base.endswith(".epub"): - base = base[:-5] - - # create directories - os.makedirs(u"META-INF") - os.makedirs(u"OEBPS") - - # create mimetype - file_mimetype_rel_path = u"mimetype" - file_mimetype_obj = open(file_mimetype_rel_path, "wb") - file_mimetype_obj.write(u"application/epub+zip") - file_mimetype_obj.close() - - # create container.xml - file_container_rel_path = u"META-INF/container.xml" - file_container_obj = open(file_container_rel_path, "wb") - file_container_obj.write(CONTAINER_TEMPLATE.encode("utf-8")) - file_container_obj.close() - files_to_compress.append(file_container_rel_path) + # get absolute path + output_file_path_absolute = os.path.abspath(output_file_path) # sort by headword, optionally ignoring case dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case) # create groups - all_entries = [] - groups = {} - i = 0 - for index in dictionary.entries_index_sorted: - entry = dictionary.entries[index] - all_entries.append(entry) - prefix = get_prefix(entry.headword, int(args.epub_group_prefix_length)) - if not prefix in groups: - groups[prefix] = [] - groups[prefix].append(i) - i += 1 - - # merge small groups - merged_groups = [] - keys = sorted(groups.keys()) - 
accumulator_key = keys[0] - accumulator = groups[accumulator_key] - for key in keys[1:]: - if (len(accumulator) >= int(args.epub_merge_group_size)) or (key[0] != accumulator_key[0]): - merged_groups.append([accumulator_key, accumulator]) - accumulator_key = key - accumulator = groups[accumulator_key] - else: - accumulator += groups[key] - merged_groups.append([accumulator_key, accumulator]) - - # create xhtml files - manifest_items = [] - spine_items = [] - ncx_items = [] - - i = 1 - file_xhtml_rel_path_base = u"index.xhtml" - file_xhtml_rel_path = u"OEBPS/%s" % file_xhtml_rel_path_base - file_xhtml_obj = open(file_xhtml_rel_path, "wb") - j = 2 - group_links = [] - for group in merged_groups: - key = group[0] - group_links.append(INDEX_XHTML_LINK_TEMPLATE % (u"g%06d.xhtml" % (j), key)) - j += 1 - xhtml_content = INDEX_XHTML_TEMPLATE % ( - args.title, - args.title, - " •\n".join(group_links) + special_group, group_keys, group_dict = dictionary.group( + prefix_function_path=args.group_by_prefix_function, + prefix_length=int(args.group_by_prefix_length), + merge_min_size=int(args.group_by_prefix_merge_min_size), + merge_across_first=args.group_by_prefix_merge_across_first ) - file_xhtml_obj.write(xhtml_content.encode("utf-8")) - file_xhtml_obj.close() - files_to_compress.append(file_xhtml_rel_path) - manifest_items.append(MANIFEST_ITEM_TEMPLATE % (file_xhtml_rel_path_base, file_xhtml_rel_path_base)) - spine_items.append(SPINE_ITEM_TEMPLATE % (file_xhtml_rel_path_base)) - ncx_items.append(NCX_NAVPOINT_TEMPLATE % (i, i, "Table of Contents", file_xhtml_rel_path_base)) - - i = 2 - for group in merged_groups: - key = group[0] - entry_indices = group[1] - file_xhtml_rel_path_base = u"g%06d.xhtml" % i - file_xhtml_rel_path = u"OEBPS/%s" % file_xhtml_rel_path_base - file_xhtml_obj = open(file_xhtml_rel_path, "wb") - page_title = u"%s" % (key) - if i == 2: - prev_path = u"#" - else: - prev_path = u"g%06d.xhtml" % (i - 1) - if i + 1 < len(merged_groups) + 2: - next_path = 
u"g%06d.xhtml" % (i + 1) + all_group_keys = group_keys + if special_group is not None: + all_group_keys += [u"SPECIAL"] + + # create epub object + epub = DictionaryEbook(ebook_format=DictionaryEbook.EPUB2, args=args) + + # add groups + for key in all_group_keys: + if key == u"SPECIAL": + group_entries = special_group else: - next_path = u"#" - words = [] - for entry_index in entry_indices: - if args.epub_output_definitions: - headword = all_entries[entry_index].headword - if args.epub_escape_strings: - headword = html_escape(headword) - definition = all_entries[entry_index].definition - if args.epub_escape_strings: - definition = html_escape(definition) - words.append(GROUP_XHTML_WORD_DEFINITION_TEMPLATE % (headword, definition)) - else: - headword = all_entries[entry_index].headword - if args.epub_escape_strings: - headword = html_escape(headword) - words.append(GROUP_XHTML_WORD_TEMPLATE % (headword)) - if args.epub_output_definitions: - words = u"\n".join(words) - else: - words = u"

%s

" % (u" •\n".join(words)) - xhtml_content = GROUP_XHTML_TEMPLATE % ( - page_title, - page_title, - prev_path, - next_path, - words - ) - file_xhtml_obj.write(xhtml_content.encode("utf-8")) - file_xhtml_obj.close() - files_to_compress.append(file_xhtml_rel_path) - manifest_items.append(MANIFEST_ITEM_TEMPLATE % (file_xhtml_rel_path_base, file_xhtml_rel_path_base)) - spine_items.append(SPINE_ITEM_TEMPLATE % (file_xhtml_rel_path_base)) - ncx_items.append(NCX_NAVPOINT_TEMPLATE % (i, i, key, file_xhtml_rel_path_base)) - i += 1 - - manifest_items = "\n".join(manifest_items) - spine_items = "\n".join(spine_items) - ncx_items = "\n".join(ncx_items) - - # create content.opf - file_opf_rel_path = u"OEBPS/content.opf" - file_opf_obj = open(file_opf_rel_path, "wb") - opf_content = OPF_TEMPLATE % ( - args.identifier, - args.language_from, - args.title, - args.author, - args.copyright, - args.year, - manifest_items, - spine_items - ) - file_opf_obj.write((opf_content).encode("utf-8")) - file_opf_obj.close() - files_to_compress.append(file_opf_rel_path) - - # create toc.ncx - file_ncx_rel_path = u"OEBPS/toc.ncx" - file_ncx_obj = open(file_ncx_rel_path, "wb") - ncx_content = NCX_TEMPLATE % ( - args.identifier, - args.title, - ncx_items - ) - file_ncx_obj.write((ncx_content).encode("utf-8")) - file_ncx_obj.close() - files_to_compress.append(file_ncx_rel_path) - - # create style.css - file_css_rel_path = u"OEBPS/style.css" - file_css_obj = open(file_css_rel_path, "wb") - file_css_obj.write((CSS_TEMPLATE).encode("utf-8")) - file_css_obj.close() - files_to_compress.append(file_css_rel_path) - - # TODO copy cover - #file_cover_rel_path = u"cover" - #file_cover_path = os.path.join(tmp_path, file_cover_rel_path) - #if args.cover_path is not None: - # if os.path.exists(args.cover_path): - # file_cover_rel_path = os.path.basename(args.cover_path) - # file_cover_path = os.path.join(tmp_path, file_cover_rel_path) - # copy_file(args.cover_path, file_cover_path) - # else: - # 
print_error("Unable to read cover file '%s'" % (args.cover_path)) - #else: - # print_error("No cover image file specified: generating EPUB without cover") - # print_error("Use --cover-path to specify a cover image file") + group_entries = group_dict[key] + epub.add_group(key, group_entries) # create output file - output_file_obj = zipfile.ZipFile(output_file_path, "w", compression=zipfile.ZIP_DEFLATED) - output_file_obj.write(file_mimetype_rel_path, compress_type=zipfile.ZIP_STORED) - for file_to_compress in files_to_compress: - output_file_obj.write(file_to_compress) - output_file_obj.close() - os.chdir(cwd) - result = [output_file_path] + if args.epub_no_compress: + print_debug("Not compressing the EPUB container") + epub.write(output_file_path_absolute, compress=False) + else: + print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug) + epub.write(output_file_path_absolute, compress=True) + result = [output_file_path] + print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug) # delete tmp directory - if args.keep: + tmp_path = epub.get_tmp_path() + if args.epub_no_compress: + print_info("The uncompressed EPUB is inside dir '%s'" % (tmp_path)) + result = [tmp_path] + elif args.keep: print_info("Not deleting temp dir '%s'" % (tmp_path)) + if result is None: + result = [tmp_path] else: - delete_directory(tmp_path) + epub.delete() print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug) return result diff --git a/penelope/format_kobo.py b/penelope/format_kobo.py index 9022256..a15fea6 100644 --- a/penelope/format_kobo.py +++ b/penelope/format_kobo.py @@ -20,11 +20,13 @@ from __future__ import absolute_import from io import open +import imp import gzip import os import subprocess import zipfile +from penelope.prefix_kobo import get_prefix as get_prefix_kobo from penelope.utilities import create_temp_directory from penelope.utilities import create_temp_file from penelope.utilities import delete_directory @@ 
-37,7 +39,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" @@ -122,56 +124,45 @@ def read_single_file(dictionary, args, input_file_path): return dictionary def write(dictionary, args, output_file_path): - def is_allowed(ch): - # all non-ascii (x > 127) are ok - # all ASCII lowercase letters (97 <= x <= 122) are ok - # everything else is not ok - code = ord(ch) - return (code > 127) or ((code >= 97) and (code <= 122)) - - def compute_prefix(headword): - # defaults to u"11" if the first two letters of headword are not valid - prefix = u"11" - headword = headword.lower() - if len(headword) > 0: - if len(headword) == 1: - # for single-letter headwords, append an 'a' at the end - # e.g. "9" => "9a" - headword += u"a" - if is_allowed(headword[0]) and is_allowed(headword[1]): - prefix = headword[0:2] - return prefix - # result to be returned result = None + # get absolute path + output_file_path_absolute = os.path.abspath(output_file_path) + + # create tmp directory + cwd = os.getcwd() + tmp_path = create_temp_directory() + print_debug("Working in temp dir '%s'" % (tmp_path), args.debug) + os.chdir(tmp_path) + # sort by headword dictionary.sort(by_headword=True) # group by prefix files_to_compress = [] - prefix_to_file = {} - for headword in dictionary.entries_index: - prefix = compute_prefix(headword) - if not prefix in prefix_to_file: - prefix_to_file[prefix] = [] - prefix_to_file[prefix] += [headword] - - # create tmp directory - tmp_path = create_temp_directory() - print_debug("Working in temp dir '%s'" % (tmp_path), args.debug) + prefix_length = int(args.group_by_prefix_length) + special_group, group_keys, group_dict = dictionary.group( + prefix_function=get_prefix_kobo, + prefix_length=prefix_length, + merge_min_size=int(args.group_by_prefix_merge_min_size), + 
merge_across_first=args.group_by_prefix_merge_across_first + ) + if special_group is not None: + special_group_key = u"1" * prefix_length + group_dict[special_group_key] = special_group + group_keys = [special_group_key] + group_keys # write files - for prefix in sorted(prefix_to_file): + for key in group_keys: # write html file - file_html_path = os.path.join(tmp_path, prefix + u".html") + file_html_path = key + u".html" file_html_obj = open(file_html_path, "wb") file_html_obj.write(u"".encode("utf-8")) - for headword in prefix_to_file[prefix]: - entries = dictionary.entries_index[headword] - for entry_index in entries: - definition = dictionary.entries[entry_index].definition - file_html_obj.write((u"
%s
%s
" % (headword, headword, definition)).encode("utf-8")) + for entry in group_dict[key]: + headword = entry.headword + definition = entry.definition + file_html_obj.write((u"
%s
%s
" % (headword, headword, definition)).encode("utf-8")) file_html_obj.write((u"").encode("utf-8")) file_html_obj.close() @@ -189,8 +180,8 @@ def compute_prefix(headword): rename_file(file_gz_path, file_html_path) files_to_compress.append(file_html_path) - # TODO write words - file_words_path = os.path.join(tmp_path, WORDS_FILE_NAME) + # write words + file_words_path = WORDS_FILE_NAME keys = sorted(dictionary.entries_index.keys()) try: import marisa_trie @@ -231,22 +222,20 @@ def compute_prefix(headword): # add file_words_path to files to compress files_to_compress.append(file_words_path) # create output zip file - cwd = os.getcwd() try: - os.chdir(tmp_path) - print_debug("Writing to file '%s'..." % (output_file_path), args.debug) - file_zip_obj = zipfile.ZipFile(output_file_path, "w", zipfile.ZIP_DEFLATED) + print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug) + file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED) for file_to_compress in files_to_compress: file_to_compress = os.path.basename(file_to_compress) file_zip_obj.write(file_to_compress) file_zip_obj.close() result = [output_file_path] - print_debug("Writing to file '%s'... success" % (output_file_path), args.debug) + print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug) except: - print_error("Writing to file '%s'... failure" % (output_file_path)) - os.chdir(cwd) + print_error("Writing to file '%s'... 
failure" % (output_file_path_absolute)) # delete tmp directory + os.chdir(cwd) if args.keep: print_info("Not deleting temp dir '%s'" % (tmp_path)) else: diff --git a/penelope/format_mobi.py b/penelope/format_mobi.py index 76e8848..6e012e3 100644 --- a/penelope/format_mobi.py +++ b/penelope/format_mobi.py @@ -12,76 +12,23 @@ import os import subprocess +from penelope.dictionary_ebook import DictionaryEbook from penelope.utilities import print_debug from penelope.utilities import print_error from penelope.utilities import print_info from penelope.utilities import create_temp_directory from penelope.utilities import copy_file from penelope.utilities import delete_directory -from penelope.utilities import rename_file __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" KINDLEGEN = u"kindlegen" -HTML_HEADER = u""" - - - %s - - -
-
- %s -
-
- -""" - -HTML_FOOTER = u""" -""" - -HTML_WORD = u""" - -

%s

-

%s

-
- -""" - -OPF_TEMPLATE = u""" - - - - %s - %s - %s - %s - %s - Dictionaries - - - - %s - %s - %s - - - - - - - - - - -""" - def read(dictionary, args, input_file_paths): print_error("Read function not implemented for MOBI dictionaries") return None @@ -90,67 +37,42 @@ def write(dictionary, args, output_file_path): # result to be returned result = None + # get absolute path + output_file_path_absolute = os.path.abspath(output_file_path) + # sort by headword, optionally ignoring case dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case) - # create tmp directory - tmp_path = create_temp_directory() - - # get the basename - base = os.path.basename(output_file_path) - if base.endswith(".mobi"): - base = base[:-5] - file_mobi_rel_path = base + u".mobi" - file_html_path = os.path.join(tmp_path, file_mobi_rel_path) - - # copy cover - file_cover_rel_path = u"cover" - file_cover_path = os.path.join(tmp_path, file_cover_rel_path) - if args.cover_path is not None: - if os.path.exists(args.cover_path): - file_cover_rel_path = os.path.basename(args.cover_path) - file_cover_path = os.path.join(tmp_path, file_cover_rel_path) - copy_file(args.cover_path, file_cover_path) - else: - print_error("Unable to read cover file '%s'" % (args.cover_path)) - else: - print_error("No cover image file specified: generating MOBI without cover") - print_error("Use --cover-path to specify a cover image file") - - # TODO split over multiple files? 
- # write .html file - print_debug("Writing .html file...", args.debug) - file_html_rel_path = u"words.html" - file_html_path = os.path.join(tmp_path, file_html_rel_path) - file_html_obj = open(file_html_path, "wb") - file_html_obj.write((HTML_HEADER % (args.title, args.title)).encode("utf-8")) - for index in dictionary.entries_index_sorted: - entry = dictionary.entries[index] - file_html_obj.write((HTML_WORD % (entry.headword, entry.definition)).encode("utf-8")) - file_html_obj.write((HTML_FOOTER).encode("utf-8")) - file_html_obj.close() - print_debug("Writing .html file... done", args.debug) - - # write .opf file - print_debug("Writing .opf file...", args.debug) - file_opf_rel_path = base + u".opf" - file_opf_path = os.path.join(tmp_path, file_opf_rel_path) - file_opf_obj = open(file_opf_path, "wb") - opf_content = OPF_TEMPLATE % ( - args.title, - args.language_from, - args.identifier, - args.author, - args.copyright, - args.language_from, - args.language_to, - file_cover_rel_path + # create groups + special_group, group_keys, group_dict = dictionary.group( + prefix_function_path=args.group_by_prefix_function, + prefix_length=int(args.group_by_prefix_length), + merge_min_size=int(args.group_by_prefix_merge_min_size), + merge_across_first=args.group_by_prefix_merge_across_first ) - file_opf_obj.write((opf_content).encode("utf-8")) - file_opf_obj.close() - print_debug("Writing .opf file... done", args.debug) + all_group_keys = group_keys + if special_group is not None: + all_group_keys += [u"SPECIAL"] + + # create mobi object + mobi = DictionaryEbook(ebook_format=DictionaryEbook.MOBI, args=args) + + # add groups + for key in all_group_keys: + if key == u"SPECIAL": + group_entries = special_group + else: + group_entries = group_dict[key] + mobi.add_group(key, group_entries) + + # create output file + print_debug("Writing to file '%s'..." 
% (output_file_path_absolute), args.debug) + mobi.write(output_file_path_absolute, compress=False) + result = [output_file_path] + print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug) # run kindlegen + tmp_path = mobi.get_tmp_path() if args.mobi_no_kindlegen: print_info("Not running kindlegen, the raw files are located in '%s'" % tmp_path) result = [tmp_path] @@ -158,13 +80,16 @@ def write(dictionary, args, output_file_path): try: print_debug("Creating .mobi file with kindlegen...", args.debug) kindlegen_path = KINDLEGEN + opf_file_path_absolute = os.path.join(tmp_path, "OEBPS", "content.opf") + mobi_file_path_relative = u"content.mobi" + mobi_file_path_absolute = os.path.join(tmp_path, "OEBPS", mobi_file_path_relative) if args.kindlegen_path is None: print_info(" Running '%s' from $PATH" % KINDLEGEN) else: kindlegen_path = args.kindlegen_path print_info(" Running '%s' from '%s'" % (KINDLEGEN, kindlegen_path)) proc = subprocess.Popen( - [kindlegen_path, file_opf_path, "-o", file_mobi_rel_path], + [kindlegen_path, opf_file_path_absolute, "-o", mobi_file_path_relative], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE @@ -173,7 +98,7 @@ def write(dictionary, args, output_file_path): if args.debug: output_unicode = (output[0]).decode("utf-8") print_debug(output_unicode, args.debug) - rename_file(file_html_path, output_file_path) + copy_file(mobi_file_path_absolute, output_file_path_absolute) result = [output_file_path] print_debug("Creating .mobi file with kindlegen... done", args.debug) except OSError as exc: @@ -181,14 +106,14 @@ def write(dictionary, args, output_file_path): print_error(" Please make sure '%s':" % KINDLEGEN) print_error(" 1. is available on your $PATH or") print_error(" 2. 
specify its path with --kindlegen-path") - result = None - # delete tmp directory - if args.keep: - print_info("Not deleting temp dir '%s'" % (tmp_path)) - else: - delete_directory(tmp_path) - print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug) + # delete tmp directory + tmp_path = mobi.get_tmp_path() + if args.keep: + print_info("Not deleting temp dir '%s'" % (tmp_path)) + else: + mobi.delete() + print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug) return result diff --git a/penelope/format_stardict.py b/penelope/format_stardict.py index 4d38adc..1f14ad0 100644 --- a/penelope/format_stardict.py +++ b/penelope/format_stardict.py @@ -24,7 +24,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" @@ -267,6 +267,25 @@ def write(dictionary, args, output_file_path): # result to be returned result = None + # get absolute path + output_file_path_absolute = os.path.abspath(output_file_path) + + # create tmp directory + cwd = os.getcwd() + tmp_path = create_temp_directory() + print_debug("Working in temp dir '%s'" % (tmp_path), args.debug) + os.chdir(tmp_path) + + # get the basename and compute output file paths + base = os.path.basename(output_file_path) + if base.endswith(".zip"): + base = base[:-4] + ifo_file_path = base + ".ifo" + idx_file_path = base + ".idx" + dict_file_path = base + ".dict" + dict_dz_file_path = base + ".dict.dz" + syn_file_path = base + ".syn" + # TODO by spec, the index should be sorted # TODO using the comparator stardict_strcmp() defined in the spec # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ), @@ -283,20 +302,6 @@ def write(dictionary, args, output_file_path): # dictionary.sort(by_headword=True, ignore_case=True) - # create tmp directory - tmp_path = create_temp_directory() - print_debug("Working in temp dir '%s'" % 
(tmp_path), args.debug) - - # get the basename and compute output file paths - base = os.path.basename(output_file_path) - if base.endswith(".zip"): - base = base[:-4] - ifo_file_path = os.path.join(tmp_path, base + ".ifo") - idx_file_path = os.path.join(tmp_path, base + ".idx") - dict_file_path = os.path.join(tmp_path, base + ".dict") - dict_dz_file_path = os.path.join(tmp_path, base + ".dict.dz") - syn_file_path = os.path.join(tmp_path, base + ".syn") - # write .idx and .dict files print_debug("Writing .idx and .dict files...", args.debug) idx_file_obj = open(idx_file_path, "wb") @@ -397,23 +402,21 @@ def write(dictionary, args, output_file_path): ifo_file_obj.close() # create output zip file - cwd = os.getcwd() try: - os.chdir(tmp_path) - print_debug("Writing to file '%s'..." % (output_file_path), args.debug) - file_zip_obj = zipfile.ZipFile(output_file_path, "w", zipfile.ZIP_DEFLATED) + print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug) + file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED) for file_to_compress in files_to_compress: file_to_compress = os.path.basename(file_to_compress) file_zip_obj.write(file_to_compress) print_debug("Written %s" % (file_to_compress), args.debug) file_zip_obj.close() result = [output_file_path] - print_debug("Writing to file '%s'... success" % (output_file_path), args.debug) + print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug) except: - print_error("Writing to file '%s'... failure" % (output_file_path)) - os.chdir(cwd) + print_error("Writing to file '%s'... 
failure" % (output_file_path_absolute)) # delete tmp directory + os.chdir(cwd) if args.keep: print_info("Not deleting temp dir '%s'" % (tmp_path)) else: diff --git a/penelope/format_xml.py b/penelope/format_xml.py index 4c06d43..df5fa78 100644 --- a/penelope/format_xml.py +++ b/penelope/format_xml.py @@ -15,7 +15,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/input_parser_identity.py b/penelope/input_parser_identity.py index 1850eb8..e9aed40 100644 --- a/penelope/input_parser_identity.py +++ b/penelope/input_parser_identity.py @@ -11,7 +11,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/input_parser_webster.py b/penelope/input_parser_webster.py index 0efc527..9e52b45 100644 --- a/penelope/input_parser_webster.py +++ b/penelope/input_parser_webster.py @@ -9,7 +9,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" diff --git a/penelope/prefix_default.py b/penelope/prefix_default.py new file mode 100644 index 0000000..5c60f8f --- /dev/null +++ b/penelope/prefix_default.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +This is the default prefix function for grouping headwords. 
+""" + +__author__ = "Alberto Pettarin" +__copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" +__license__ = "MIT" +__version__ = "3.1.0" +__email__ = "alberto@albertopettarin.it" +__status__ = "Production" + +def get_prefix(headword, length): + """ + Return the prefix for the given headword, + of length length. + + :param headword: the headword string + :type headword: unicode + :param length: prefix length + :type length: int + :rtype: unicode + """ + if headword is None: + return None + lowercased = headword.lower() + if ord(lowercased[0]) < 97: + return u"SPECIAL" + if len(lowercased) < length: + return lowercased + return lowercased[0:length] + + diff --git a/penelope/prefix_kobo.py b/penelope/prefix_kobo.py new file mode 100644 index 0000000..44930ba --- /dev/null +++ b/penelope/prefix_kobo.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +This is the prefix function for grouping headwords for Kobo format. +""" + +__author__ = "Alberto Pettarin" +__copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" +__license__ = "MIT" +__version__ = "3.1.0" +__email__ = "alberto@albertopettarin.it" +__status__ = "Production" + +def get_prefix(headword, length): + """ + Return the prefix for the given headword, + of length length. + + Note that the procedure implemented here is the result + of reverse engineering, since no official specification + has been published by Kobo so far. YMMV. 
+ + :param headword: the headword string + :type headword: unicode + :param length: prefix length + :type length: int + :rtype: unicode + """ + def is_allowed(character): + # all non-ascii (x > 127) are ok + # all ASCII lowercase letters (97 <= x <= 122) are ok + # everything else is not ok + try: + code = ord(character) + return (code > 127) or ((code >= 97) and (code <= 122)) + except: + pass + return True + + # defaults to u"SPECIAL", it will be mapped to u"11...1" later + prefix = u"SPECIAL" + headword = headword.lower() + if len(headword) > 0: + while len(headword) < length: + # for headwords shorter than length, append an 'a' at the end + # e.g. length=3, "xy" => "xya" + headword += u"a" + # TODO maybe the check should be done only for the first character + is_ok = True + for character in headword: + if not is_allowed(character): + is_ok = False + break + if is_ok: + prefix = headword[0:length] + return prefix + + + diff --git a/penelope/utilities.py b/penelope/utilities.py index fcee638..e5d0f57 100644 --- a/penelope/utilities.py +++ b/penelope/utilities.py @@ -7,16 +7,18 @@ from __future__ import absolute_import from __future__ import print_function +from io import open import imp import os import shutil import tempfile import uuid +import zipfile __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = "alberto@albertopettarin.it" __status__ = "Production" @@ -59,7 +61,7 @@ def create_temp_directory(): def copy_file(origin, destination): try: - shutil.copyfile(origin, destination) + shutil.copy(origin, destination) except: pass diff --git a/setup.py b/setup.py index 59f0bee..ff49dca 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ __author__ = "Alberto Pettarin" __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" -__version__ = "3.0.1" +__version__ = "3.1.0" __email__ = 
"alberto@albertopettarin.it" __status__ = "Production" @@ -18,7 +18,7 @@ name="penelope", packages=["penelope"], package_data={"penelope": ["res/*"]}, - version="3.0.1.11", + version="3.1.0.1", description="Penelope is a multi-tool for creating, editing and converting dictionaries, especially for eReader devices", author="Alberto Pettarin", author_email="alberto@albertopettarin.it",