diff --git a/License b/License
index 79a52188f10d38e9a3d1997982a965a319251d4b..608f7dab3f5fa9cbecbe72ccdecd2c7da6b02204 100644
--- a/License
+++ b/License
@@ -1,6 +1,6 @@
 # Simple OPDS - программа для каталогизации электронных книг и организации
 # доступа к ним с использованием протокола OPDS.
-# Copyright (C)2014, Дмитрий Шелепнёв
+# Copyright (C)2017, Дмитрий Шелепнёв
 #
 # Это программа является свободным программным обеспечением. Вы можете
 # распространять и/или модифицировать её согласно условиям Стандартной
diff --git a/book_tools/LICENSE b/book_tools/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d149acb2084667c8946dade6684862adb0a1533e
--- /dev/null
+++ b/book_tools/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 FBReader.ORG Limited
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+ diff --git a/book_tools/__init__.py b/book_tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/book_tools/format/__init__.py b/book_tools/format/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce604ee78da8e44edcca50a32237ddad06c6e6f9 --- /dev/null +++ b/book_tools/format/__init__.py @@ -0,0 +1,111 @@ +#import magic +import os +import zipfile +from xml import sax + +from book_tools.format.mimetype import Mimetype + +from book_tools.format.util import list_zip_file_infos +from book_tools.format.epub import EPub +from book_tools.format.fb2 import FB2, FB2Zip +#from fbreader.format.pdf import PDF +#from fbreader.format.msword import MSWord +from book_tools.format.mobi import Mobipocket +#from fbreader.format.rtf import RTF +#from fbreader.format.djvu import DjVu +#from fbreader.format.dummy import Dummy + +#__detector = magic.open(magic.MAGIC_MIME_TYPE) +#__detector.load() + +class __detector: + @staticmethod + def file(filename): + (n, e) = os.path.splitext(filename) + if e.lower() == '.fb2': + return Mimetype.FB2 + elif e.lower()=='.epub' or e.lower()=='.zip': + return Mimetype.ZIP + elif e.lower()=='.pdf': + return Mimetype.PDF + elif e.lower()=='.doc' or e.lower()=='.docx': + return Mimetype.MSWORD + elif e.lower()=='.djvu': + return Mimetype.DJVU + elif e.lower()=='.txt': + return Mimetype.TEXT + elif e.lower()=='.rtf': + return Mimetype.RTF + else: + return Mimetype.OCTET_STREAM + +def detect_mime(file): + FB2_ROOT = 'FictionBook' + mime = __detector.file(file.name) + + try: + if mime == Mimetype.XML or mime == Mimetype.FB2: + if FB2_ROOT == __xml_root_tag(file): + return Mimetype.FB2 + elif mime == Mimetype.ZIP: + with zipfile.ZipFile(file) as zip_file: + if not zip_file.testzip(): + infolist = list_zip_file_infos(zip_file) + if len(infolist) == 1: + if FB2_ROOT == __xml_root_tag(zip_file.open(infolist[0])): + return Mimetype.FB2_ZIP + try: + with zip_file.open('mimetype') as mimetype_file: + if mimetype_file.read(30).decode().rstrip('\n\r') == Mimetype.EPUB: + return Mimetype.EPUB + except Exception as e: + pass + elif mime == Mimetype.OCTET_STREAM: + mobiflag = file.read(68) + if mobiflag.decode()[60:] == 'BOOKMOBI': + return Mimetype.MOBI + except: + pass + + return mime + +def create_bookfile(file, original_filename): + if isinstance(file, str): + file = open(file, 'rb') + mimetype = detect_mime(file) + file.seek(0,0) + if mimetype == Mimetype.EPUB: + return EPub(file, original_filename) + elif mimetype == Mimetype.FB2: + return FB2(file, original_filename) + elif mimetype == Mimetype.FB2_ZIP: + return FB2Zip(file, original_filename) + elif mimetype == Mimetype.MOBI: + return Mobipocket(file, original_filename) +# elif mimetype == Mimetype.PDF: +# return PDF(path, original_filename) +# elif mimetype == Mimetype.MSWORD: +# return MSWord(path, original_filename) +# elif mimetype == Mimetype.RTF: +# return RTF(path, original_filename) +# elif mimetype == Mimetype.DJVU: +# return DjVu(path, original_filename) +# elif mimetype in [Mimetype.TEXT]: +# return Dummy(path, original_filename, mimetype) + else: + raise Exception('File type \'%s\' is not supported, sorry' % mimetype) + +def __xml_root_tag(file): + class XMLRootFound(Exception): + def __init__(self, name): + self.name = name + + class RootTagFinder(sax.handler.ContentHandler): + def startElement(self, name, attributes): + raise XMLRootFound(name) + + try: + sax.parse(file, RootTagFinder()) + except 
XMLRootFound as e: + return e.name + return None diff --git a/book_tools/format/aes.py b/book_tools/format/aes.py new file mode 100644 index 0000000000000000000000000000000000000000..5699fe143d4d10f9803459e98c6efc5bb4126ce0 --- /dev/null +++ b/book_tools/format/aes.py @@ -0,0 +1,34 @@ +import gzip, os +#from Crypto.Cipher import AES +from tempfile import mktemp + +def encrypt(file_name, key, working_dir): + ''' + file_name: full path to file to encrypt + key: 16 byte string + working_dir: directory to create temorary files + ''' + # tmp_file_name = mktemp(dir=working_dir) + # with open(file_name, 'rb') as istream: + # with gzip.open(tmp_file_name, 'wb') as ostream: + # ostream.writelines(istream) + # + # init_vector = os.urandom(16) + # + # mode = AES.MODE_CBC + # encryptor = AES.new(key, mode, init_vector) + # + # with open(tmp_file_name, 'rb') as istream: + # with open(file_name, 'wb') as ostream: + # ostream.write(init_vector) + # while True: + # data = istream.read(8192) + # if len(data) == 8192: + # ostream.write(encryptor.encrypt(data)) + # else: + # pad = 16 - len(data) % 16 + # ostream.write(encryptor.encrypt(data + pad * chr(pad))) + # break + # + # os.remove(tmp_file_name) + pass diff --git a/book_tools/format/bookfile.py b/book_tools/format/bookfile.py new file mode 100644 index 0000000000000000000000000000000000000000..c60c850d1b0f97239a76e47dc565279079f24dec --- /dev/null +++ b/book_tools/format/bookfile.py @@ -0,0 +1,76 @@ +import os, re +from abc import abstractmethod, ABCMeta + +from book_tools.format.util import minify_cover + +class BookFile(object): + __metaclass__ = ABCMeta + + def __init__(self, file, original_filename, mimetype): + self.file = file + self.mimetype = mimetype + self.original_filename = original_filename + self.title = original_filename + self.description = None + self.authors = [] + self.tags = [] + self.series_info = None + self.language_code = None + self.issues = [] + + def __enter__(self): + return self + + @abstractmethod + def __exit__(self, kind, value, traceback): + pass + + def extract_cover(self, working_dir): + cover, minified = self.extract_cover_internal(working_dir) + if cover and not minified: + minify_cover(os.path.join(working_dir, cover)) + return cover + + def extract_cover_internal(self, working_dir): + return (None, False) + + @staticmethod + def __is_text(text): + return isinstance(text, str) + + def __set_title__(self, title): + if title and BookFile.__is_text(title): + title = title.strip() + if title: + self.title = title + + def __add_author__(self, name, sortkey=None): + if not name or not BookFile.__is_text(name): + return + name = BookFile.__normalise_string__(name) + if not name: + return + if sortkey: + sortkey = sortkey.strip() + if not sortkey: + sortkey = name.split()[-1] + sortkey = BookFile.__normalise_string__(sortkey).lower() + self.authors.append({'name': name, 'sortkey': sortkey}) + + def __add_tag__(self, text): + if text and BookFile.__is_text(text): + text = text.strip() + if text: + self.tags.append(text) + + @staticmethod + def __normalise_string__(text): + if text is None: + return None + return re.sub(r'\s+', ' ', text.strip()) + + def get_encryption_info(self): + return {} + + def repair(self, working_dir): + pass diff --git a/book_tools/format/epub.py b/book_tools/format/epub.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd84ff600a95542e8182dc55b6585e4e8c97a33 --- /dev/null +++ b/book_tools/format/epub.py @@ -0,0 +1,404 @@ +import os, shutil, urllib, zipfile +from lxml import 
etree +from tempfile import mktemp + +from book_tools.format.aes import encrypt +from book_tools.format.bookfile import BookFile +from book_tools.format.mimetype import Mimetype +from book_tools.format.util import list_zip_file_infos + +class EPub(BookFile): + class Issue(object): + FIRST_ITEM_NOT_MIMETYPE = 1 + MIMETYPE_ITEM_IS_DEFLATED = 2 + + class Namespace(object): + XHTML = 'http://www.w3.org/1999/xhtml' + CONTAINER = 'urn:oasis:names:tc:opendocument:xmlns:container' + OPF = 'http://www.idpf.org/2007/opf' + DUBLIN_CORE = 'http://purl.org/dc/elements/1.1/' + ENCRYPTION = 'http://www.w3.org/2001/04/xmlenc#' + DIGITAL_SIGNATURE = 'http://www.w3.org/2000/09/xmldsig#' + MARLIN = 'http://marlin-drm.com/epub' + CALIBRE = 'http://calibre.kovidgoyal.net/2009/metadata' + + class Entry(object): + MIMETYPE = 'mimetype' + MANIFEST = 'META-INF/manifest.xml' + METADATA = 'META-INF/metadata.xml' + CONTAINER = 'META-INF/container.xml' + ENCRYPTION = 'META-INF/encryption.xml' + RIGHTS = 'META-INF/rights.xml' + SIGNATURES = 'META-INF/signatures.xml' + + TOKEN_URL = 'https://books.fbreader.org/drm/marlin/get-token' + CONTENT_ID_PREFIX = 'urn:marlin:organization:fbreader.org:0001:' + + ALGORITHM_EMBEDDING = 'http://www.idpf.org/2008/embedding' + ALGORITHM_AES128 = Namespace.ENCRYPTION + 'aes128-cbc' + + class StructureException(Exception): + def __init__(self, message): + Exception.__init__(self, 'ePub verification failed: ' + message) + + def __init__(self, file, original_filename): + BookFile.__init__(self, file, original_filename, Mimetype.EPUB) + self.root_filename = None + self.cover_fileinfos = [] + + self.__zip_file = None + self.__initialize() + + def __initialize(self): + self.__zip_file = zipfile.ZipFile(self.file) + self.issues = [] + try: + if self.__zip_file.testzip(): + raise EPub.StructureException('broken zip archive') + + infos = self.__zip_file.infolist() + if len(infos) == 0: + raise EPub.StructureException('empty zip archive') + + mimetype_info = infos[0] + if mimetype_info.filename != EPub.Entry.MIMETYPE: + self.issues.append(EPub.Issue.FIRST_ITEM_NOT_MIMETYPE) + elif mimetype_info.compress_type != zipfile.ZIP_STORED: + self.issues.append(EPub.Issue.MIMETYPE_ITEM_IS_DEFLATED) + + with self.__zip_file.open(EPub.Entry.MIMETYPE) as mimetype_file: + if mimetype_file.read(30).decode().rstrip('\n\r') != Mimetype.EPUB: + raise EPub.StructureException('\'mimetype\' item content is incorrect') + + self.__extract_metainfo() + except EPub.StructureException as error: + self.close() + raise error + except Exception as error: + self.close() + raise EPub.StructureException(error.message) + + def close(self): + self.__zip_file.close() + + def __exit__(self, kind, value, traceback): + self.__zip_file.__exit__(kind, value, traceback) + + def __etree_from_entry(self, info): + with self.__zip_file.open(info) as entry: + try: + return etree.fromstring(entry.read(1048576)) + except: + raise EPub.StructureException('\'' + info.filename + '\' is not a valid XML') + + def __extract_metainfo(self): + root_info = self.__get_root_info() + self.root_filename = root_info.filename + tree = self.__etree_from_entry(root_info) + namespaces = {'opf': EPub.Namespace.OPF, 'dc': EPub.Namespace.DUBLIN_CORE} + + res = tree.xpath('/opf:package/opf:metadata/dc:title', namespaces=namespaces) + if len(res) > 0: + self.__set_title__(res[0].text) + + res = tree.xpath('/opf:package/opf:metadata/dc:creator[@role="aut"]', namespaces=namespaces) + if len(res) == 0: + res = tree.xpath('/opf:package/opf:metadata/dc:creator', 
namespaces=namespaces) + for node in res: + self.__add_author__(node.text) + + res = tree.xpath('/opf:package/opf:metadata/dc:language', namespaces=namespaces) + if len(res) > 0 and res[0].text: + self.language_code = res[0].text.strip() + + res = tree.xpath('/opf:package/opf:metadata/dc:subject', namespaces=namespaces) + for node in res: + self.__add_tag__(node.text) + + res = tree.xpath('/opf:package/opf:metadata/opf:meta[@name="calibre:series"]', namespaces=namespaces) + if len(res) > 0: + series = BookFile.__normalise_string__(res[0].get('content')) + if series: + res = tree.xpath('/opf:package/opf:metadata/opf:meta[@name="calibre:series_index"]', namespaces=namespaces) + index = BookFile.__normalise_string__(res[0].get('content')) if len(res) > 0 else None + self.series_info = { + 'title': series, + 'index': index or None + } + + res = tree.xpath('/opf:package/opf:metadata/dc:description', namespaces=namespaces) + if len(res) > 0 and res[0].text: + self.description = res[0].text.strip() + + prefix = os.path.dirname(root_info.filename) + if prefix: + prefix += '/' + self.cover_fileinfos = self.__find_cover(tree, prefix) + + def __find_cover(self, tree, prefix): + namespaces = {'opf': EPub.Namespace.OPF, 'dc': EPub.Namespace.DUBLIN_CORE} + + def xpath(query): + return tree.xpath(query, namespaces=namespaces)[0] + + def item_for_href(ref): + return xpath('/opf:package/opf:manifest/opf:item[@href="%s"]' % ref) + + def image_infos(node): + path = os.path.normpath(prefix + node.get('href')) + try: + fileinfo = self.__zip_file.getinfo(path) + except: + fileinfo = self.__zip_file.getinfo(urllib.unquote(path)) + mime = node.get('media-type') + info = { + 'filename': fileinfo.filename, + 'mime': mime + } + if mime.startswith('image/'): + return [info] + elif mime == 'application/xhtml+xml': + xhtml = self.__etree_from_entry(fileinfo) + xhtml_prefix = os.path.dirname(fileinfo.filename) + '/' + img = xhtml.xpath('//xhtml:img[@src]', namespaces={'xhtml': EPub.Namespace.XHTML})[0] + return [info, + { + 'filename': os.path.normpath(xhtml_prefix + img.get('src')), + # TODO: detect mimetype + 'mime': 'image/auto' + } + ] + else: + raise Exception('unknown mimetype %s' % mime) + + try: + return image_infos(xpath('/opf:package/opf:manifest/opf:item[@properties="cover-image"]')) + except: + pass + + try: + node = xpath('/opf:package/opf:metadata/opf:meta[@name="cover"]') + return image_infos(xpath('/opf:package/opf:manifest/opf:item[@id="%s"]' % node.get('content'))) + except: + pass + + try: + node = xpath('/opf:package/opf:metadata/meta[@name="cover"]') + return image_infos(xpath('/opf:package/opf:manifest/opf:item[@id="%s"]' % node.get('content'))) + except: + pass + + try: + node = xpath('/package/metadata/meta[@name="cover"]') + return image_infos(xpath('/package/manifest/item[@id="%s"]' % node.get('content'))) + except: + pass + + try: + node = xpath('/opf:package/opf:guide/opf:reference[@type="other.ms-coverimage-standard"][@title="Cover"]') + return image_infos(item_for_href(node.get('href'))) + except: + pass + + try: + node = xpath('/opf:package/opf:guide/opf:reference[@type="other.ms-coverimage-standard"]') + return image_infos(item_for_href(node.get('href'))) + except: + pass + + try: + return image_infos(xpath('/opf:package/opf:manifest/opf:item[@id="cover"]')) + except: + pass + + return [] + + def __get_root_info(self): + try: + container_info = self.__zip_file.getinfo(EPub.Entry.CONTAINER) + except: + container_info = None + if container_info: + tree = 
self.__etree_from_entry(container_info) + root_file = None + namespaces = {'cont': EPub.Namespace.CONTAINER} + res = tree.xpath('/cont:container/cont:rootfiles/cont:rootfile', namespaces=namespaces) + if len(res) == 1 and res[0].get('media-type') == 'application/oebps-package+xml': + root_file = res[0].get('full-path') + if root_file: + return self.__zip_file.getinfo(root_file) + else: + opf_infos = [i for i in self.__zip_file.infolist() if i.filename.endswith('.opf')] + if len(opf_infos) > 1: + raise EPub.StructureException('several OPF files in the archive') + elif len(opf_infos) == 1: + return opf_infos[0] + + raise EPub.StructureException('OPF entry not found') + + def __contains_entry(self, name): + try: + self.__zip_file.getinfo(name) + return True + except KeyError: + return False + + def __extract_content_ids(self): + content_ids = set() + try: + tree = self.__etree_from_entry(EPub.Entry.ENCRYPTION) + ns = { + 'c': EPub.Namespace.CONTAINER, + 'e': EPub.Namespace.ENCRYPTION, + 'd': EPub.Namespace.DIGITAL_SIGNATURE + } + res = tree.xpath('/c:encryption/e:EncryptedData/d:KeyInfo/d:KeyName', namespaces=ns) + for node in res: + key_name = res[0].text + if key_name and key_name.startswith(EPub.CONTENT_ID_PREFIX): + content_ids.add(key_name[len(EPub.CONTENT_ID_PREFIX):]) + except: + pass + return list(content_ids) + + def get_encryption_info(self): + UNKNOWN_ENCRYPTION = { 'method': 'unknown' } + + algo = None + + if self.__contains_entry(EPub.Entry.ENCRYPTION): + try: + tree = self.__etree_from_entry(EPub.Entry.ENCRYPTION) + namespaces = {'c': EPub.Namespace.CONTAINER, 'e':EPub.Namespace.ENCRYPTION} + res = tree.xpath('/c:encryption/e:EncryptedData/e:EncryptionMethod', namespaces=namespaces) + algorithms = list(set([r.get('Algorithm') for r in res])) + if len(algorithms) != 1: + return {'method': 'multi', 'ids': algorithms} + if algorithms[0] == EPub.ALGORITHM_EMBEDDING: + return {'method': 'embedding'} + elif algorithms[0] == EPub.ALGORITHM_AES128: + algo = algorithms[0] + else: + return UNKNOWN_ENCRYPTION + except: + return UNKNOWN_ENCRYPTION + + if self.__contains_entry(EPub.Entry.RIGHTS): + if algo == EPub.ALGORITHM_AES128: + try: + tree = self.__etree_from_entry(EPub.Entry.RIGHTS) + namespaces = {'m': EPub.Namespace.MARLIN} + res = tree.xpath('/m:Marlin/m:RightsURL/m:RightsIssuer/m:URL', namespaces=namespaces) + if res: + token_url = res[0].text + content_ids = self.__extract_content_ids() if token_url == EPub.TOKEN_URL else [] + return { + 'method': 'marlin', + 'token_url': token_url, + 'content_ids': content_ids + } + except: + pass + return UNKNOWN_ENCRYPTION + + if self.__contains_entry(EPub.Entry.SIGNATURES): + return UNKNOWN_ENCRYPTION + + return {} + + def __save_tree(self, zip_file, filename, tree, working_dir): + path = os.path.join(working_dir, filename) + with open(path, 'w') as pfile: + tree.write(pfile, pretty_print=True) + zip_file.write(path, arcname=filename) + + def __add_encryption_section(self, index, root, uri, content_id): + # See http://www.marlin-community.com/files/marlin-EPUB-extension-v1.0.pdf + # section 4.2.1 + key_name = EPub.CONTENT_ID_PREFIX + content_id + + enc_data = etree.SubElement(root, etree.QName(EPub.Namespace.ENCRYPTION, 'EncryptedData'), Id='ED%d' % index) + etree.SubElement(enc_data, etree.QName(EPub.Namespace.ENCRYPTION, 'EncryptionMethod'), Algorithm=EPub.Namespace.ENCRYPTION + 'aes128-cbc') + key_info = etree.SubElement(enc_data, etree.QName(EPub.Namespace.DIGITAL_SIGNATURE, 'KeyInfo')) + key_name_tag = etree.SubElement(key_info, 
etree.QName(EPub.Namespace.DIGITAL_SIGNATURE, 'KeyName')) + key_name_tag.text = key_name + cipher_data = etree.SubElement(enc_data, etree.QName(EPub.Namespace.ENCRYPTION, 'CipherData')) + etree.SubElement(cipher_data, etree.QName(EPub.Namespace.ENCRYPTION, 'CipherReference'), URI=uri) + + def __create_encryption_file(self, zip_file, working_dir, encrypted_files, content_id): + namespaces = { + None : EPub.Namespace.CONTAINER, + 'enc' : EPub.Namespace.ENCRYPTION, + 'ds' : EPub.Namespace.DIGITAL_SIGNATURE + } + root = etree.Element(etree.QName(EPub.Namespace.CONTAINER, 'encryption'), nsmap=namespaces) + tree = etree.ElementTree(root) + + index = 1 + for filename in encrypted_files: + self.__add_encryption_section(index, root, filename, content_id) + index += 1 + + self.__save_tree(zip_file, EPub.Entry.ENCRYPTION, tree, working_dir) + + def __create_rights_file(self, zip_file, working_dir): + namespaces = {None: EPub.Namespace.MARLIN} + root = etree.Element(etree.QName(EPub.Namespace.MARLIN, 'Marlin'), nsmap=namespaces) + tree = etree.ElementTree(root) + etree.SubElement(root, etree.QName(EPub.Namespace.MARLIN, 'Version')).text = '1.0' + rights_url = etree.SubElement(root, etree.QName(EPub.Namespace.MARLIN, 'RightsURL')) + rights_issuer = etree.SubElement(rights_url, etree.QName(EPub.Namespace.MARLIN, 'RightsIssuer')) + etree.SubElement(rights_issuer, etree.QName(EPub.Namespace.MARLIN, 'URL')).text = EPub.TOKEN_URL + self.__save_tree(zip_file, EPub.Entry.RIGHTS, tree, working_dir) + + def encrypt(self, key, content_id, working_dir, files_to_keep=None): + if self.get_encryption_info(): + raise Exception('Cannot encrypt file %s, it is already encrypted' % self.file.name) + + if not files_to_keep: + files_to_keep = [EPub.Entry.MANIFEST, EPub.Entry.METADATA, EPub.Entry.CONTAINER] + files_to_keep += [self.root_filename] + files_to_keep += [info['filename'] for info in self.cover_fileinfos] + + self.__zip_file.extractall(path=working_dir) + + new_epub = mktemp(dir=working_dir) + with zipfile.ZipFile(new_epub, 'w', zipfile.ZIP_DEFLATED) as zip_file: + zip_file.writestr(EPub.Entry.MIMETYPE, Mimetype.EPUB, zipfile.ZIP_STORED) + encrypted_files = [] + for entry in [info.filename for info in list_zip_file_infos(self.__zip_file) if info.filename != EPub.Entry.MIMETYPE]: + path = os.path.join(working_dir, entry) + if entry in files_to_keep: + zip_file.write(path, arcname=entry) + else: + encrypt(os.path.join(working_dir, entry), key, working_dir) + encrypted_files.append(entry) + zip_file.write(path, arcname=entry, compress_type=zipfile.ZIP_STORED) + self.__create_encryption_file(zip_file, working_dir, encrypted_files, content_id) + self.__create_rights_file(zip_file, working_dir) + shutil.move(new_epub, self.path) + self.close() + self.__initialize() + + def repair(self, working_dir): + self.__zip_file.extractall(path=working_dir) + + new_epub = mktemp(dir=working_dir) + with zipfile.ZipFile(new_epub, 'w', zipfile.ZIP_DEFLATED) as zip_file: + zip_file.writestr(EPub.Entry.MIMETYPE, Mimetype.EPUB, zipfile.ZIP_STORED) + for entry in [info.filename for info in list_zip_file_infos(self.__zip_file) if info.filename != EPub.Entry.MIMETYPE]: + zip_file.write(os.path.join(working_dir, entry), arcname=entry) + shutil.move(new_epub, self.path) + self.close() + self.__initialize() + + def extract_cover_internal(self, working_dir): + if len(self.cover_fileinfos) == 0: + return (None, False) + name = self.cover_fileinfos[-1]['filename'] + self.__zip_file.extract(name, path=working_dir) + split = [part for part in 
name.split('/') if part] + if len(split) > 1: + shutil.move(os.path.join(working_dir, name), os.path.join(working_dir, split[-1])) + shutil.rmtree(os.path.join(working_dir, split[0])) + return (split[-1] if len(split) > 0 else None, False) diff --git a/book_tools/format/fb2.py b/book_tools/format/fb2.py new file mode 100644 index 0000000000000000000000000000000000000000..53d03375d3bd522c1ec873eaf89b81bba1b502d6 --- /dev/null +++ b/book_tools/format/fb2.py @@ -0,0 +1,168 @@ +import base64, os, traceback, zipfile +from lxml import etree +from abc import abstractmethod + +from book_tools.format.bookfile import BookFile +from book_tools.format.mimetype import Mimetype +from book_tools.format.util import list_zip_file_infos + +class FB2StructureException(Exception): + def __init__(self, error): + Exception.__init__(self, 'fb2 verification failed: %s' % error) + if isinstance(error, Exception): + print(traceback.print_exc()) + +class Namespace(object): + FICTION_BOOK = 'http://www.gribuser.ru/xml/fictionbook/2.0' + XLINK = 'http://www.w3.org/1999/xlink' + +class FB2Base(BookFile): + def __init__(self, file, original_filename, mimetype): + BookFile.__init__(self, file, original_filename, mimetype) + self.__namespaces = {'fb': Namespace.FICTION_BOOK, 'xlink': Namespace.XLINK} + try: + tree = self.__create_tree__() + self.__detect_title(tree) + self.__detect_authors(tree) + self.__detect_tags(tree) + self.__detect_series_info(tree) + self.__detect_language(tree) + description = self.__detect_description(tree) + if description: + self.description = description.strip() + except FB2StructureException as error: + raise error + except Exception as error: + raise FB2StructureException(error) + + @abstractmethod + def __create_tree__(self): + return None + + def extract_cover_internal(self, working_dir): + try: + tree = self.__create_tree__() + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:coverpage/fb:image', namespaces=self.__namespaces) + cover_id = res[0].get('{' + Namespace.XLINK + '}href')[1:] + res = tree.xpath('//fb:binary[@id="%s"]' % cover_id, namespaces=self.__namespaces) + content = base64.b64decode(res[0].text) + with open(os.path.join(working_dir, 'cover.jpeg'), 'wb') as cover_file: + cover_file.write(content) + return ('cover.jpeg', False) + except: + return (None, False) + + def __detect_title(self, tree): + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:book-title', namespaces=self.__namespaces) + if len(res) == 0: + res = tree.xpath('/FictionBook/description/title-info/book-title') + if len(res) > 0: + self.__set_title__(res[0].text) + + return None + + def __detect_authors(self, tree): + use_namespaces = True + + def subnode_text(node, name): + if use_namespaces: + subnode = node.find('fb:' + name, namespaces=self.__namespaces) + else: + subnode = node.find(name) + text = subnode.text if subnode is not None else '' + return text or '' + + def add_author_from_node(node): + first_name = subnode_text(node, 'first-name') + middle_name = subnode_text(node, 'middle-name') + last_name = subnode_text(node, 'last-name') + self.__add_author__(' '.join([first_name, middle_name, last_name]), last_name) + + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:author', namespaces=self.__namespaces) + if len(res) == 0: + use_namespaces = False + res = tree.xpath('/FictionBook/description/title-info/author') + + for node in res: + add_author_from_node(node) + + def __detect_language(self, tree): + res = 
tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:lang', namespaces=self.__namespaces) + if len(res) == 0: + use_namespaces = False + res = tree.xpath('/FictionBook/description/title-info/lang') + if len(res) > 0: + self.language_code = res[0].text + + def __detect_tags(self, tree): + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:genre', namespaces=self.__namespaces) + if len(res) == 0: + use_namespaces = False + res = tree.xpath('/FictionBook/description/title-info/genre') + for node in res: + self.__add_tag__(node.text) + + def __detect_series_info(self, tree): + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:sequence', namespaces=self.__namespaces) + if len(res) == 0: + use_namespaces = False + res = tree.xpath('/FictionBook/description/title-info/sequence') + if len(res) > 0: + title = BookFile.__normalise_string__(res[0].get('name')) + index = BookFile.__normalise_string__(res[0].get('number')) + if title: + self.series_info = { + 'title': title, + 'index': index or None + } + + def __detect_description(self, tree): + res = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:annotation', namespaces=self.__namespaces) + if len(res) == 0: + res = tree.xpath('/FictionBook/description/title-info/annotation') + if len(res) > 0: + return etree.tostring(res[0], encoding='utf-8', method='text') + + return None + +class FB2(FB2Base): + def __init__(self, file, original_filename): + FB2Base.__init__(self, file, original_filename, Mimetype.FB2) + + def __create_tree__(self): + try: + return etree.parse(self.file) + except: + raise FB2StructureException('the file is not a valid XML') + + def __exit__(self, kind, value, traceback): + pass + +class FB2Zip(FB2Base): + def __init__(self, file, original_filename): + self.__zip_file = zipfile.ZipFile(file) + try: + if self.__zip_file.testzip(): + raise FB2StructureException('broken zip archive') + self.__infos = list_zip_file_infos(self.__zip_file) + if len(self.__infos) != 1: + raise FB2StructureException('archive contains %s files' % len(self.__infos)) + except FB2StructureException as error: + self.__zip_file.close() + raise error + except Exception as error: + self.__zip_file.close() + raise FB2StructureException(error) + + FB2Base.__init__(self, file, original_filename, Mimetype.FB2_ZIP) + + def __create_tree__(self): + with self.__zip_file.open(self.__infos[0]) as entry: + try: + return etree.fromstring(entry.read(50 * 1024 * 1024)) + except: + raise FB2StructureException('\'%s\' is not a valid XML' % self.__infos[0].filename) + + def __exit__(self, kind, value, traceback): + self.__zip_file.__exit__(kind, value, traceback) + pass diff --git a/book_tools/format/mimetype.py b/book_tools/format/mimetype.py new file mode 100644 index 0000000000000000000000000000000000000000..d296b24eaeccbb9d310b9cdbd26d8de59e8e4417 --- /dev/null +++ b/book_tools/format/mimetype.py @@ -0,0 +1,14 @@ +class Mimetype: + OCTET_STREAM = 'application/octet-stream' + XML = 'application/xml' + ZIP = 'application/zip' + + EPUB = 'application/epub+zip' + FB2 = 'application/fb2+xml' + FB2_ZIP = 'application/fb2+zip' + PDF = 'application/pdf' + MSWORD = 'application/msword' + MOBI = 'application/x-mobipocket-ebook' + DJVU = 'image/vnd.djvu' + TEXT = 'text/plain' + RTF = 'text/rtf' diff --git a/book_tools/format/mobi.py b/book_tools/format/mobi.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e8c2c89a9973c24801d1d35a85216a364db545 --- /dev/null +++ b/book_tools/format/mobi.py @@ -0,0 +1,37 @@ 
+import os, shutil +from tempfile import mkdtemp + +from book_tools.pymobi.mobi import BookMobi + +from book_tools.format.bookfile import BookFile +from book_tools.format.mimetype import Mimetype + +class Mobipocket(BookFile): + def __init__(self, file, original_filename): + BookFile.__init__(self, file, original_filename, Mimetype.MOBI) + bm = BookMobi(file) + self._encryption_method = bm['encryption'] + self.__set_title__(bm['title']) + self.__add_author__(bm['author']) + if bm['subject']: + for tag in bm['subject']: + self.__add_tag__(tag) + self.description = bm['description'] + + def __exit__(self, kind, value, traceback): + pass + + def get_encryption_info(self): + return {'method': self._encryption_method} if self._encryption_method != 'no encryption' else {} + + def extract_cover_internal(self, working_dir): + tmp_dir = mkdtemp(dir=working_dir) + BookMobi(self.file).unpackMobi(tmp_dir + '/bookmobi') + try: + if os.path.isfile(tmp_dir + '/bookmobi_cover.jpg'): + shutil.copy(tmp_dir + '/bookmobi_cover.jpg', working_dir) + return ('bookmobi_cover.jpg', False) + else: + return (None, False) + finally: + shutil.rmtree(tmp_dir) diff --git a/book_tools/format/util.py b/book_tools/format/util.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3656df90331c632041503929490bf404f71e44 --- /dev/null +++ b/book_tools/format/util.py @@ -0,0 +1,24 @@ +#import PythonMagick +from PIL import Image, ImageFile + +def list_zip_file_infos(zipfile): + return [info for info in zipfile.infolist() if not info.filename.endswith('/')] + +def minify_cover(path): + # try: + # try: + # image = Image.open(path).convert('RGB') + # except: + # magick_image = PythonMagick.Image(path + '[0]') + # magick_image.write(path) + # image = Image.open(path).convert('RGB') + # width = image.size[0] + # if width > 600: + # new_width = 500 + # new_height = int(float(new_width) * image.size[1] / width) + # image.thumbnail((new_width, new_height), Image.ANTIALIAS) + # ImageFile.MAXBLOCK = image.size[0] * image.size[1] + # image.save(path, 'JPEG', optimize=True, progressive=True) + # except: + # pass + pass diff --git a/book_tools/pymobi/__init__.py b/book_tools/pymobi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66e7169e529bd92f0a5db2a7edcfd90605a12411 --- /dev/null +++ b/book_tools/pymobi/__init__.py @@ -0,0 +1,9 @@ + +__author__ = 'Yugang LIU' +__email__ = 'liuyug@gmail.com' +__version__ = '0.1.3' +__license__ = 'GPLv3' + + +from book_tools.pymobi.mobi import BookMobi +from book_tools.pymobi.util import * diff --git a/book_tools/pymobi/compression.py b/book_tools/pymobi/compression.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c0b2cab37fa90e96f817d391eea5979e551436 --- /dev/null +++ b/book_tools/pymobi/compression.py @@ -0,0 +1,146 @@ +import struct + + +class Uncompression(object): + def pack(self, data): + return data + + def unpack(self, data): + return data + + +class Palmdoc(object): + def pack(self, i): + raise ValueError('not implement') + + def unpack(self, i): + o, p = '', 0 + while p < len(i): + c = ord(i[p]) + p += 1 + if (c >= 1 and c <= 8): + o += i[p:p + c] + p += c + elif (c < 128): + o += chr(c) + elif (c >= 192): + o += ' ' + chr(c ^ 128) + else: + if p < len(i): + c = (c << 8) | ord(i[p]) + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n - m] + else: + for z in range(n): + o += o[-m] + return o + + def unpack3(self, i): + o, p = b'', 0 + while p < len(i): + c = i[p] + p += 1 + if (c >= 1 and c <= 
8): + o += i[p:p + c] + p += c + elif (c < 128): + o += c.to_bytes(1, 'big') + elif (c >= 192): + o += b' ' + (c ^ 128).to_bytes(1, 'big') + else: + if p < len(i): + c = (c << 8) | i[p] + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n - m] + else: + for z in range(n): + o += o[-m].to_bytes(1, 'big') + return o + + +class Huffcdic(object): + q = struct.Struct('>Q').unpack_from + + def loadHuff(self, huff): + if huff[0:8] != 'HUFF\x00\x00\x00\x18': + raise ValueError('invalid huff header') + off1, off2 = struct.unpack_from('>LL', huff, 8) + + def dict1_unpack(v): + codelen, term, maxcode = v & 0x1f, v & 0x80, v >> 8 + assert codelen != 0 + if codelen <= 8: + assert term + maxcode = ((maxcode + 1) << (32 - codelen)) - 1 + return (codelen, term, maxcode) + + self.dict1 = map(dict1_unpack, struct.unpack_from('>256L', huff, off1)) + + dict2 = struct.unpack_from('>64L', huff, off2) + self.mincode, self.maxcode = (), () + for codelen, mincode in enumerate((0,) + dict2[0::2]): + self.mincode += (mincode << (32 - codelen), ) + for codelen, maxcode in enumerate((0,) + dict2[1::2]): + self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) + + self.dictionary = [] + + def loadCdic(self, cdic): + if cdic[0:8] != 'CDIC\x00\x00\x00\x10': + raise ValueError('invalid cdic header') + phrases, bits = struct.unpack_from('>LL', cdic, 8) + n = min(1 << bits, phrases - len(self.dictionary)) + h = struct.Struct('>H').unpack_from + + def getslice(off): + blen, = h(cdic, 16 + off) + slice = cdic[18 + off:18 + off + (blen & 0x7fff)] + return (slice, blen & 0x8000) + + self.dictionary += map(getslice, struct.unpack_from('>%dH' % n, cdic, 16)) + + def pack(self, i): + raise ValueError('not implement') + + def unpack(self, data): + q = Huffcdic.q + + bitsleft = len(data) * 8 + data += "\x00\x00\x00\x00\x00\x00\x00\x00" + pos = 0 + x, = q(data, pos) + n = 32 + + s = '' + while True: + if n <= 0: + pos += 4 + x, = q(data, pos) + n += 32 + code = (x >> n) & ((1 << 32) - 1) + + codelen, term, maxcode = self.dict1[code >> 24] + if not term: + while code < self.mincode[codelen]: + codelen += 1 + maxcode = self.maxcode[codelen] + + n -= codelen + bitsleft -= codelen + if bitsleft < 0: + break + + r = (maxcode - code) >> (32 - codelen) + slice, flag = self.dictionary[r] + if not flag: + self.dictionary[r] = None + slice = self.unpack(slice) + self.dictionary[r] = (slice, 1) + s += slice + return s diff --git a/book_tools/pymobi/mobi.py b/book_tools/pymobi/mobi.py new file mode 100644 index 0000000000000000000000000000000000000000..c91cc23ebc540d31d8d42d66ab54d77f58afcf78 --- /dev/null +++ b/book_tools/pymobi/mobi.py @@ -0,0 +1,628 @@ + +import os.path +import datetime +import struct +import re +import array +import sys +try: + from collections import OrderedDict +except: + from ordereddict import OrderedDict + +from book_tools.pymobi.util import hexdump, decodeVarint, toStr, toByte +from book_tools.pymobi import compression + +DEBUG = False + +#Reader Type Code +pd_file_code = { + 'Adobe Reader': '.pdfADBE', + 'PalmDOC': 'TEXtREAd', + 'BDicty': 'BVokBDIC', + 'DB (Database program)': 'DB99DBOS', + 'eReader 1': 'PNRdPPrs', + 'eReader 2': 'DataPPrs', + 'FireViewer (ImageViewer)': 'vIMGView', + 'HanDBase': 'PmDBPmDB', + 'InfoView': 'InfoINDB', + 'iSilo': 'ToGoToGo', + 'iSilo 3': 'SDocSilX', + 'JFile': 'JbDbJBas', + 'JFile Pro': 'JfDbJFil', + 'LIST': 'DATALSdb', + 'MobileDB': 'Mdb1Mdb1', + 'MobiPocket': 'BOOKMOBI', + 'Plucker': 'DataPlkr', + 'QuickSheet': 'DataSprd', + 'SuperMemo': 'SM01SMem', + 
'TealDoc': 'TEXtTlDc', + 'TealInfo': 'InfoTlIf', + 'TealMeal': 'DataTlMl', + 'TealPaint': 'DataTlPt', + 'ThinkDB': 'dataTDBP', + 'Tides': 'TdatTide', + 'TomeRaider': 'ToRaTRPW', + 'Weasel': 'zTXTGPlm', + 'WordSmith': 'BDOCWrdS', +} +compression_type = { + 1: ('no compression', compression.Uncompression), + 2: ('PalmDOC compression', compression.Palmdoc), + 17480: ('HUFF/CDIC compression', compression.Huffcdic), +} +encryption_type = { + 0: 'no encryption', + 1: 'Old Mobipocket Encryption', + 2: 'Mobipocket Encryption', +} +mobi_type = { + 2: 'Mobipocket Book', + 3: 'PalmDoc Book', + 4: 'Audio', + 232: 'mobipocket? generated by kindlegen1.2', + 248: 'KF8: generated by kindlegen2', + 257: 'News', + 258: 'News_Feed', + 259: 'News_Magazine', + 513: 'PICS', + 514: 'WORD', + 515: 'XLS', + 516: 'PPT', + 517: 'TEXT', + 518: 'HTML', +} +encoding_type = { + 1252: 'CP1252 (WinLatin1)', + 65001: 'UTF-8', +} +mobi_exth_type = { + 1: 'drm_server_id', + 2: 'drm_commerce_id', + 3: 'drm_ebookbase_book_id', + 100: 'author <dc:Creator>', + 101: 'publisher <dc:Publisher>', + 102: 'imprint <Imprint>', + 103: 'description <dc:Description>', + 104: 'isbn <dc:Identifier scheme="ISBN">', + 105: 'subject Could appear multiple times <dc:Subject>', + 106: 'publishingdate <dc:Date>', + 107: 'review <Review>', + 108: 'contributor <dc:Contributor>', + 109: 'rights <dc:Rights>', + 110: 'subjectcode <dc:Subject BASICCode="subjectcode">', + 111: 'type <dc:Type>', + 112: 'source <dc:Source>', + 113: 'asin Kindle Paperwhite labels books with "Personal" if they dont have this record.', + 114: 'versionnumber', + 115: 'sample 0x0001 if the book content is only a sample of the full book', + 116: 'startreading Position (4-byte offset) in file at which to open when first opened', + 117: 'adult Mobipocket Creator adds this if Adult only is checked on its GUI; contents: "yes" <Adult>', + 118: 'retail price As text, e.g. "4.99" <SRP>', + 119: 'retail price currency As text, e.g. "USD" <SRP Currency="currency">', + 121: 'KF8 BOUNDARY Offset', + 125: 'count of resources', + 129: 'KF8 cover URI', + 131: 'Unknown', + 200: 'Dictionary short name As text <DictionaryVeryShortName>', + 201: 'coveroffset Add to first image field in Mobi Header to find PDB record containing the cover image <EmbeddedCover>', + 202: 'thumboffset Add to first image field in Mobi Header to find PDB record containing the thumbnail cover image', + 203: 'hasfakecover', + 204: 'Creator Software Known Values: 1=mobigen, 2=Mobipocket Creator, 200=kindlegen (Windows), 201=kindlegen (Linux), 202=kindlegen (Mac).', + 205: 'Creator Major Version', + 206: 'Creator Minor Version', + 207: 'Creator Build Number', + 208: 'watermark', + 209: 'tamper proof keys Used by the Kindle (and Android app) for generating book-specific PIDs.', + 300: 'fontsignature', + 401: 'clippinglimit Integer percentage of the text allowed to be clipped. 
Usually 10.', + 402: 'publisherlimit', + 403: 'Unknown', + 404: 'ttsflag 1 - Text to Speech disabled; 0 - Text to Speech enabled', + 405: 'Unknown', + 406: 'Unknown', + 407: 'Unknown', + 450: 'Unknown', + 451: 'Unknown', + 452: 'Unknown', + 453: 'Unknown', + 501: 'cdetype PDOC - Personal Doc; EBOK - ebook; EBSP - ebook sample', + 502: 'lastupdatetime', + 503: 'updatedtitle', + 504: 'asin I found a copy of ASIN in this record.', + 508: 'other title', + 517: 'other author', + 522: 'other publisher', + 524: 'language <dc:language>', + 525: 'alignment I found horizontal-lr in this record.', + 529: 'kindlegen version', + 535: 'Creator Build Number', +} + + +class BookMobi(object): + """ + Mobi format: + `Palm Database Format`_ + + `MOBI Format`_ + + .. _`Palm Database Format`: http://wiki.mobileread.com/wiki/PDB#Palm_Database_Format + .. _`MOBI Format`: http://wiki.mobileread.com/wiki/MOBI + + mobi record order + ----------------- + 1. first content. normal is 1. + 2. text record + #. first non-book record + #. ortographic + #. indx record + #. huff/cdic record + #. first image record + #. huff/cdic table record + #. last content record + #. flis record + #. fcis record + #. srcs record + """ + palmdb_format = [ + ('name', '32s', 0), + ('attributes', '>H', 32), + ('version', '>H', 34), + ('creationDate', '>L', 36), + ('modificationDate', '>L', 40), + ('lastbackupDate', '>L', 44), + ('modificationNumber', '>L', 48), + ('appInfoID', '>L', 52), + ('sortInfoID', '>L', 56), + ('type', '4s', 60), + ('creator', '4s', 64), + ('uniqueIDseed', '>L', 68), + ('nextRecordListID', '>L', 72), + ('numberOfRecords', '>H', 76), + # recordInfoList = 8 * numberOfRecords ... + ] + palmdoc_format = [ + ('compression', '>H', 0), + ('unused', '>H', 2), + ('textLength', '>L', 4), + ('recordCount', '>H', 8), + ('recordSize', '>L', 10), + ('currentPosition', '>L', 12), + ('encryptionType', '>H', 12), + ] + mobi_format = [ + ('identifier', '4s', 16), + ('headerLength', '>L', 20), + ('mobiType', '>L', 24), + ('textEncoding', '>L', 28), + ('uniqueID', '>L', 32), + ('fileVersion', '>L', 36), + ('ortographicIndex', '>L', 40), + ('inflectionIndex', '>L', 44), + ('indexNames', '>L', 48), + ('indexKeys', '>L', 52), + ('extraIndex0', '>L', 56), + ('extraIndex1', '>L', 60), + ('extraIndex2', '>L', 64), + ('extraIndex3', '>L', 68), + ('extraIndex4', '>L', 72), + ('extraIndex5', '>L', 76), + ('firstNonBookIndex', '>L', 80), + ('fullNameOffset', '>L', 84), + ('fullNameLength', '>L', 88), + ('locale', '>L', 92), + ('inputLanguage', '>L', 96), + ('outputLanguage', '>L', 100), + ('minVersion', '>L', 104), + ('firstImageIndex', '>L', 108), + ('huffmanRecordOffset', '>L', 112), + ('huffmanRecordCount', '>L', 116), + ('huffmanTableOffset', '>L', 120), + ('huffmanTableLength', '>L', 124), + ('exthFlags', '>L', 128), + ('unknown132', '12s', 132), + ('unknown144', '16s', 144), + ('unknown160', '>L', 160), + ('unknown164', '>L', 164), + ('drmOffset', '>L', 168), + ('drmCount', '>L', 172), + ('drmSize', '>L', 176), + ('drmFlags', '>L', 180), + ('unknown184', '>Q', 184), + ('firstContentRecordNumber', '>H', 192), + ('lastContentRecordNumber', '>H', 194), + ('unknown196', '>L', 196), + ('fcisRecordNumber', '>L', 200), + ('fcisRecordCount', '>L', 204), + ('flisRecordNumber', '>L', 208), + ('flisRecordCount', '>L', 212), + ('unknown216', '8s', 216), + ('srcsRecordNumber', '>L', 224), + ('srcsRecordCount', '>L', 228), + ('numberOfCompilationDataSections', '>L', 232), + ('unknown236', '>L', 236), + ('extraRecordDataFlags', '>L', 240), + 
('indxRecordOffset', '>L', 244), + ('unknown248', '>L', 248), + ('unknown252', '>L', 252), + ] + header = OrderedDict() + records = OrderedDict() + palmdoc = OrderedDict() + mobi = OrderedDict() + mobi_exth = OrderedDict() + book = OrderedDict() + compression = None + + def __init__(self, file): + if isinstance(file, str): + f = open(file, 'rb') + else: + f = file + self.filename = f.name + self.f = f + # palm database header + header = f.read(78) + for key, u_fmt, offset in self.palmdb_format: + value, = struct.unpack_from(u_fmt, header, offset) + self.header[key] = value + # palm database record + f.seek(78) + records = f.read(self.header['numberOfRecords'] * 8) + for count in range(0, self.header['numberOfRecords']): + offset, value = struct.unpack_from('>LL', records, count * 8) + attributes = value & 0xff000000 + uniqueID = value & 0xffffff + self.records[count] = (offset, attributes, uniqueID) + ident = '%s%s' % (toStr(self.header['type']), toStr(self.header['creator'])) + self.book['title'] = toStr(self.header['name']) + self.book['ident'] = ident + self.book['creationDate'] = self.datetimeFromValue(self.header['creationDate']) + self.book['modificationDate'] = self.datetimeFromValue(self.header['modificationDate']) + # ebook header + record0 = self.loadRecord(0) + if self.isPalmdoc() or self.isMobipocket(): + # palmdoc header + for key, u_fmt, offset in self.palmdoc_format: + value, = struct.unpack_from(u_fmt, record0, offset) + self.palmdoc[key] = value + if self.palmdoc['encryptionType'] == 1: + self.palmdoc['type1KeyData'], = struct.unpack_from( + '16s', + record0, + 14, + ) + self.book['compression'] = self.typeDesc( + compression_type, + self.palmdoc['compression'], + ) + self.book['encryption'] = self.typeDesc( + encryption_type, + self.palmdoc['encryptionType'], + ) + if ident == pd_file_code['MobiPocket']: + # mobi header + record0_length = len(record0) + for key, u_fmt, offset in self.mobi_format: + if record0_length < offset: + break + value, = struct.unpack_from(u_fmt, record0, offset) + self.mobi[key] = value + # encryption type 1 key data? 
+ if ( + self.palmdoc['encryptionType'] == 2 + and self.mobi['drmOffset'] != 0xffffffff + ): + self.mobi['drmData'] = record0[ + self.mobi['drmOffset']: + self.mobi['drmOffset'] + self.mobi['drmSize'] + ] + if self.palmdoc['encryptionType'] == 1: + self.mobi['type1KeyData'], = struct.unpack_from( + '16s', + record0, + 0x10 + self.mobi['headerLength'], + ) + # exth + if ( + self.mobi['headerLength'] > 0xE4 + and self.mobi['minVersion'] >= 5 + and self.mobi['exthFlags'] & 0x40 + ): + # palmdoc length + mobi length + exth_addr = 0x10 + self.mobi['headerLength'] + offset = 0 + exthIdent, exthLength, exthCount = struct.unpack_from( + '>4sLL', + record0, + exth_addr, + ) + if toStr(exthIdent) != 'EXTH': + hexdump(record0[exth_addr:exth_addr + exthLength]) + raise Exception('exth header error: %s' % exthIdent) + offset += 12 + count = 0 + while count < exthCount: + recordType, recordLength = struct.unpack_from( + '>LL', + record0, + exth_addr + offset) + data, = struct.unpack_from( + '%ds' % (recordLength - 8), + record0, + exth_addr + offset + 8) + self.mobi_exth[recordType] = data + if DEBUG: + if not recordType in mobi_exth_type: + print(recordType, data, 'unknown type') + offset += recordLength + count += 1 + title, = struct.unpack_from( + '%ds' % self.mobi['fullNameLength'], + record0, + self.mobi['fullNameOffset'] + ) + if title: + self.book['title'] = toStr(title) + self.book['version'] = self.mobi['minVersion'] + self.book['author'] = toStr(self.mobi_exth[100] if 100 in self.mobi_exth else 'unknown') + self.book['mobiType'] = self.typeDesc( + mobi_type, + self.mobi['mobiType'], + ) + self.book['encoding'] = self.typeDesc( + encoding_type, + self.mobi['textEncoding'], + ) + self.book['srcs'] = (self.mobi['srcsRecordNumber'] != 0xffffffff) + + def __getitem__(self, name): + return self.book.get(name) + + def __len__(self): + return len(self.book) + + def __iter__(self): + return self.book.itervalues() + + def isMobipocket(self): + return self.book['ident'] == pd_file_code['MobiPocket'] + + def isPalmdoc(self): + return self.book['ident'] == pd_file_code['PalmDOC'] + + def unpackFunction(self): + compression_class = compression_type[self.palmdoc['compression']][1] + self.compression = compression_class() + if isinstance(self.compression, compression.Huffcdic): + rec_huff = self.loadRecord(self.mobi['huffmanRecordOffset']) + self.compression.loadHuff(rec_huff) + for c in range(1, self.mobi['huffmanRecordCount']): + rec_cdic = self.loadRecord(self.mobi['huffmanRecordOffset'] + c) + self.compression.loadCdic(rec_cdic) + if sys.version_info[0] < 3: + unpack = self.compression.unpack + else: + unpack = self.compression.unpack3 + return unpack + + def typeDesc(self, types, value): + if value in types: + desc = types[value] + if isinstance(desc, tuple): + return desc[0] + else: + return desc + else: + return 'unknown' + + def loadRecord(self, record_index): + """ + load palm database's record + """ + offset = self.records[record_index][0] + self.f.seek(offset) + if record_index == (self.header['numberOfRecords'] - 1): + record = self.f.read() + else: + offset2 = self.records[record_index + 1][0] + record = self.f.read(offset2 - offset) + return record + + def datetimeFromValue(self, value): + """ + If the time has the top bit set, it's an unsigned 32-bit number counting from 1st Jan 1904 + If the time has the top bit clear, it's a signed 32-bit number counting from 1st Jan 1970. 
+ """ + flag = value & 0x80000000 + if flag: + time = datetime.datetime(1904, 1, 1) + else: + time = datetime.datetime(1970, 1, 1) + time += datetime.timedelta(seconds=value) + return time + + def decrypt(self, record): + return record + + def imageExt(self, record): + ident, = struct.unpack_from('>L', record, 0) + if ident == 0x47494638: + return '.gif' + elif ident == 0x89504e47: + return '.png' + ident = struct.unpack_from('>HHHL', record, 0) + if ident[3] == 0x4a464946: + return '.jpg' + ident, = struct.unpack_from('>4s', record, 0) + return '.%s' % ident + + def saveRecordImage(self, num, basename): + rec = self.loadRecord(num) + ext = self.imageExt(rec) + img_file = '%s%s' % (basename, ext) + with open(img_file, 'wb') as f: + f.write(rec) + return os.path.basename(img_file) + + def loadTextResource(self, data, basename): + def repl(mo): + img_idx = int(mo.group(1)) + num = img_idx_base + img_idx - 1 + img_basename = '%s_img_%05d' % (basename, img_idx) + sys.stdout.write('.') + sys.stdout.flush() + img_file = self.saveRecordImage(num, img_basename) + return toByte('<img src="%s"' % img_file) + + print('Dump image') + img_idx_base = int(self.mobi['firstImageIndex']) + img_pattern = ( + b'''<img\s+recindex=['"](\d+)['"]''', + b'''<img\s+src=['"]kindle:embed:(\d+)\?mime=image/jpg['"]''', + ) + for pattern in img_pattern: + regex = re.compile(pattern, re.I) + data = regex.sub(repl, data) + if self.mobi['textEncoding'] == 65001: + charset = 'utf-8' + else: + charset = 'cp%d' % self.mobi['textEncoding'] + data = re.sub( + b'<head>', + toByte('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=%s" />' % charset), + data, + re.I, + ) + print('') + return data + + def unpackMobi(self, output_file): + rec_num = self.palmdoc['recordCount'] + text_length = self.palmdoc['textLength'] + unpack = self.unpackFunction() + data = [] + print('Title: %s' % self.book['title']) + print('Compression Type: %s' % self.book['compression']) + print('Encryption Type: %s' % self.book['encryption']) + print('Dump html/css') + for rn in range(1, rec_num + 1): + record = self.loadRecord(rn) + extraflags = self.mobi['extraRecordDataFlags'] >> 1 + while extraflags & 0x1: + # the maximum length of trailing entries size is 32. + vint, = struct.unpack_from('>L', record[-4:], 0) + fint = decodeVarint(vint) + record = record[:-fint] + extraflags >>= 1 + if self.mobi['extraRecordDataFlags'] & 0x1: + # multibyte bytes is the last byte at the end of trailing + # entries + mb_num, = struct.unpack_from('>B', record[-1:], 0) + # bit 1-2 is length, 3-8 is unknown. 
plus 1 size byte + mb_num = (mb_num & 0x3) + 1 + record = record[:-mb_num] + record = self.decrypt(record) + sys.stdout.write('.') + sys.stdout.flush() + data.append(unpack(record)) + data_text = b''.join(data) + data_css = data_text[text_length:] + data_text = data_text[:text_length] + sys.stdout.write('html: %d' % text_length) + basename = os.path.splitext(output_file)[0] + if data_css: + sys.stdout.write(' / css: %d' % len(data_css)) + css_filename = '%s.css' % basename + with open(css_filename, 'wb') as f: + f.write(data_css) + data_text = re.sub( + r'''<head>''', + '<head>\n<link rel="stylesheet" href="%s" type="text/css"/>' % os.path.basename(css_filename), + data_text, + re.I + ) + print('') + if self.mobi['firstImageIndex'] != 0xffffffff: + data_text = self.loadTextResource(data_text, basename) + with open(output_file, 'wb') as f: + f.write(data_text) + # cover + if 201 in self.mobi_exth: + print('Dump cover') + cover_rn, = struct.unpack('>L', self.mobi_exth[201]) + cover_rn += self.mobi['firstImageIndex'] + self.saveRecordImage(cover_rn, '%s_cover' % basename) + print('Unpack MOBI successfully') + + def removeSrcs(self, outmobi, outsrcs=None): + srcs_rn = self.mobi['srcsRecordNumber'] + srcs_rc = self.mobi['srcsRecordCount'] + print('Title: %s' % self.book['title']) + if srcs_rn == 0xffffffff or srcs_rc == 0: + print('No SRCS section.') + return + print('Find SRCS: %d' % srcs_rn) + if outsrcs: + print('Output ZIP file: %s ' % outsrcs) + f = open(outsrcs, 'wb') + for rn in range(srcs_rn, srcs_rn + srcs_rc): + sys.stdout.write('.') + sys.stdout.flush() + rec = self.loadRecord(rn) + header = struct.unpack_from('>4L', rec, 0) + if header[0] == 0x53524353: + # SRCS + f.write(rec[16:]) + f.close() + print('') + print('Output MOBI file: %s' % outmobi) + with open(outmobi, 'wb') as f: + self.f.seek(0) + f.write(self.f.read(78)) + # replace srcs section with 2-zero bytes + recordlist_data = array.array('B', self.f.read(8 * self.header['numberOfRecords'])) + print('Fix record offset') + srcs_offset = self.records[srcs_rn][0] + for count in range(0, srcs_rc): + sys.stdout.write('.') + sys.stdout.flush() + fix_offset = srcs_offset + count * 2 + struct.pack_into('>L', recordlist_data, (srcs_rn + count) * 8, fix_offset) + offset = self.records[srcs_rn + srcs_rc][0] - srcs_offset - srcs_rc * 2 + for rn in range(srcs_rn + srcs_rc, self.header['numberOfRecords']): + sys.stdout.write('.') + sys.stdout.flush() + fix_offset = self.records[rn][0] - offset + struct.pack_into('>L', recordlist_data, rn * 8, fix_offset) + f.write(recordlist_data) + print('') + # gap + gapToDataLength = self.records[0][0] - f.tell() + if gapToDataLength: + f.write(self.f.read(gapToDataLength)) + # record + print('Write record') + record0 = array.array('B', self.loadRecord(0)) + struct.pack_into('>LL', record0, 224, 0xffffffff, 0) + f.write(record0) + for rn in range(1, srcs_rn): + sys.stdout.write('.') + sys.stdout.flush() + rec = self.loadRecord(rn) + f.write(rec) + srcs_data = b'\x00\x00' + # srcs record + for rn in range(srcs_rn, srcs_rn + srcs_rc): + sys.stdout.write('.') + sys.stdout.flush() + f.write(srcs_data) + # record + for rn in range(srcs_rn + srcs_rc, self.header['numberOfRecords']): + sys.stdout.write('.') + sys.stdout.flush() + rec = self.loadRecord(rn) + f.write(rec) + print('') + print('Remove SRCS successfully') diff --git a/book_tools/pymobi/util.py b/book_tools/pymobi/util.py new file mode 100644 index 0000000000000000000000000000000000000000..20882d8caa49132a77aaafbf0aef88d5986dc2d0 --- /dev/null +++ 
b/book_tools/pymobi/util.py @@ -0,0 +1,68 @@ + +def toStr(src, coding='utf-8'): + """ for python3 """ + return src.decode(coding) + + +def toByte(src, coding='utf-8'): + """ for python3 """ + return src.encode(coding) + + +def hexdump(src, length=16, sep='.'): + """ + hexdump implementation in Python + paste from https://gist.github.com/7h3rAm/5603718 + """ + FILTER = ''.join([ + (len(repr(chr(x))) == 3) and chr(x) or sep + for x in range(256) + ]) + lines = [] + for c in range(0, len(src), length): + chars = src[c:c + length] + if isinstance(chars, bytes): + hhex = ' '.join(["%02x" % x for x in chars]) + else: + hhex = ' '.join(["%02x" % ord(x) for x in chars]) + if len(hhex) > 24: + hhex = "%s %s" % (hhex[:24], hhex[24:]) + if isinstance(chars, bytes): + printable = ''.join([ + "%s" % ((x <= 127 and FILTER[x]) or sep) + for x in chars + ]) + else: + printable = ''.join([ + "%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or sep) + for x in chars + ]) + lines.append("%08x: %-*s |%s|\n" % (c, length * 3, hhex, printable)) + print(''.join(lines)) + + +def decodeVarint(vint): + """ backward-encoded Mobipocket variable-width integer. """ + fint = 0 + bitpos = 0 + while bitpos < 28: + fint |= ((vint & 0x7f) << bitpos) + if vint & 0x80: + break + vint >>= 8 + bitpos += 7 + return fint + + +def encodeVarint(fint): + """ backward-encoded Mobipocket variable-width integer. """ + vint = 0 + bitpos = 0 + while bitpos < 32: + vint |= ((fint & 0x7f) << bitpos) + if fint < 127: + vint |= (0x80 << bitpos) + break + fint >>= 7 + bitpos += 8 + return vint diff --git a/opds_catalog/sopdscan.py b/opds_catalog/sopdscan.py index 5cbe5ed0d14f0ba7b2ad0c43f4c24cda50834327..bcdd36b986938d9b8c6fd4a7f16a27cee055a687 100644 --- a/opds_catalog/sopdscan.py +++ b/opds_catalog/sopdscan.py @@ -5,6 +5,7 @@ import time import datetime import logging import re +from book_tools.format import create_bookfile from django.db import transaction @@ -193,6 +194,58 @@ class opdsScanner: self.bad_archives+=zip_process_error def processfile(self,name,full_path,file,cat,archive=0,file_size=0): + (n, e) = os.path.splitext(name) + if e.lower() in config.SOPDS_BOOK_EXTENSIONS.split(): + rel_path=os.path.relpath(full_path,config.SOPDS_ROOT_LIB) + self.logger.debug("Attempt to add book "+rel_path+"/"+name) + if opdsdb.findbook(name, rel_path, 1) == None: + if archive==0: + cat=opdsdb.addcattree(rel_path,archive) + + if isinstance(file, str): + f = open(file, 'rb') + else: + f = file + + try: + book_data = create_bookfile(f, name) + except: + book_data = None + self.logger.warning(rel_path + ' - ' + name + ' Book parse error, skipping') + self.bad_books += 1 + + if book_data: + lang = book_data.language_code.strip(self.strip_symbols) if book_data.language_code else '' + title = book_data.title.strip(self.strip_symbols) if book_data.title else n + annotation = book_data.description if book_data.description else '' + annotation = annotation.strip(self.strip_symbols) if isinstance(annotation, str) else annotation.decode().strip(self.strip_symbols) + docdate = '' + + book=opdsdb.addbook(name,rel_path,cat,e[1:],title,annotation,docdate,lang,file_size,archive) + self.books_added+=1 + + if archive!=0: + self.books_in_archives+=1 + self.logger.debug("Book "+rel_path+"/"+name+" Added ok.") + + for a in book_data.authors: + author_name = a.get('name','Unknown author').strip(self.strip_symbols) + author=opdsdb.addauthor(author_name) + opdsdb.addbauthor(book,author) + + for genre in book_data.tags: + 
opdsdb.addbgenre(book,opdsdb.addgenre(genre.lower().strip(self.strip_symbols))) + + for ser in self.fb2parser.series.attrss: + ser_name=ser.get('title').strip() + ser_no = ser.get('index', '0').strip() + ser_no = int(ser_no) if ser_no.isdigit() else 0 + opdsdb.addbseries(book,ser_name,ser_no) + else: + self.books_skipped+=1 + self.logger.debug("Book "+rel_path+"/"+name+" Already in DB.") + + def processfile0(self,name,full_path,file,cat,archive=0,file_size=0): (n,e)=os.path.splitext(name) if e.lower() in config.SOPDS_BOOK_EXTENSIONS.split(): rel_path=os.path.relpath(full_path,config.SOPDS_ROOT_LIB)
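The new processfile() above drives the scanner entirely through the book_tools.format entry points added in this patch. A minimal standalone sketch of the same API, assuming a hypothetical example.fb2.zip on disk and an existing covers/ directory (neither is part of the patch):

    from book_tools.format import create_bookfile, detect_mime

    # Detection is extension-based first, then refined by sniffing the zip/XML contents.
    with open('example.fb2.zip', 'rb') as f:          # hypothetical input file
        print(detect_mime(f))                         # e.g. 'application/fb2+zip'
        f.seek(0)
        with create_bookfile(f, 'example.fb2.zip') as book:
            print(book.mimetype, book.title, book.language_code)
            for author in book.authors:               # list of {'name': ..., 'sortkey': ...}
                print(author['name'], author['sortkey'])
            print(book.tags, book.series_info, book.description)
            print(book.get_encryption_info())         # {} for files without DRM
            cover = book.extract_cover('covers')      # file name of the extracted cover, or None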
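book_tools/format/mobi.py delegates to the bundled pymobi reader; BookMobi can also be used on its own to inspect a Mobipocket file or, for unencrypted PalmDOC-compressed books, to dump it to HTML. A sketch with placeholder file names:

    from book_tools.pymobi.mobi import BookMobi

    bm = BookMobi('example.mobi')               # accepts a path or an open binary file
    print(bm['title'], bm['author'])            # author comes from EXTH record 100, else 'unknown'
    print(bm['compression'], bm['encryption'])  # e.g. 'PalmDOC compression', 'no encryption'
    bm.unpackMobi('example.html')               # writes the html plus css/cover/images alongside it
    if bm['srcs']:
        bm.removeSrcs('example-clean.mobi')     # strip the embedded kindlegen SRCS records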
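The PalmDOC decompressor in book_tools/pymobi/compression.py is the usual byte-oriented LZ77 variant: a control byte of 1-8 copies that many literal bytes, bytes below 0x80 are plain literals, bytes of 0xC0 and above expand to a space plus the byte with bit 7 cleared, and a 0x80-0xBF pair encodes a distance/length back-reference. A hand-crafted record (not taken from a real book) run through the Python 3 code path:

    from book_tools.pymobi.compression import Palmdoc

    # \x05      copy the next five literal bytes ('Hello')
    # \xe1      0xe1 ^ 0x80 == 'a', so it expands to ' a'
    # \x80\x28  back-reference: distance 5, length 3, copies b'llo'
    data = b'\x05Hello\xe1\x80\x28'
    assert Palmdoc().unpack3(data) == b'Hello allo'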
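unpackMobi() strips the per-record trailing entries whose sizes are stored as the "backward" variable-width integers handled in book_tools/pymobi/util.py: the record tail is read as a big-endian 32-bit word, the 0x80 bit marks the terminating byte, and the low seven bits of each byte carry data. A one-byte round trip:

    from book_tools.pymobi.util import decodeVarint, encodeVarint

    assert encodeVarint(6) == 0x86    # value 6 with the stop bit set
    assert decodeVarint(0x86) == 6    # decoded back from the 32-bit tail word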