Skip to content
Snippets Groups Projects
Commit 6176bf42 authored by Dmitry Shelepnev
Browse files

Create FB2sax parser in book_tools

parent 817f5980
No related branches found
No related tags found
No related merge requests found
...@@ -9,6 +9,7 @@ from book_tools.format.mimetype import Mimetype ...@@ -9,6 +9,7 @@ from book_tools.format.mimetype import Mimetype
from book_tools.format.util import list_zip_file_infos from book_tools.format.util import list_zip_file_infos
from book_tools.format.epub import EPub from book_tools.format.epub import EPub
from book_tools.format.fb2 import FB2, FB2Zip from book_tools.format.fb2 import FB2, FB2Zip
from book_tools.format.fb2sax import FB2sax
from book_tools.format.other import Dummy from book_tools.format.other import Dummy
#from fbreader.format.pdf import PDF #from fbreader.format.pdf import PDF
#from fbreader.format.msword import MSWord #from fbreader.format.msword import MSWord
...@@ -82,7 +83,8 @@ def create_bookfile(file, original_filename): ...@@ -82,7 +83,8 @@ def create_bookfile(file, original_filename):
if mimetype == Mimetype.EPUB: if mimetype == Mimetype.EPUB:
return EPub(file, original_filename) return EPub(file, original_filename)
elif mimetype == Mimetype.FB2: elif mimetype == Mimetype.FB2:
return FB2(file, original_filename) #return FB2(file, original_filename)
return FB2sax(file, original_filename)
elif mimetype == Mimetype.FB2_ZIP: elif mimetype == Mimetype.FB2_ZIP:
return FB2Zip(file, original_filename) return FB2Zip(file, original_filename)
elif mimetype == Mimetype.MOBI: elif mimetype == Mimetype.MOBI:
......
...@@ -206,3 +206,5 @@ class FB2Zip(FB2Base): ...@@ -206,3 +206,5 @@ class FB2Zip(FB2Base):
def __exit__(self, kind, value, traceback): def __exit__(self, kind, value, traceback):
self.__zip_file.__exit__(kind, value, traceback) self.__zip_file.__exit__(kind, value, traceback)
pass pass
import xml.parsers.expat
import traceback
import base64
from book_tools.format.bookfile import BookFile
from book_tools.format.mimetype import Mimetype
from book_tools.format.util import strip_symbols
class fb2tag:
    """Follow one element path through an XML event stream and collect its data.

    ``tags`` is the sequence of element names that leads to the element of
    interest, e.g. ``('description', 'title-info', 'genre')``.  The owner
    forwards every start-tag, end-tag and character-data event to the tracker.
    The tracker advances ``index`` along the path and, while it is positioned
    on the last name of the path, gathers:

    * ``values`` - one accumulated text string per completed target element;
    * ``attrs``  - attributes of the most recently opened target element;
    * ``attrss`` - attributes of every opened target element, in order.
    """

    def __init__(self, tags):
        self.tags = tags
        self.size = len(self.tags)
        # Call the base implementation explicitly so that a subclass override
        # of reset() is not invoked on a half-constructed object.
        fb2tag.reset(self)

    def reset(self):
        """Drop everything collected so far and return to the initial state."""
        self.index = -1              # position in ``tags`` of the deepest matched name
        self.values = []
        self.attrs = {}
        self.attrss = []
        self.process_value = False   # True while text of the current element is pending
        self.current_value = ''

    def tagopen(self, tag, attrs=None):
        """Handle a start tag.

        Advances along the path when ``tag`` is the next expected name.
        Returns True only when this call reached the last name of the path;
        in that case ``attrs`` is remembered in ``attrs`` and ``attrss``.
        """
        if attrs is None:
            # A fresh mapping per call: a shared mutable default would be
            # aliased by every tracker that stores it.
            attrs = {}
        next_index = self.index + 1
        if next_index >= self.size or self.tags[next_index] != tag:
            return False
        self.index = next_index
        if self.index + 1 != self.size:
            return False
        self.attrs = attrs
        self.attrss.append(attrs)
        return True

    def tagclose(self, tag):
        """Handle an end tag; flushes pending text when a path element closes."""
        if self.index < 0 or self.tags[self.index] != tag:
            return
        self.index -= 1
        if self.process_value:
            self.values.append(self.current_value)
        self.process_value = False

    def setvalue(self, value):
        """Handle character data; ignored unless positioned on the target element.

        The XML parser may deliver one text node in several chunks, so
        consecutive chunks are concatenated until the element closes.
        """
        if self.index + 1 != self.size:
            return
        if self.process_value:
            self.current_value += value
        else:
            self.current_value = value
            self.process_value = True

    def getvalue(self):
        """Return the list of collected text values."""
        return self.values

    def gettext(self, divider='\n'):
        """Return all collected text values joined with ``divider``."""
        return divider.join(self.values)

    def getattr(self, attr):
        """Return attribute ``attr`` of the last opened target element, or None."""
        if not self.attrs:
            return None
        return self.attrs.get(attr)

    def getattrs(self, attr):
        """Return ``attr`` from every opened target element that carried it."""
        return [a.get(attr) for a in self.attrss if attr in a]
class fb2cover(fb2tag):
    """Tracker for the element that holds the cover image payload.

    The expected element id is supplied through ``setcovername``.  When a
    target element with that ``id`` attribute is opened, subsequent character
    data passed to ``add_data`` is collected; closing the element sets
    ``isfind`` so the owner can stop parsing.
    """

    def __init__(self, tags):
        self.iscover = False     # True while inside the matching element
        self.cover_name = ''     # id of the element that holds the cover
        self._cover_data = []    # collected text chunks of the payload
        self.isfind = False      # True once the matching element was closed
        fb2tag.__init__(self, tags)

    def reset(self):
        """Forget the cover name, the collected payload and the tag state."""
        self.iscover = False
        self.cover_name = ''
        self._cover_data = []
        self.isfind = False
        fb2tag.reset(self)

    def tagopen(self, tag, attrs=None):
        """Handle a start tag; begins collecting when the ``id`` matches the cover name."""
        if attrs is None:
            attrs = {}
        result = fb2tag.tagopen(self, tag, attrs)
        if result and self.cover_name:
            idvalue = self.getattr('id')
            # Compare case-insensitively on BOTH sides; lower-casing only the
            # id would make a mixed-case cover name impossible to match.
            if idvalue is not None and idvalue.lower() == self.cover_name.lower():
                self.iscover = True
        return result

    def tagclose(self, tag):
        """Handle an end tag; marks the cover as found if it was being collected."""
        if self.iscover:
            self.isfind = True
            self.iscover = False
        fb2tag.tagclose(self, tag)

    def setcovername(self, cover_name):
        """Remember the element id to look for; None and '' are ignored."""
        if cover_name is not None and cover_name != '':
            self.cover_name = cover_name

    def add_data(self, data):
        """Append a chunk of character data while inside the cover element."""
        if not self.iscover:
            return
        # Skip chunks that are just a line break.  The two-character string
        # backslash+n is still skipped as before, for backward compatibility.
        if data not in ('\n', '\\n'):
            self._cover_data.append(data)

    @property
    def cover_data(self):
        """The collected payload as a single string."""
        return ''.join(self._cover_data)

    @cover_data.setter
    def cover_data(self, value):
        self._cover_data = [value]
class fb2parser:
    """Event-driven (expat) extractor of FB2 metadata.

    A set of ``fb2tag`` trackers collects authors, genres, language, title,
    annotation paragraphs, document date and series attributes.  With
    ``readcover`` non-zero the parser additionally resolves the cover image
    reference and collects the referenced ``binary`` element.

    After ``parse`` returns, ``parse_error`` is 0 on success and 1 on failure,
    with the caught exception stored in ``parse_errormsg``.
    """

    def __init__(self, readcover=0):
        self.rc = readcover
        self.author_first = fb2tag(('description', 'title-info', 'author', 'first-name'))
        self.author_last = fb2tag(('description', 'title-info', 'author', 'last-name'))
        self.genre = fb2tag(('description', 'title-info', 'genre'))
        self.lang = fb2tag(('description', 'title-info', 'lang'))
        self.book_title = fb2tag(('description', 'title-info', 'book-title'))
        self.annotation = fb2tag(('description', 'title-info', 'annotation', 'p'))
        self.docdate = fb2tag(('description', 'document-info', 'date'))
        self.series = fb2tag(('description', 'title-info', 'sequence'))
        if self.rc != 0:
            self.cover_name = fb2tag(('description', 'coverpage', 'image'))
            self.cover_image = fb2cover(('fictionbook', 'binary'))
        self.stoptag = 'description'
        self.process_description = True
        self.parse_error = 0
        self.parse_errormsg = ''

    def _text_trackers(self):
        """Return the trackers that collect element text, in dispatch order."""
        return (
            self.author_first,
            self.author_last,
            self.genre,
            self.lang,
            self.book_title,
            self.annotation,
            self.docdate,
        )

    @staticmethod
    def _cover_href(attrs):
        """Return the ``href`` value of a cover ``image`` element, or ''.

        The conventional ``l:`` and ``xlink:`` prefixes are tried first; any
        other attribute whose local name is ``href`` is accepted as well,
        because the namespace prefix is chosen by the document author.
        """
        for key in ('l:href', 'xlink:href'):
            value = attrs.get(key)
            if value:
                return value
        for key, value in attrs.items():
            if value and key.lower().rpartition(':')[2] == 'href':
                return value
        return ''

    def reset(self):
        """Clear all collected data so the instance can parse another file."""
        self.process_description = True
        self.parse_error = 0
        self.parse_errormsg = ''
        for tracker in self._text_trackers():
            tracker.reset()
        self.series.reset()
        if self.rc != 0:
            self.cover_name.reset()
            self.cover_image.reset()

    def start_element(self, name, attrs):
        """expat start-tag handler."""
        name = name.lower()
        if self.process_description:
            for tracker in self._text_trackers():
                tracker.tagopen(name)
            self.series.tagopen(name, attrs)
            if self.rc != 0 and self.cover_name.tagopen(name, attrs):
                href = self._cover_href(attrs)
                # A reference that does not start with '#' points outside the
                # fb2 file, so there is no local image data to look for.
                cover_name = href.strip('#') if href.startswith('#') else None
                self.cover_image.setcovername(cover_name)
        if self.rc != 0:
            self.cover_image.tagopen(name, attrs)

    def end_element(self, name):
        """expat end-tag handler.

        Raises StopIteration to abort parsing as soon as everything that was
        requested has been collected; ``parse`` catches it.
        """
        name = name.lower()
        if self.process_description:
            for tracker in self._text_trackers():
                tracker.tagclose(name)
            self.series.tagclose(name)
            if self.rc != 0:
                self.cover_name.tagclose(name)
        if self.rc != 0:
            self.cover_image.tagclose(name)
            if self.cover_image.isfind:
                raise StopIteration

        # Keep the first-name and last-name lists the same length so that
        # entries with the same index belong to the same author.
        if name == 'author':
            last_count = len(self.author_last.getvalue())
            first_count = len(self.author_first.getvalue())
            if last_count > first_count:
                self.author_first.values.append(" ")
            elif last_count < first_count:
                self.author_last.values.append(" ")

        if name == self.stoptag:
            if self.rc == 0 or self.cover_image.cover_name == '':
                # Metadata is complete and there is no local cover to read.
                raise StopIteration
            # Keep parsing, but only to find the cover's binary element.
            self.process_description = False

    def char_data(self, data):
        """expat character-data handler."""
        if self.process_description:
            for tracker in self._text_trackers():
                tracker.setvalue(data)
        if self.rc != 0:
            self.cover_image.add_data(data)

    def parse(self, f, hsize=0):
        """Parse the file-like object ``f``.

        ``hsize`` limits how much is read from ``f``; 0 means read everything.
        Errors are not raised: they are reported through ``parse_error`` and
        ``parse_errormsg``.
        """
        self.reset()
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.start_element
        parser.EndElementHandler = self.end_element
        parser.CharacterDataHandler = self.char_data
        try:
            data = f.read() if hsize == 0 else f.read(hsize)
            parser.Parse(data, True)
        except StopIteration:
            # Raised by end_element to finish early; not an error.
            pass
        except Exception as err:
            print(err)
            self.parse_errormsg = err
            self.parse_error = 1
class FB2StructureException(Exception):
    """Raised when an fb2 file cannot be parsed or verified.

    ``error`` is either a message or the underlying exception; when it is an
    exception, the traceback of the exception currently being handled is
    printed as a debugging aid.
    """

    def __init__(self, error):
        Exception.__init__(self, 'fb2 verification failed: %s' % error)
        if isinstance(error, Exception):
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in print().
            traceback.print_exc()
class FB2sax(BookFile):
    """Book file wrapper for plain FB2 files, backed by the expat-based ``fb2parser``.

    The constructor parses the metadata once and pushes title, authors, tags,
    series, language, document date and description into the ``BookFile``
    attributes and setters.  The cover is decoded lazily by
    ``extract_cover_memory``.

    Raises:
        FB2StructureException: if the parser reports an error.
    """

    def __init__(self, file, original_filename):
        BookFile.__init__(self, file, original_filename, Mimetype.FB2)
        self.fb2parser = fb2parser(0)
        self.file.seek(0, 0)
        self.fb2parser.parse(self.file)
        if self.fb2parser.parse_error != 0:
            # Include the underlying cause so the failure can be diagnosed.
            raise FB2StructureException(
                'FB2sax parse error: %s' % self.fb2parser.parse_errormsg
            )
        self.__detect_title()
        self.__detect_authors()
        self.__detect_tags()
        self.__detect_series_info()
        self.__detect_language()
        self.__detect_docdate()
        self.description = self.__detect_description()

    def extract_cover_memory(self):
        """Return the decoded cover image bytes, or None if there is no usable cover."""
        imgfb2parser = fb2parser(1)
        self.file.seek(0, 0)
        imgfb2parser.parse(self.file)
        encoded = imgfb2parser.cover_image.cover_data
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # binascii.Error (malformed base64) is a ValueError subclass.
            return None

    def __detect_title(self):
        """Set the title from the first collected book-title value, if non-empty."""
        titles = self.fb2parser.book_title.getvalue()
        title = titles[0].strip(strip_symbols) if titles else ''
        if title:
            self.__set_title__(title)

    def __detect_docdate(self):
        """Set the document date from the first collected date value, if non-empty."""
        dates = self.fb2parser.docdate.getvalue()
        docdate = dates[0].strip() if dates else ''
        if docdate:
            self.__set_docdate__(docdate)

    def __detect_authors(self):
        """Add one author per collected last name, paired with the first name by index."""
        first_names = self.fb2parser.author_first.getvalue()
        for idx, author in enumerate(self.fb2parser.author_last.getvalue()):
            last_name = author.strip(strip_symbols)
            # The parser pads both lists to equal length; guard anyway so a
            # missing first name cannot raise IndexError.
            raw_first = first_names[idx] if idx < len(first_names) else ''
            first_name = raw_first.strip(strip_symbols)
            self.__add_author__(' '.join([first_name, last_name]), last_name)

    def __detect_language(self):
        """Set ``language_code`` from the first collected lang value, if non-empty."""
        langs = self.fb2parser.lang.getvalue()
        code = langs[0].strip(strip_symbols) if langs else ''
        if code:
            self.language_code = code

    def __detect_tags(self):
        """Add every collected genre as a lower-cased tag."""
        for genre in self.fb2parser.genre.getvalue():
            self.__add_tag__(genre.lower().strip(strip_symbols))

    def __detect_series_info(self):
        """Set ``series_info`` from the ``sequence`` attributes; the last named one wins."""
        for attrs in self.fb2parser.series.attrss:
            # A <sequence> without a name attribute is skipped instead of
            # raising AttributeError on None.
            ser_name = (attrs.get('name') or '').strip(strip_symbols)
            if not ser_name:
                continue
            ser_no = (attrs.get('number') or '0').strip(strip_symbols)
            # isdecimal() accepts exactly the characters int() can convert;
            # isdigit() also accepts e.g. superscripts, which int() rejects.
            self.series_info = {
                'title': ser_name,
                'index': int(ser_no) if ser_no.isdecimal() else None
            }

    def __detect_description(self):
        """Return the annotation paragraphs joined by newlines, or None if empty."""
        text = '\n'.join(self.fb2parser.annotation.getvalue())
        return text if text else None

    def __exit__(self, kind, value, traceback):
        # Nothing to release here.
        pass
#import PythonMagick #import PythonMagick
from PIL import Image, ImageFile from PIL import Image, ImageFile
# Characters trimmed from both ends of titles, names, genres and language codes.
# Same value as before, spelled without the invalid escape sequences
# (backslash-ampersand, backslash-backtick) that Python warns about.
strip_symbols = ' »«\'"\\&\n-.#\\\\`'
def list_zip_file_infos(zipfile): def list_zip_file_infos(zipfile):
return [info for info in zipfile.infolist() if not info.filename.endswith('/')] return [info for info in zipfile.infolist() if not info.filename.endswith('/')]
......
...@@ -130,7 +130,6 @@ def Cover(request, book_id, thumbnail=False): ...@@ -130,7 +130,6 @@ def Cover(request, book_id, thumbnail=False):
except: except:
book_data = None book_data = None
image = None image = None
print('create_bookfile exception !!!')
if image: if image:
response["Content-Type"] = 'image/jpeg' response["Content-Type"] = 'image/jpeg'
......
...@@ -7,6 +7,7 @@ import logging ...@@ -7,6 +7,7 @@ import logging
import re import re
from book_tools.format import create_bookfile from book_tools.format import create_bookfile
from book_tools.format.util import strip_symbols
from django.db import transaction from django.db import transaction
...@@ -20,7 +21,6 @@ class opdsScanner: ...@@ -20,7 +21,6 @@ class opdsScanner:
def __init__(self, logger=None): def __init__(self, logger=None):
self.fb2parser=None self.fb2parser=None
self.init_parser() self.init_parser()
self.strip_symbols = ' »«\'\"\&\n-.#\\\`'
if logger: if logger:
self.logger = logger self.logger = logger
...@@ -131,10 +131,10 @@ class opdsScanner: ...@@ -131,10 +131,10 @@ class opdsScanner:
name = "%s.%s"%(meta_data[inpx_parser.sFile],meta_data[inpx_parser.sExt]) name = "%s.%s"%(meta_data[inpx_parser.sFile],meta_data[inpx_parser.sExt])
lang=meta_data[inpx_parser.sLang].strip(self.strip_symbols) lang=meta_data[inpx_parser.sLang].strip(strip_symbols)
title=meta_data[inpx_parser.sTitle].strip(self.strip_symbols) title=meta_data[inpx_parser.sTitle].strip(strip_symbols)
annotation='' annotation=''
docdate=meta_data[inpx_parser.sDate].strip(self.strip_symbols) docdate=meta_data[inpx_parser.sDate].strip(strip_symbols)
book=opdsdb.addbook(name,self.rel_path,self.inp_cat,meta_data[inpx_parser.sExt],title,annotation,docdate,lang,meta_data[inpx_parser.sSize],opdsdb.CAT_INP) book=opdsdb.addbook(name,self.rel_path,self.inp_cat,meta_data[inpx_parser.sExt],title,annotation,docdate,lang,meta_data[inpx_parser.sSize],opdsdb.CAT_INP)
...@@ -147,7 +147,7 @@ class opdsScanner: ...@@ -147,7 +147,7 @@ class opdsScanner:
opdsdb.addbauthor(book,author) opdsdb.addbauthor(book,author)
for g in meta_data[inpx_parser.sGenre]: for g in meta_data[inpx_parser.sGenre]:
opdsdb.addbgenre(book,opdsdb.addgenre(g.lower().strip(self.strip_symbols))) opdsdb.addbgenre(book,opdsdb.addgenre(g.lower().strip(strip_symbols)))
for s in meta_data[inpx_parser.sSeries]: for s in meta_data[inpx_parser.sSeries]:
ser=opdsdb.addseries(s.strip()) ser=opdsdb.addseries(s.strip())
...@@ -213,10 +213,10 @@ class opdsScanner: ...@@ -213,10 +213,10 @@ class opdsScanner:
self.bad_books += 1 self.bad_books += 1
if book_data: if book_data:
lang = book_data.language_code.strip(self.strip_symbols) if book_data.language_code else '' lang = book_data.language_code.strip(strip_symbols) if book_data.language_code else ''
title = book_data.title.strip(self.strip_symbols) if book_data.title else n title = book_data.title.strip(strip_symbols) if book_data.title else n
annotation = book_data.description if book_data.description else '' annotation = book_data.description if book_data.description else ''
annotation = annotation.strip(self.strip_symbols) if isinstance(annotation, str) else annotation.decode('utf8').strip(self.strip_symbols) annotation = annotation.strip(strip_symbols) if isinstance(annotation, str) else annotation.decode('utf8').strip(strip_symbols)
docdate = book_data.docdate if book_data.docdate else '' docdate = book_data.docdate if book_data.docdate else ''
book=opdsdb.addbook(name,rel_path,cat,e[1:],title,annotation,docdate,lang,file_size,archive) book=opdsdb.addbook(name,rel_path,cat,e[1:],title,annotation,docdate,lang,file_size,archive)
...@@ -227,7 +227,7 @@ class opdsScanner: ...@@ -227,7 +227,7 @@ class opdsScanner:
self.logger.debug("Book "+rel_path+"/"+name+" Added ok.") self.logger.debug("Book "+rel_path+"/"+name+" Added ok.")
for a in book_data.authors: for a in book_data.authors:
author_name = a.get('name','Unknown author').strip(self.strip_symbols) author_name = a.get('name','Unknown author').strip(strip_symbols)
# Если в имени автора нет запятой, то фамилию переносим из конца в начало # Если в имени автора нет запятой, то фамилию переносим из конца в начало
if author_name.find(',')<0: if author_name.find(',')<0:
author_names = author_name.split() author_names = author_name.split()
...@@ -236,7 +236,7 @@ class opdsScanner: ...@@ -236,7 +236,7 @@ class opdsScanner:
opdsdb.addbauthor(book,author) opdsdb.addbauthor(book,author)
for genre in book_data.tags: for genre in book_data.tags:
opdsdb.addbgenre(book,opdsdb.addgenre(genre.lower().strip(self.strip_symbols))) opdsdb.addbgenre(book,opdsdb.addgenre(genre.lower().strip(strip_symbols)))
for ser in self.fb2parser.series.attrss: for ser in self.fb2parser.series.attrss:
ser_name=ser.get('title').strip() ser_name=ser.get('title').strip()
...@@ -272,9 +272,9 @@ class opdsScanner: ...@@ -272,9 +272,9 @@ class opdsScanner:
f.close() f.close()
if len(self.fb2parser.lang.getvalue())>0: if len(self.fb2parser.lang.getvalue())>0:
lang=self.fb2parser.lang.getvalue()[0].strip(self.strip_symbols) lang=self.fb2parser.lang.getvalue()[0].strip(strip_symbols)
if len(self.fb2parser.book_title.getvalue())>0: if len(self.fb2parser.book_title.getvalue())>0:
title=self.fb2parser.book_title.getvalue()[0].strip(self.strip_symbols) title=self.fb2parser.book_title.getvalue()[0].strip(strip_symbols)
if len(self.fb2parser.annotation.getvalue())>0: if len(self.fb2parser.annotation.getvalue())>0:
annotation=('\n'.join(self.fb2parser.annotation.getvalue())) annotation=('\n'.join(self.fb2parser.annotation.getvalue()))
if len(self.fb2parser.docdate.getvalue())>0: if len(self.fb2parser.docdate.getvalue())>0:
...@@ -298,14 +298,14 @@ class opdsScanner: ...@@ -298,14 +298,14 @@ class opdsScanner:
idx=0 idx=0
for l in self.fb2parser.author_last.getvalue(): for l in self.fb2parser.author_last.getvalue():
last_name=l.strip(self.strip_symbols) last_name=l.strip(strip_symbols)
first_name=self.fb2parser.author_first.getvalue()[idx].strip(self.strip_symbols) first_name=self.fb2parser.author_first.getvalue()[idx].strip(strip_symbols)
#author=opdsdb.addauthor(first_name,last_name) #author=opdsdb.addauthor(first_name,last_name)
author=opdsdb.addauthor("%s %s"%(last_name,first_name)) author=opdsdb.addauthor("%s %s"%(last_name,first_name))
opdsdb.addbauthor(book,author) opdsdb.addbauthor(book,author)
idx+=1 idx+=1
for l in self.fb2parser.genre.getvalue(): for l in self.fb2parser.genre.getvalue():
opdsdb.addbgenre(book,opdsdb.addgenre(l.lower().strip(self.strip_symbols))) opdsdb.addbgenre(book,opdsdb.addgenre(l.lower().strip(strip_symbols)))
for l in self.fb2parser.series.attrss: for l in self.fb2parser.series.attrss:
ser_name=l.get('name') ser_name=l.get('name')
if ser_name: if ser_name:
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment