fix bug when reading epubs created with mobitool (#5)

This commit is contained in:
T. von Dein
2026-01-05 11:19:48 +01:00
parent 02c99da8e9
commit 85d23c42f0
4 changed files with 53 additions and 19 deletions

View File

@@ -8,11 +8,12 @@ import (
)
// Precompiled regular expressions used while extracting text from chapter
// content. The lowercase names in the first group carry exactly the same
// patterns as the correspondingly named MixedCaps variables below them.
var (
// cleanentitles matches named character references made of lowercase
// letters (e.g. &nbsp;); they are replaced by a space before XML parsing.
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
// empty matches text that consists solely of whitespace (or nothing).
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
// newlines matches any run of whitespace, including CR/LF; each run is
// collapsed into a single space when paragraph text is emitted.
newlines = regexp.MustCompile(`[\r\n\s]+`)
// cleansvg matches <svg…</svg> spans and <![CDATA[…]]> sections, which are
// removed before parsing. There is no (?s) flag, so "." does not cross a
// newline and a match stays within one line; ".+" is greedy.
cleansvg = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
// cleanmarkup matches a single tag-like <…> sequence containing no nested
// angle brackets; it is stripped from <div> text.
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
// cleanenTitles matches named character references made of lowercase
// letters (e.g. &nbsp;); they are replaced by a space before XML parsing.
cleanenTitles = regexp.MustCompile(`&[a-z]+;`)
// isEmpty matches text that consists solely of whitespace (or nothing);
// used to skip <p>/<div> elements without visible text.
isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)
// cleanNewlines matches any run of whitespace, including CR/LF; each run
// is collapsed into a single space when paragraph text is emitted.
cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)
// cleanSVG matches <svg…</svg> spans and <![CDATA[…]]> sections, which are
// removed before parsing. There is no (?s) flag, so "." does not cross a
// newline and a match stays within one line; ".+" is greedy.
cleanSVG = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
// cleanMarkup matches a single tag-like <…> sequence containing no nested
// angle brackets; it is stripped from <div> text.
cleanMarkup = regexp.MustCompile(`<[^<>]+>`)
// cleanMobiPageBreaks matches the literal self-closing <mbp:pagebreak/>
// element (no whitespace variants). It is substituted with a plain <span>
// when the XML parser reports that the mbp namespace is missing.
cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
)
// Content nav-point content
@@ -25,13 +26,28 @@ type Content struct {
}
// Parse the XML content, then look for the title and the text of <p>...</p> elements.
func (c *Content) String(content []byte) error {
doc, err := xmlquery.Parse(
strings.NewReader(
cleansvg.ReplaceAllString(
cleanentitles.ReplaceAllString(string(content), " "), "")))
func (c *Content) Extract(content []byte) error {
rawXML := cleanSVG.ReplaceAllString(
cleanenTitles.ReplaceAllString(string(content), " "), "")
var doc *xmlquery.Node
var err error
doc, err = xmlquery.Parse(strings.NewReader(rawXML))
if err != nil {
return err
if strings.Contains(err.Error(), `namespace mbp is missing`) {
fixedmbp := strings.NewReader(
cleanMobiPageBreaks.ReplaceAllString(
rawXML, `<span style="page-break-after: always" />`))
doc, err = xmlquery.Parse(fixedmbp)
if err != nil {
return err
}
} else {
return err
}
}
if c.Title == "" {
@@ -47,9 +63,9 @@ func (c *Content) String(content []byte) error {
txt := strings.Builder{}
var have_p bool
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
if !isEmpty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
@@ -57,9 +73,9 @@ func (c *Content) String(content []byte) error {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
if !isEmpty.MatchString(item.InnerText()) {
cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}