Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove html entities
* add opf and ncx debug
* make epub content retrieval more flexible
* fix epub content retrieval: also support html files with <div>
This commit is contained in:
T.v.Dein
2025-10-19 22:30:13 +02:00
committed by GitHub
parent 2c6e81a2c8
commit cb671b7401
3 changed files with 75 additions and 23 deletions

View File

@@ -8,9 +8,10 @@ import (
)
// Precompiled regular expressions used by Content.String.
var (
// NOTE(review): cleanentitles appears twice because this view seems to
// interleave the pre-change and post-change lines of a diff; presumably
// this first pattern is the one being replaced — confirm against the
// actual file. Its `.+` is greedy, so a match runs from the first '&' to
// the last ';' on a line and swallows any text in between.
cleanentitles = regexp.MustCompile(`&.+;`)
// Matches a named entity: '&', one or more lowercase ASCII letters, ';'.
// Numeric references (e.g. &#160;) and names containing uppercase letters
// or digits are not matched. Content.String replaces each match with a
// single space before handing the text to the XML parser.
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
// Matches text that is empty or consists only of whitespace. The (?s)
// flag only changes how '.' behaves, and this pattern contains no '.'.
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
// Matches a run of one or more consecutive CR/LF characters.
newlines = regexp.MustCompile(`[\r\n]+`)
// Matches a tag-like span: '<', one or more characters other than '<' or
// '>', then '>'.
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
)
// Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
XML []byte
}
// parse XML, look for title and <p>.*</p> stuff
func (c *Content) String(content []byte) error {
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse(
strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil {
panic(err)
return err
}
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
if c.Title == "" {
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
}
// extract all paragraphs, ignore any formatting and re-fill the
// paragraph, that is, we replaces all newlines inside with one
// paragraph, that is, we replace all newlines inside with one
// space.
txt := strings.Builder{}
var have_p bool
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
if !have_p {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}
c.Body = strings.TrimSpace(txt.String())
c.XML = content