Files
epuppy/pkg/epub/content.go
T.v.Dein b50c6acff0 fix XML parsing (#2)
- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling and the result is much more reliable over a variety of ebooks. Much good.
- fix chapter selection, look for `<?xml[...]` which is much more reliable
- add option `-x` to dump the XML ebook source for debugging
2025-10-16 18:57:05 +02:00

58 lines
1.2 KiB
Go
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package epub
import (
"regexp"
"strings"
"github.com/antchfx/xmlquery"
)
// Patterns used while extracting text from content documents.
var (
	// cleanentitles matches one character reference, e.g. "&nbsp;",
	// "&#160;" or "&#x00A0;". The match is bounded to the reference itself:
	// a greedy `&.+;` would swallow everything between the first "&" and the
	// last ";" on a line, including ordinary text and markup.
	cleanentitles = regexp.MustCompile(`&#?[0-9A-Za-z]+;`)

	// empty matches text made up of whitespace only (including the empty
	// string). Besides the ASCII whitespace covered by \s it accepts the
	// non-breaking space U+00A0, spelled as an escape so it stays visible.
	// NOTE(review): the previous literal contained an invisible character
	// at this position; it is assumed to be U+00A0 - confirm.
	empty = regexp.MustCompile(`(?s)^[\s\x{00A0}]*$`)

	// newlines matches a run of one or more CR/LF characters.
	newlines = regexp.MustCompile(`[\r\n]+`)
)
// Content nav-point content: the reference to a content document together
// with the text that String extracts from that document.
type Content struct {
	// Src is read from the "src" XML attribute and written as "src" in JSON.
	// Presumably the location of the content document - verify against the
	// code that unmarshals the nav-points.
	Src string `xml:"src,attr" json:"src"`
	// Empty is set to true by String when no paragraph text was extracted.
	Empty bool
	// Body is the paragraph text collected by String, one paragraph per
	// block, with blocks separated by a blank line.
	Body string
	// Title is the whitespace-trimmed text of a <title> element, as set by
	// String.
	Title string
	// XML is the raw document that was handed to String, stored unmodified.
	XML []byte
}
// String parses content as XML and stores what it extracts on c. Despite
// its name it does not produce a string; the results are written to the
// receiver:
//
//   - Title is the trimmed inner text of the last <title> element found; it
//     is left unchanged when the document has none.
//   - Body is the inner text of every non-blank <p> element, each with its
//     line breaks folded into single spaces, joined by blank lines and
//     trimmed.
//   - XML is the content slice exactly as passed in.
//   - Empty reports whether Body ended up empty.
//
// Before parsing, every match of cleanentitles is replaced by a single
// space (presumably so that entities unknown to the XML parser do not make
// it fail - confirm).
//
// If content cannot be parsed, the parser's error is returned and c is not
// modified.
func (c *Content) String(content []byte) error {
	cleaned := cleanentitles.ReplaceAllString(string(content), " ")

	doc, err := xmlquery.Parse(strings.NewReader(cleaned))
	if err != nil {
		// The signature already promises an error; report the failure to
		// the caller rather than aborting the whole program.
		return err
	}

	// With several <title> elements the last one wins.
	for _, node := range xmlquery.Find(doc, "//title") {
		c.Title = strings.TrimSpace(node.InnerText())
	}

	// Collect all paragraphs, ignoring any formatting, and re-fill each
	// one: every run of newlines inside a paragraph becomes one space.
	var body strings.Builder
	for _, node := range xmlquery.Find(doc, "//p") {
		text := node.InnerText()
		if empty.MatchString(text) {
			continue
		}

		body.WriteString(newlines.ReplaceAllString(text, " "))
		body.WriteString("\n\n")
	}

	c.Body = strings.TrimSpace(body.String())
	c.XML = content
	// Assign in both directions so a reused Content does not keep a stale
	// flag from an earlier call.
	c.Empty = len(c.Body) == 0

	return nil
}