Files
epuppy/pkg/epub/content.go
T.v.Dein b50c6acff0 fix XML parsing (#2)
- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling and the result is much more reliable over a variety of ebooks. Much good.
- fix chapter selection, look for `<?xml[...]` which is much more reliable
- add option `-x` to dump the XML ebook source for debugging
2025-10-16 18:57:05 +02:00

58 lines
1.2 KiB
Go
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package epub
import (
"regexp"
"strings"
"github.com/antchfx/xmlquery"
)
var (
cleanentitles = regexp.MustCompile(`&.+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n]+`)
)
// Content nav-point content
type Content struct {
Src string `xml:"src,attr" json:"src"`
Empty bool
Body string
Title string
XML []byte
}
func (c *Content) String(content []byte) error {
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse(
strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil {
panic(err)
}
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
// extract all paragraphs, ignore any formatting and re-fill the
// paragraph, that is, we replaces all newlines inside with one
// space.
txt := strings.Builder{}
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
c.Body = strings.TrimSpace(txt.String())
c.XML = content
if len(c.Body) == 0 {
c.Empty = true
}
return nil
}