fix XML parsing (#2)

- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling, and the result is much more reliable across a variety of ebooks. Much better.
- Fix chapter selection: look for `<?xml[...]`, which is much more reliable.
- Add option `-x` to dump the XML ebook source for debugging.
2026-02-04 09:40:57 +01:00 · 2025-10-16 18:57:05 +02:00
parent 90d30cb3e1
commit b50c6acff0
13 changed files with 143 additions and 71 deletions
--- a/pkg/epub/content.go
+++ b/pkg/epub/content.go
@@ -1,19 +1,16 @@
 package epub

 import (
-	"encoding/xml"
-	"fmt"
 	"regexp"
 	"strings"
+
+	"github.com/antchfx/xmlquery"
 )

 var (
-	cleantitle    = regexp.MustCompile(`(?s)<head>.*</head>`)
-	cleanmarkup   = regexp.MustCompile(`<[^<>]+>`)
-	cleanentities = regexp.MustCompile(`&.+;`)
-	cleancomments = regexp.MustCompile(`/*.*/`)
-	cleanspace    = regexp.MustCompile(`^\s*`)
-	cleanh1       = regexp.MustCompile(`<h[1-6].*</h[1-6]>`)
+	cleanentitles = regexp.MustCompile(`&.+;`)
+	empty         = regexp.MustCompile(`(?s)^[\s ]*$`)
+	newlines      = regexp.MustCompile(`[\r\n]+`)
 )

 // Content nav-point content
@@ -26,25 +23,30 @@ type Content struct {
 }

 func (c *Content) String(content []byte) error {
-	title := Title{}
-
-	err := xml.Unmarshal(content, &title)
+	// parse XML, look for title and <p>.*</p> stuff
+	doc, err := xmlquery.Parse(
+		strings.NewReader(
+			cleanentitles.ReplaceAllString(string(content), " ")))
 	if err != nil {
-		if !strings.HasPrefix(err.Error(), "XML syntax error") {
-			return fmt.Errorf("XML parser error %w", err)
+		panic(err)
+	}
+
+	// extract the title
+	for _, item := range xmlquery.Find(doc, "//title") {
+		c.Title = strings.TrimSpace(item.InnerText())
+	}
+
+	// extract all paragraphs, ignore any formatting and re-fill each
+	// paragraph, that is, replace every run of newlines inside it
+	// with a single space.
+	txt := strings.Builder{}
+	for _, item := range xmlquery.Find(doc, "//p") {
+		if !empty.MatchString(item.InnerText()) {
+			txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
 		}
 	}

-	c.Title = strings.TrimSpace(title.Content)
-
-	txt := cleantitle.ReplaceAllString(string(content), "")
-	txt = cleanh1.ReplaceAllString(txt, "")
-	txt = cleanmarkup.ReplaceAllString(txt, "")
-	txt = cleanentities.ReplaceAllString(txt, " ")
-	txt = cleancomments.ReplaceAllString(txt, "")
-	txt = strings.TrimSpace(txt)
-
-	c.Body = cleanspace.ReplaceAllString(txt, "")
+	c.Body = strings.TrimSpace(txt.String())
 	c.XML = content

 	if len(c.Body) == 0 {