Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove html entities
* add opf and ncx debug
* make epub content retrieval more flexible
* fix epub content retrieval: also support html files with <div>
2026-02-04 09:40:57 +01:00 · 2025-10-19 22:30:13 +02:00
parent 2c6e81a2c8
commit cb671b7401
3 changed files with 75 additions and 23 deletions
--- a/pkg/epub/content.go
+++ b/pkg/epub/content.go
@@ -8,9 +8,10 @@ import (
 )

 var (
-	cleanentitles = regexp.MustCompile(`&.+;`)
+	cleanentitles = regexp.MustCompile(`&[a-z]+;`)
 	empty         = regexp.MustCompile(`(?s)^[\s ]*$`)
 	newlines      = regexp.MustCompile(`[\r\n]+`)
+	cleanmarkup   = regexp.MustCompile(`<[^<>]+>`)
 )

 // Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
 	XML   []byte
 }

+// parse XML, look for title and <p>.*</p> stuff
 func (c *Content) String(content []byte) error {
-	// parse XML, look for title and <p>.*</p> stuff
 	doc, err := xmlquery.Parse(
 		strings.NewReader(
 			cleanentitles.ReplaceAllString(string(content), " ")))
 	if err != nil {
-		panic(err)
+		return err
 	}

-	// extract the title
-	for _, item := range xmlquery.Find(doc, "//title") {
-		c.Title = strings.TrimSpace(item.InnerText())
+	if c.Title == "" {
+		// extract the title
+		for _, item := range xmlquery.Find(doc, "//title") {
+			c.Title = strings.TrimSpace(item.InnerText())
+		}
 	}

 	// extract all  paragraphs, ignore any formatting  and re-fill the
-	// paragraph,  that is, we  replaces all newlines inside  with one
+	// paragraph,  that is, we  replace all newlines inside  with one
 	// space.
 	txt := strings.Builder{}
+	var have_p bool
 	for _, item := range xmlquery.Find(doc, "//p") {
 		if !empty.MatchString(item.InnerText()) {
+			have_p = true
 			txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
 		}
 	}

+	if !have_p {
+		// try  <div></div>, which some  ebooks use, so get  all divs,
+		// remove markup and paragraphify the parts
+		for _, item := range xmlquery.Find(doc, "//div") {
+			if !empty.MatchString(item.InnerText()) {
+				cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
+				txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
+			}
+		}
+	}
+
 	c.Body = strings.TrimSpace(txt.String())
 	c.XML = content