Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove HTML entities
* add opf and ncx debug
* make epub content retrieval more flexible
* fix epub content retrieval: also support HTML files with <div>
This commit is contained in:
T.v.Dein
2025-10-19 22:30:13 +02:00
committed by GitHub
parent 2c6e81a2c8
commit cb671b7401
3 changed files with 75 additions and 23 deletions

View File

@@ -22,6 +22,7 @@ import (
"path/filepath"
"strings"
"github.com/alecthomas/repr"
"github.com/tlinden/epuppy/pkg/epub"
)
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
return 0, err
}
if conf.Debug {
repr.Println(book.Files())
repr.Println(book.Ncx)
}
buf := strings.Builder{}
head := strings.Builder{}
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
return fmt.Println(buf.String())
}
if conf.Debug {
return 0, nil
}
return Pager(&Ebook{
Config: conf,
Title: head.String(),

View File

@@ -8,9 +8,10 @@ import (
)
var (
cleanentitles = regexp.MustCompile(`&.+;`)
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n]+`)
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
)
// Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
XML []byte
}
// parse XML, look for title and <p>.*</p> stuff
func (c *Content) String(content []byte) error {
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse(
strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil {
panic(err)
return err
}
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
if c.Title == "" {
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
}
// extract all paragraphs, ignore any formatting and re-fill the
// paragraph, that is, we replaces all newlines inside with one
// paragraph, that is, we replace all newlines inside with one
// space.
txt := strings.Builder{}
var have_p bool
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
if !have_p {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}
c.Body = strings.TrimSpace(txt.String())
c.XML = content

View File

@@ -6,6 +6,8 @@ import (
"log"
"os"
"strings"
"github.com/alecthomas/repr"
)
// Open open a epub file
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
}
}
for _, file := range bk.Files() {
content, err := bk.readBytes(file)
type section struct {
file, title string
}
sections := []section{}
if len(bk.Ncx.Points) > 0 {
for _, block := range bk.Ncx.Points {
sections = append(sections,
section{
file: "OEBPS/" + block.Content.Src,
title: block.Text,
})
}
} else {
for _, file := range bk.Files() {
sections = append(sections,
section{
file: file,
})
}
}
for _, section := range sections {
content, err := bk.readBytes(section.file)
if err != nil {
return &bk, err
}
ct := Content{Src: file}
if strings.Contains(string(content), "<?xml") {
if err := ct.String(content); err != nil {
return &bk, err
}
bk.Content = append(bk.Content, ct)
if dumpxml {
fmt.Println(string(ct.XML))
}
}
if strings.Contains(file, bk.CoverFile) {
if strings.Contains(section.file, bk.CoverFile) {
bk.CoverImage = content
}
ct := Content{Src: section.file, Title: section.title}
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
if err := ct.String(content); err != nil {
return &bk, err
}
}
if dumpxml {
fmt.Println(string(ct.XML))
}
bk.Content = append(bk.Content, ct)
}
if dumpxml {
repr.Println(sections)
os.Exit(0)
}