From cb671b7401c66edcefc271a172d463548e686bfb Mon Sep 17 00:00:00 2001
From: "T.v.Dein" <git@daemon.de>
Date: Sun, 19 Oct 2025 22:30:13 +0200
Subject: [PATCH] Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove html entities
* add opf and ncx debug
* make epub content retrieval more flexible
* fix epub content retrieval: also support html files with <div>
---
 cmd/view.go         | 10 ++++++++
 pkg/epub/content.go | 30 +++++++++++++++++------
 pkg/epub/open.go    | 58 ++++++++++++++++++++++++++++++++-------------
 3 files changed, 75 insertions(+), 23 deletions(-)
diff --git a/cmd/view.go b/cmd/view.go
index c70a525..eac636d 100644
--- a/cmd/view.go
+++ b/cmd/view.go
@@ -22,6 +22,7 @@ import (
 	"path/filepath"
 	"strings"
 
+	"github.com/alecthomas/repr"
 	"github.com/tlinden/epuppy/pkg/epub"
 )
 
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
 		return 0, err
 	}
 
+	if conf.Debug {
+		repr.Println(book.Files())
+		repr.Println(book.Ncx)
+	}
+
 	buf := strings.Builder{}
 	head := strings.Builder{}
 
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
 		return fmt.Println(buf.String())
 	}
 
+	if conf.Debug {
+		return 0, nil
+	}
+
 	return Pager(&Ebook{
 		Config:    conf,
 		Title:     head.String(),
diff --git a/pkg/epub/content.go b/pkg/epub/content.go
index a29a177..af2273d 100644
--- a/pkg/epub/content.go
+++ b/pkg/epub/content.go
@@ -8,9 +8,10 @@ import (
 )
 
 var (
-	cleanentitles = regexp.MustCompile(`&.+;`)
+	cleanentitles = regexp.MustCompile(`&[a-z]+;`)
 	empty         = regexp.MustCompile(`(?s)^[\s ]*$`)
 	newlines      = regexp.MustCompile(`[\r\n]+`)
+	cleanmarkup   = regexp.MustCompile(`<[^<>]+>`)
 )
 
 // Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
 	XML   []byte
 }
 
+// String parses content as XML and fills in Title (if unset), Body and XML.
 func (c *Content) String(content []byte) error {
-	// parse XML, look for title and <p>.*</p> stuff
 	doc, err := xmlquery.Parse(
 		strings.NewReader(
 			cleanentitles.ReplaceAllString(string(content), " ")))
 	if err != nil {
-		panic(err)
+		return err
 	}
 
-	// extract the title
-	for _, item := range xmlquery.Find(doc, "//title") {
-		c.Title = strings.TrimSpace(item.InnerText())
+	if c.Title == "" {
+		// extract the title
+		for _, item := range xmlquery.Find(doc, "//title") {
+			c.Title = strings.TrimSpace(item.InnerText())
+		}
 	}
 
 	// extract all  paragraphs, ignore any formatting  and re-fill the
-	// paragraph,  that is, we  replaces all newlines inside  with one
+	// paragraph,  that is, we  replace all newlines inside  with one
 	// space.
 	txt := strings.Builder{}
+	var have_p bool
 	for _, item := range xmlquery.Find(doc, "//p") {
 		if !empty.MatchString(item.InnerText()) {
+			have_p = true
 			txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
 		}
 	}
 
+	if !have_p {
+		// no non-empty <p> was found: fall back to <div>, which some
+		// ebooks use; strip markup and emit each div as one paragraph
+		for _, item := range xmlquery.Find(doc, "//div") {
+			if !empty.MatchString(item.InnerText()) {
+				cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
+				txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
+			}
+		}
+	}
+
 	c.Body = strings.TrimSpace(txt.String())
 	c.XML = content
 
diff --git a/pkg/epub/open.go b/pkg/epub/open.go
index a42d2f6..2d4d925 100644
--- a/pkg/epub/open.go
+++ b/pkg/epub/open.go
@@ -6,6 +6,8 @@ import (
 	"log"
 	"os"
 	"strings"
+
+	"github.com/alecthomas/repr"
 )
 
 // Open open a epub file
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
 		}
 	}
 
-	for _, file := range bk.Files() {
-		content, err := bk.readBytes(file)
+	type section struct {
+		file, title string
+	}
+
+	sections := []section{}
+
+	if len(bk.Ncx.Points) > 0 {
+		for _, block := range bk.Ncx.Points {
+			sections = append(sections,
+				section{
+					file:  "OEBPS/" + block.Content.Src,
+					title: block.Text,
+				})
+		}
+	} else {
+		for _, file := range bk.Files() {
+			sections = append(sections,
+				section{
+					file: file,
+				})
+		}
+	}
+
+	for _, section := range sections {
+		content, err := bk.readBytes(section.file)
 		if err != nil {
 			return &bk, err
 		}
 
-		ct := Content{Src: file}
-		if strings.Contains(string(content), "<?xml") {
-			if err := ct.String(content); err != nil {
-				return &bk, err
-			}
-
-			bk.Content = append(bk.Content, ct)
-
-			if dumpxml {
-				fmt.Println(string(ct.XML))
-			}
-		}
-
-		if strings.Contains(file, bk.CoverFile) {
+		if strings.Contains(section.file, bk.CoverFile) {
 			bk.CoverImage = content
 		}
 
+		ct := Content{Src: section.file, Title: section.title}
+
+		if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
+			if err := ct.String(content); err != nil {
+				return &bk, err
+			}
+		}
+
+		if dumpxml {
+			fmt.Println(string(ct.XML))
+		}
+
+		bk.Content = append(bk.Content, ct)
 	}
 
 	if dumpxml {
+		repr.Println(sections)
 		os.Exit(0)
 	}