Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove HTML entities
* add opf and ncx debug
* make EPUB content retrieval more flexible
* fix EPUB content retrieval: also support HTML files with <div>
This commit is contained in:
T.v.Dein
2025-10-19 22:30:13 +02:00
committed by GitHub
parent 2c6e81a2c8
commit cb671b7401
3 changed files with 75 additions and 23 deletions

View File

@@ -22,6 +22,7 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"github.com/alecthomas/repr"
"github.com/tlinden/epuppy/pkg/epub" "github.com/tlinden/epuppy/pkg/epub"
) )
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
return 0, err return 0, err
} }
if conf.Debug {
repr.Println(book.Files())
repr.Println(book.Ncx)
}
buf := strings.Builder{} buf := strings.Builder{}
head := strings.Builder{} head := strings.Builder{}
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
return fmt.Println(buf.String()) return fmt.Println(buf.String())
} }
if conf.Debug {
return 0, nil
}
return Pager(&Ebook{ return Pager(&Ebook{
Config: conf, Config: conf,
Title: head.String(), Title: head.String(),

View File

@@ -8,9 +8,10 @@ import (
) )
var ( var (
cleanentitles = regexp.MustCompile(`&.+;`) cleanentitles = regexp.MustCompile(`&[a-z]+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`) empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n]+`) newlines = regexp.MustCompile(`[\r\n]+`)
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
) )
// Content nav-point content // Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
XML []byte XML []byte
} }
// parse XML, look for title and <p>.*</p> stuff
func (c *Content) String(content []byte) error { func (c *Content) String(content []byte) error {
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse( doc, err := xmlquery.Parse(
strings.NewReader( strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " "))) cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil { if err != nil {
panic(err) return err
} }
// extract the title if c.Title == "" {
for _, item := range xmlquery.Find(doc, "//title") { // extract the title
c.Title = strings.TrimSpace(item.InnerText()) for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
} }
// extract all paragraphs, ignore any formatting and re-fill the // extract all paragraphs, ignore any formatting and re-fill the
// paragraph, that is, we replaces all newlines inside with one // paragraph, that is, we replace all newlines inside with one
// space. // space.
txt := strings.Builder{} txt := strings.Builder{}
var have_p bool
for _, item := range xmlquery.Find(doc, "//p") { for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) { if !empty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
} }
} }
if !have_p {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}
c.Body = strings.TrimSpace(txt.String()) c.Body = strings.TrimSpace(txt.String())
c.XML = content c.XML = content

View File

@@ -6,6 +6,8 @@ import (
"log" "log"
"os" "os"
"strings" "strings"
"github.com/alecthomas/repr"
) )
// Open open a epub file // Open open a epub file
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
} }
} }
for _, file := range bk.Files() { type section struct {
content, err := bk.readBytes(file) file, title string
}
sections := []section{}
if len(bk.Ncx.Points) > 0 {
for _, block := range bk.Ncx.Points {
sections = append(sections,
section{
file: "OEBPS/" + block.Content.Src,
title: block.Text,
})
}
} else {
for _, file := range bk.Files() {
sections = append(sections,
section{
file: file,
})
}
}
for _, section := range sections {
content, err := bk.readBytes(section.file)
if err != nil { if err != nil {
return &bk, err return &bk, err
} }
ct := Content{Src: file} if strings.Contains(section.file, bk.CoverFile) {
if strings.Contains(string(content), "<?xml") {
if err := ct.String(content); err != nil {
return &bk, err
}
bk.Content = append(bk.Content, ct)
if dumpxml {
fmt.Println(string(ct.XML))
}
}
if strings.Contains(file, bk.CoverFile) {
bk.CoverImage = content bk.CoverImage = content
} }
ct := Content{Src: section.file, Title: section.title}
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
if err := ct.String(content); err != nil {
return &bk, err
}
}
if dumpxml {
fmt.Println(string(ct.XML))
}
bk.Content = append(bk.Content, ct)
} }
if dumpxml { if dumpxml {
repr.Println(sections)
os.Exit(0) os.Exit(0)
} }