From cb671b7401c66edcefc271a172d463548e686bfb Mon Sep 17 00:00:00 2001 From: "T.v.Dein" Date: Sun, 19 Oct 2025 22:30:13 +0200 Subject: [PATCH] Fix crash and add support for content in <div>
(#9) * fix #8: better regex to remove html entities * add opf and ncx debug * make epub content retrieval more flexible * fix epub content retrieval: also support html files with
--- cmd/view.go | 10 ++++++++ pkg/epub/content.go | 30 +++++++++++++++++------ pkg/epub/open.go | 58 ++++++++++++++++++++++++++++++++------------- 3 files changed, 75 insertions(+), 23 deletions(-) diff --git a/cmd/view.go b/cmd/view.go index c70a525..eac636d 100644 --- a/cmd/view.go +++ b/cmd/view.go @@ -22,6 +22,7 @@ import ( "path/filepath" "strings" + "github.com/alecthomas/repr" "github.com/tlinden/epuppy/pkg/epub" ) @@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) { return 0, err } + if conf.Debug { + repr.Println(book.Files()) + repr.Println(book.Ncx) + } + buf := strings.Builder{} head := strings.Builder{} @@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) { return fmt.Println(buf.String()) } + if conf.Debug { + return 0, nil + } + return Pager(&Ebook{ Config: conf, Title: head.String(), diff --git a/pkg/epub/content.go b/pkg/epub/content.go index a29a177..af2273d 100644 --- a/pkg/epub/content.go +++ b/pkg/epub/content.go @@ -8,9 +8,10 @@ import ( ) var ( - cleanentitles = regexp.MustCompile(`&.+;`) + cleanentitles = regexp.MustCompile(`&[a-z]+;`) empty = regexp.MustCompile(`(?s)^[\s ]*$`) newlines = regexp.MustCompile(`[\r\n]+`) + cleanmarkup = regexp.MustCompile(`<[^<>]+>`) ) // Content nav-point content @@ -22,30 +23,45 @@ type Content struct { XML []byte } +// parse XML, look for title and

<p>.*</p>

stuff func (c *Content) String(content []byte) error { - // parse XML, look for title and

<p>.*</p>

stuff doc, err := xmlquery.Parse( strings.NewReader( cleanentitles.ReplaceAllString(string(content), " "))) if err != nil { - panic(err) + return err } - // extract the title - for _, item := range xmlquery.Find(doc, "//title") { - c.Title = strings.TrimSpace(item.InnerText()) + if c.Title == "" { + // extract the title + for _, item := range xmlquery.Find(doc, "//title") { + c.Title = strings.TrimSpace(item.InnerText()) + } } // extract all paragraphs, ignore any formatting and re-fill the - // paragraph, that is, we replaces all newlines inside with one + // paragraph, that is, we replace all newlines inside with one // space. txt := strings.Builder{} + var have_p bool for _, item := range xmlquery.Find(doc, "//p") { if !empty.MatchString(item.InnerText()) { + have_p = true txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") } } + if !have_p { + // try
<div>, which some ebooks use, so get all divs, + // remove markup and paragraphify the parts + for _, item := range xmlquery.Find(doc, "//div") { + if !empty.MatchString(item.InnerText()) { + cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "") + txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n") + } + } + } + c.Body = strings.TrimSpace(txt.String()) c.XML = content diff --git a/pkg/epub/open.go b/pkg/epub/open.go index a42d2f6..2d4d925 100644 --- a/pkg/epub/open.go +++ b/pkg/epub/open.go @@ -6,6 +6,8 @@ import ( "log" "os" "strings" + + "github.com/alecthomas/repr" ) // Open open a epub file @@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) { } } - for _, file := range bk.Files() { - content, err := bk.readBytes(file) + type section struct { + file, title string + } + + sections := []section{} + + if len(bk.Ncx.Points) > 0 { + for _, block := range bk.Ncx.Points { + sections = append(sections, + section{ + file: "OEBPS/" + block.Content.Src, + title: block.Text, + }) + } + } else { + for _, file := range bk.Files() { + sections = append(sections, + section{ + file: file, + }) + } + } + + for _, section := range sections { + content, err := bk.readBytes(section.file) if err != nil { return &bk, err } - ct := Content{Src: file} - if strings.Contains(string(content), "