Fix crash and add support for content in <div> (#9)

* fix #8: better regex to remove HTML entities
* add opf and ncx debug
* make EPUB content retrieval more flexible
* fix EPUB content retrieval: also support HTML files with <div>
This commit is contained in:
T.v.Dein
2025-10-19 22:30:13 +02:00
committed by GitHub
parent 2c6e81a2c8
commit cb671b7401
3 changed files with 75 additions and 23 deletions

View File

@@ -22,6 +22,7 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"github.com/alecthomas/repr"
"github.com/tlinden/epuppy/pkg/epub" "github.com/tlinden/epuppy/pkg/epub"
) )
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
return 0, err return 0, err
} }
if conf.Debug {
repr.Println(book.Files())
repr.Println(book.Ncx)
}
buf := strings.Builder{} buf := strings.Builder{}
head := strings.Builder{} head := strings.Builder{}
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
return fmt.Println(buf.String()) return fmt.Println(buf.String())
} }
if conf.Debug {
return 0, nil
}
return Pager(&Ebook{ return Pager(&Ebook{
Config: conf, Config: conf,
Title: head.String(), Title: head.String(),

View File

@@ -8,9 +8,10 @@ import (
) )
var ( var (
cleanentitles = regexp.MustCompile(`&.+;`) cleanentitles = regexp.MustCompile(`&[a-z]+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`) empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n]+`) newlines = regexp.MustCompile(`[\r\n]+`)
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
) )
// Content nav-point content // Content nav-point content
@@ -22,30 +23,45 @@ type Content struct {
XML []byte XML []byte
} }
// parse XML, look for title and <p>.*</p> stuff
func (c *Content) String(content []byte) error { func (c *Content) String(content []byte) error {
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse( doc, err := xmlquery.Parse(
strings.NewReader( strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " "))) cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil { if err != nil {
panic(err) return err
} }
// extract the title if c.Title == "" {
for _, item := range xmlquery.Find(doc, "//title") { // extract the title
c.Title = strings.TrimSpace(item.InnerText()) for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
} }
// extract all paragraphs, ignore any formatting and re-fill the // extract all paragraphs, ignore any formatting and re-fill the
// paragraph, that is, we replaces all newlines inside with one // paragraph, that is, we replace all newlines inside with one
// space. // space.
txt := strings.Builder{} txt := strings.Builder{}
var have_p bool
for _, item := range xmlquery.Find(doc, "//p") { for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) { if !empty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
} }
} }
if !have_p {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}
c.Body = strings.TrimSpace(txt.String()) c.Body = strings.TrimSpace(txt.String())
c.XML = content c.XML = content

View File

@@ -6,6 +6,8 @@ import (
"log" "log"
"os" "os"
"strings" "strings"
"github.com/alecthomas/repr"
) )
// Open open a epub file // Open open a epub file
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
} }
} }
for _, file := range bk.Files() { type section struct {
content, err := bk.readBytes(file) file, title string
}
sections := []section{}
if len(bk.Ncx.Points) > 0 {
for _, block := range bk.Ncx.Points {
sections = append(sections,
section{
file: "OEBPS/" + block.Content.Src,
title: block.Text,
})
}
} else {
for _, file := range bk.Files() {
sections = append(sections,
section{
file: file,
})
}
}
for _, section := range sections {
content, err := bk.readBytes(section.file)
if err != nil { if err != nil {
return &bk, err return &bk, err
} }
ct := Content{Src: file} if strings.Contains(section.file, bk.CoverFile) {
if strings.Contains(string(content), "<?xml") {
if err := ct.String(content); err != nil {
return &bk, err
}
bk.Content = append(bk.Content, ct)
if dumpxml {
fmt.Println(string(ct.XML))
}
}
if strings.Contains(file, bk.CoverFile) {
bk.CoverImage = content bk.CoverImage = content
} }
ct := Content{Src: section.file, Title: section.title}
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
if err := ct.String(content); err != nil {
return &bk, err
}
}
if dumpxml {
fmt.Println(string(ct.XML))
}
bk.Content = append(bk.Content, ct)
} }
if dumpxml { if dumpxml {
repr.Println(sections)
os.Exit(0) os.Exit(0)
} }