fix XML parsing (#2)

- Use antchfx/xmlquery for XML parsing instead of hand-written regular expressions. The result is much more reliable across a variety of ebooks.
- Fix chapter selection: detect chapter files by looking for `<?xml[...]` instead of `DOCTYPE`, which is much more reliable.
- add option `-x` to dump the XML ebook source for debugging
This commit is contained in:
T.v.Dein
2025-10-16 18:57:05 +02:00
committed by GitHub
parent 90d30cb3e1
commit b50c6acff0
13 changed files with 143 additions and 71 deletions

View File

@@ -16,8 +16,7 @@ type Book struct {
Container Container `json:"-"`
Mimetype string `json:"-"`
Content []Content
fd *zip.ReadCloser
fd *zip.ReadCloser
}
// Open open resource file
@@ -34,11 +33,6 @@ func (p *Book) Files() []string {
return fns
}
// Close closes the underlying zip reader held in p.fd and returns any
// error reported by that close.
func (p *Book) Close() error {
return p.fd.Close()
}
// -----------------------------------------------------------------------------
func (p *Book) filename(n string) string {
return path.Join(path.Dir(p.Container.Rootfile.Path), n)

View File

@@ -1,19 +1,16 @@
package epub
import (
"encoding/xml"
"fmt"
"regexp"
"strings"
"github.com/antchfx/xmlquery"
)
var (
cleantitle = regexp.MustCompile(`(?s)<head>.*</head>`)
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
cleanentities = regexp.MustCompile(`&.+;`)
cleancomments = regexp.MustCompile(`/*.*/`)
cleanspace = regexp.MustCompile(`^\s*`)
cleanh1 = regexp.MustCompile(`<h[1-6].*</h[1-6]>`)
cleanentitles = regexp.MustCompile(`&.+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n]+`)
)
// Content nav-point content
@@ -26,25 +23,30 @@ type Content struct {
}
func (c *Content) String(content []byte) error {
title := Title{}
err := xml.Unmarshal(content, &title)
// parse XML, look for title and <p>.*</p> stuff
doc, err := xmlquery.Parse(
strings.NewReader(
cleanentitles.ReplaceAllString(string(content), " ")))
if err != nil {
if !strings.HasPrefix(err.Error(), "XML syntax error") {
return fmt.Errorf("XML parser error %w", err)
panic(err)
}
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
c.Title = strings.TrimSpace(item.InnerText())
}
// Extract all paragraphs, ignoring any formatting, and re-fill each
// paragraph: every run of newlines inside it is replaced with a
// single space.
txt := strings.Builder{}
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
c.Title = strings.TrimSpace(title.Content)
txt := cleantitle.ReplaceAllString(string(content), "")
txt = cleanh1.ReplaceAllString(txt, "")
txt = cleanmarkup.ReplaceAllString(txt, "")
txt = cleanentities.ReplaceAllString(txt, " ")
txt = cleancomments.ReplaceAllString(txt, "")
txt = strings.TrimSpace(txt)
c.Body = cleanspace.ReplaceAllString(txt, "")
c.Body = strings.TrimSpace(txt.String())
c.XML = content
if len(c.Body) == 0 {

View File

@@ -1,36 +1,22 @@
package epub
import (
"log"
"testing"
)
func TestEpub(t *testing.T) {
bk, err := open(t, "test.epub")
_, err := open(t, "test.epub")
if err != nil {
t.Fatal(err)
}
defer func() {
if err := bk.Close(); err != nil {
log.Fatal(err)
}
}()
}
func open(t *testing.T, f string) (*Book, error) {
bk, err := Open(f)
bk, err := Open(f, false)
if err != nil {
return nil, err
}
defer func() {
if err := bk.Close(); err != nil {
log.Fatal(err)
}
}()
t.Logf("files: %+v", bk.Files())
t.Logf("book: %+v", bk)

View File

@@ -7,8 +7,9 @@ type Ncx struct {
// NavPoint nav point
type NavPoint struct {
Text string `xml:"navLabel>text" json:"text"`
Points []NavPoint `xml:"navPoint" json:"points"`
Text string `xml:"navLabel>text" json:"text"`
Content Content `xml:"content" json:"content"`
Points []NavPoint `xml:"navPoint" json:"points"`
}
type Title struct {

View File

@@ -2,12 +2,14 @@ package epub
import (
"archive/zip"
"fmt"
"log"
"os"
"strings"
)
// Open opens an epub file.
func Open(fn string) (*Book, error) {
func Open(fn string, dumpxml bool) (*Book, error) {
fd, err := zip.OpenReader(fn)
if err != nil {
return nil, err
@@ -55,14 +57,21 @@ func Open(fn string) (*Book, error) {
}
ct := Content{Src: file}
if strings.Contains(string(content), "DOCTYPE") {
if strings.Contains(string(content), "<?xml") {
if err := ct.String(content); err != nil {
return &bk, err
}
}
bk.Content = append(bk.Content, ct)
if dumpxml {
fmt.Println(string(ct.XML))
}
}
if dumpxml {
os.Exit(0)
}
return &bk, nil