fix XML parsing (#2)

- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling and the result is much more reliable over a variety of ebooks. Much good.
- fix chapter selection: look for `<?xml[...]` instead of `DOCTYPE`, which is much more reliable
- add option `-x` to dump the XML ebook source for debugging
This commit is contained in:
T.v.Dein
2025-10-16 18:57:05 +02:00
committed by GitHub
parent 90d30cb3e1
commit b50c6acff0
13 changed files with 143 additions and 71 deletions

View File

@@ -2,12 +2,14 @@ package epub
import (
"archive/zip"
"fmt"
"log"
"os"
"strings"
)
// Open opens an epub file.
func Open(fn string) (*Book, error) {
func Open(fn string, dumpxml bool) (*Book, error) {
fd, err := zip.OpenReader(fn)
if err != nil {
return nil, err
@@ -55,14 +57,21 @@ func Open(fn string) (*Book, error) {
}
ct := Content{Src: file}
if strings.Contains(string(content), "DOCTYPE") {
if strings.Contains(string(content), "<?xml") {
if err := ct.String(content); err != nil {
return &bk, err
}
}
bk.Content = append(bk.Content, ct)
if dumpxml {
fmt.Println(string(ct.XML))
}
}
if dumpxml {
os.Exit(0)
}
return &bk, nil