Mirror of https://codeberg.org/scip/epuppy.git, synced 2025-12-16 20:11:00 +01:00
fix XML parsing (#2)
- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling, and the result is much more reliable over a variety of ebooks. Much good. (A small usage sketch of the xmlquery approach follows below.)
- Fix chapter selection: look for `<?xml[...]`, which is much more reliable.
- Add option `-x` to dump the XML ebook source for debugging.
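As a point of reference, here is a minimal, self-contained sketch of the xmlquery style of parsing this commit adopts. The sample chapter string and the `main` wrapper are illustrative only and are not part of the repository; the `xmlquery.Parse`, `xmlquery.Find`, and `InnerText` calls are the same ones used in the diff below.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/antchfx/xmlquery"
)

func main() {
	// A tiny stand-in for one XHTML chapter of an ebook.
	chapter := `<?xml version="1.0"?>
<html><head><title>Chapter One</title></head>
<body><p>First paragraph.</p><p>Second
paragraph.</p></body></html>`

	// Parse the document once; xmlquery builds a node tree that can be
	// queried with XPath expressions instead of regular expressions.
	doc, err := xmlquery.Parse(strings.NewReader(chapter))
	if err != nil {
		panic(err)
	}

	// Grab the chapter title and every <p> element, roughly what the
	// rewritten Content.String() in this commit does.
	for _, n := range xmlquery.Find(doc, "//title") {
		fmt.Println("title:", strings.TrimSpace(n.InnerText()))
	}
	for _, p := range xmlquery.Find(doc, "//p") {
		// Collapse internal newlines to single spaces, re-filling
		// the paragraph the way the commit's newlines regexp does.
		fmt.Println("para:", strings.Join(strings.Fields(p.InnerText()), " "))
	}
}
```

The point of the switch is that XPath queries like `//title` and `//p` tolerate markup variations across ebooks that the old regular expressions tripped over.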
@@ -16,8 +16,7 @@ type Book struct {
    Container Container `json:"-"`
    Mimetype  string    `json:"-"`
    Content   []Content

    fd *zip.ReadCloser
    fd *zip.ReadCloser
}

// Open open resource file

@@ -34,11 +33,6 @@ func (p *Book) Files() []string {
    return fns
}

// Close close file reader
func (p *Book) Close() error {
    return p.fd.Close()
}

// -----------------------------------------------------------------------------
func (p *Book) filename(n string) string {
    return path.Join(path.Dir(p.Container.Rootfile.Path), n)
@@ -1,19 +1,16 @@
package epub

import (
    "encoding/xml"
    "fmt"
    "regexp"
    "strings"

    "github.com/antchfx/xmlquery"
)

var (
    cleantitle    = regexp.MustCompile(`(?s)<head>.*</head>`)
    cleanmarkup   = regexp.MustCompile(`<[^<>]+>`)
    cleanentities = regexp.MustCompile(`&.+;`)
    cleancomments = regexp.MustCompile(`/*.*/`)
    cleanspace    = regexp.MustCompile(`^\s*`)
    cleanh1       = regexp.MustCompile(`<h[1-6].*</h[1-6]>`)
    cleanentitles = regexp.MustCompile(`&.+;`)
    empty         = regexp.MustCompile(`(?s)^[\s ]*$`)
    newlines      = regexp.MustCompile(`[\r\n]+`)
)

// Content nav-point content

@@ -26,25 +23,30 @@ type Content struct {
}

func (c *Content) String(content []byte) error {
    title := Title{}

    err := xml.Unmarshal(content, &title)
    // parse XML, look for title and <p>.*</p> stuff
    doc, err := xmlquery.Parse(
        strings.NewReader(
            cleanentitles.ReplaceAllString(string(content), " ")))
    if err != nil {
        if !strings.HasPrefix(err.Error(), "XML syntax error") {
            return fmt.Errorf("XML parser error %w", err)
        panic(err)
    }

    // extract the title
    for _, item := range xmlquery.Find(doc, "//title") {
        c.Title = strings.TrimSpace(item.InnerText())
    }

    // extract all paragraphs, ignore any formatting and re-fill the
    // paragraph, that is, we replaces all newlines inside with one
    // space.
    txt := strings.Builder{}
    for _, item := range xmlquery.Find(doc, "//p") {
        if !empty.MatchString(item.InnerText()) {
            txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
        }
    }

    c.Title = strings.TrimSpace(title.Content)

    txt := cleantitle.ReplaceAllString(string(content), "")
    txt = cleanh1.ReplaceAllString(txt, "")
    txt = cleanmarkup.ReplaceAllString(txt, "")
    txt = cleanentities.ReplaceAllString(txt, " ")
    txt = cleancomments.ReplaceAllString(txt, "")
    txt = strings.TrimSpace(txt)

    c.Body = cleanspace.ReplaceAllString(txt, "")
    c.Body = strings.TrimSpace(txt.String())
    c.XML = content

    if len(c.Body) == 0 {
@@ -1,36 +1,22 @@
package epub

import (
    "log"
    "testing"
)

func TestEpub(t *testing.T) {
    bk, err := open(t, "test.epub")
    _, err := open(t, "test.epub")
    if err != nil {
        t.Fatal(err)
    }

    defer func() {
        if err := bk.Close(); err != nil {
            log.Fatal(err)
        }
    }()

}

func open(t *testing.T, f string) (*Book, error) {
    bk, err := Open(f)
    bk, err := Open(f, false)
    if err != nil {
        return nil, err
    }

    defer func() {
        if err := bk.Close(); err != nil {
            log.Fatal(err)
        }
    }()

    t.Logf("files: %+v", bk.Files())
    t.Logf("book: %+v", bk)
@@ -7,8 +7,9 @@ type Ncx struct {

// NavPoint nav point
type NavPoint struct {
    Text    string     `xml:"navLabel>text" json:"text"`
    Points  []NavPoint `xml:"navPoint" json:"points"`
    Text    string     `xml:"navLabel>text" json:"text"`
    Content Content    `xml:"content" json:"content"`
    Points  []NavPoint `xml:"navPoint" json:"points"`
}

type Title struct {
@@ -2,12 +2,14 @@ package epub

import (
    "archive/zip"
    "fmt"
    "log"
    "os"
    "strings"
)

// Open open a epub file
func Open(fn string) (*Book, error) {
func Open(fn string, dumpxml bool) (*Book, error) {
    fd, err := zip.OpenReader(fn)
    if err != nil {
        return nil, err

@@ -55,14 +57,21 @@ func Open(fn string) (*Book, error) {
        }

        ct := Content{Src: file}

        if strings.Contains(string(content), "DOCTYPE") {
        if strings.Contains(string(content), "<?xml") {
            if err := ct.String(content); err != nil {
                return &bk, err
            }
        }

        bk.Content = append(bk.Content, ct)

        if dumpxml {
            fmt.Println(string(ct.XML))
        }
    }

    if dumpxml {
        os.Exit(0)
    }

    return &bk, nil
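For reference, a hypothetical test-style sketch of the updated API, using only identifiers visible in the diff above (`Open` with its new `dumpxml` argument, `Book.Content`, `Book.Close`, and the `Content` fields `Src`, `Title`, `Body`); it is not part of the commit.

```go
package epub

import "testing"

// Illustrative sketch only: exercises the Open signature changed in this
// commit. The second argument is the new dumpxml flag; per the diff, passing
// true prints each chapter's XML source and exits, so tests pass false.
func TestOpenUsageSketch(t *testing.T) {
	bk, err := Open("test.epub", false)
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := bk.Close(); err != nil {
			t.Fatal(err)
		}
	}()

	// Each Content entry carries the chapter title and flattened body
	// text extracted by the xmlquery-based Content.String.
	for _, c := range bk.Content {
		t.Logf("%s: %q (%d bytes of text)", c.Src, c.Title, len(c.Body))
	}
}
```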