mirror of
https://codeberg.org/scip/epuppy.git
synced 2025-12-18 04:51:01 +01:00
fix XML parsing (#2)
- Use antchfx/xmlquery for easier XML parsing. No more regexp wrangling, and the result is much more reliable across a variety of ebooks. Much good.
- Fix chapter selection: look for `<?xml[...]`, which is much more reliable.
- Add option `-x` to dump the XML ebook source for debugging.
This commit is contained in:
@@ -1,19 +1,16 @@
|
||||
package epub
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/antchfx/xmlquery"
|
||||
)
|
||||
|
||||
var (
|
||||
cleantitle = regexp.MustCompile(`(?s)<head>.*</head>`)
|
||||
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
|
||||
cleanentities = regexp.MustCompile(`&.+;`)
|
||||
cleancomments = regexp.MustCompile(`/*.*/`)
|
||||
cleanspace = regexp.MustCompile(`^\s*`)
|
||||
cleanh1 = regexp.MustCompile(`<h[1-6].*</h[1-6]>`)
|
||||
cleanentitles = regexp.MustCompile(`&.+;`)
|
||||
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
||||
newlines = regexp.MustCompile(`[\r\n]+`)
|
||||
)
|
||||
|
||||
// Content nav-point content
|
||||
@@ -26,25 +23,30 @@ type Content struct {
|
||||
}
|
||||
|
||||
func (c *Content) String(content []byte) error {
|
||||
title := Title{}
|
||||
|
||||
err := xml.Unmarshal(content, &title)
|
||||
// parse XML, look for title and <p>.*</p> stuff
|
||||
doc, err := xmlquery.Parse(
|
||||
strings.NewReader(
|
||||
cleanentitles.ReplaceAllString(string(content), " ")))
|
||||
if err != nil {
|
||||
if !strings.HasPrefix(err.Error(), "XML syntax error") {
|
||||
return fmt.Errorf("XML parser error %w", err)
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// extract the title
|
||||
for _, item := range xmlquery.Find(doc, "//title") {
|
||||
c.Title = strings.TrimSpace(item.InnerText())
|
||||
}
|
||||
|
||||
// extract all paragraphs, ignore any formatting and re-fill the
|
||||
// paragraph, that is, we replace all newlines inside with one
|
||||
// space.
|
||||
txt := strings.Builder{}
|
||||
for _, item := range xmlquery.Find(doc, "//p") {
|
||||
if !empty.MatchString(item.InnerText()) {
|
||||
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
c.Title = strings.TrimSpace(title.Content)
|
||||
|
||||
txt := cleantitle.ReplaceAllString(string(content), "")
|
||||
txt = cleanh1.ReplaceAllString(txt, "")
|
||||
txt = cleanmarkup.ReplaceAllString(txt, "")
|
||||
txt = cleanentities.ReplaceAllString(txt, " ")
|
||||
txt = cleancomments.ReplaceAllString(txt, "")
|
||||
txt = strings.TrimSpace(txt)
|
||||
|
||||
c.Body = cleanspace.ReplaceAllString(txt, "")
|
||||
c.Body = strings.TrimSpace(txt.String())
|
||||
c.XML = content
|
||||
|
||||
if len(c.Body) == 0 {
|
||||
|
||||
Reference in New Issue
Block a user