Files
epuppy/pkg/epub/content.go

92 lines
2.2 KiB
Go
Raw Permalink Normal View History

package epub
import (
"regexp"
"strings"
"github.com/antchfx/xmlquery"
)
// Regular expressions used by Content.Extract to pre-clean raw markup
// before XML parsing and to normalize the text pulled out of it.
var (
	// cleanenTitles matches named character entity references made of
	// lowercase letters, such as &nbsp; or &amp;.
	cleanenTitles = regexp.MustCompile(`&[a-z]+;`)

	// isEmpty matches text that consists solely of whitespace. Go's \s
	// is ASCII-only, so the no-break space (U+00A0) is listed
	// explicitly.
	isEmpty = regexp.MustCompile(`(?s)^[\s\x{00A0}]*$`)

	// cleanNewlines matches any run of whitespace, line breaks included.
	cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)

	// cleanSVG matches inline <svg>...</svg> elements and CDATA sections.
	// (?s) lets "." cross line breaks so multi-line sections are matched,
	// and the lazy quantifiers stop at the first closing marker so that
	// content between two separate sections is not swallowed.
	cleanSVG = regexp.MustCompile(`(?s)(<svg.+?</svg>|<!\[CDATA\[.+?\]\]>)`)

	// cleanMarkup matches a single markup tag (<...>).
	cleanMarkup = regexp.MustCompile(`<[^<>]+>`)

	// cleanMobiPageBreaks matches the <mbp:pagebreak/> element.
	cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
)
// Content is the content document referenced by a nav-point, together
// with the text that Extract pulls out of it.
type Content struct {
	// Src is read from the "src" XML attribute and written to JSON as
	// "src".
	Src string `xml:"src,attr" json:"src"`

	// Empty is set by Extract when no body text could be extracted.
	Empty bool

	// Body is the plain text assembled by Extract.
	Body string

	// Title is the document title; Extract fills it in only when it is
	// still empty.
	Title string

	// XML is the unmodified input that was handed to Extract.
	XML []byte
}
// Extract parses content as XML and stores the result on c: the title
// (only when c.Title is still empty), the plain-text body, the original
// bytes in c.XML, and c.Empty, which reports whether the body is empty.
//
// Before parsing, named character entities matched by cleanenTitles are
// replaced by a space and everything matched by cleanSVG is removed.
// Body text is taken from all non-blank <p> elements; if there are none,
// <div> elements are used instead. Each element becomes one paragraph
// with its internal whitespace collapsed to single spaces, and paragraphs
// are separated by a blank line.
//
// A parse error is returned unchanged; in that case c is not modified.
func (c *Content) Extract(content []byte) error {
	rawXML := cleanSVG.ReplaceAllString(
		cleanenTitles.ReplaceAllString(string(content), " "), "")

	doc, err := xmlquery.Parse(strings.NewReader(rawXML))
	if err != nil {
		// Only the undeclared "mbp" namespace is recoverable. The parser
		// exposes no typed error for it here, so match on the message.
		if !strings.Contains(err.Error(), `namespace mbp is missing`) {
			return err
		}

		// Swap the Mobipocket page-break element for a plain span and
		// parse once more.
		fixed := cleanMobiPageBreaks.ReplaceAllString(
			rawXML, `<span style="page-break-after: always" />`)
		doc, err = xmlquery.Parse(strings.NewReader(fixed))
		if err != nil {
			return err
		}
	}

	if c.Title == "" {
		// If the document has several <title> elements, the last one
		// wins.
		for _, item := range xmlquery.Find(doc, "//title") {
			c.Title = strings.TrimSpace(item.InnerText())
		}
	}

	// Collect all paragraphs, ignore any formatting and re-fill each
	// paragraph, i.e. replace every whitespace run inside it with one
	// space.
	var txt strings.Builder
	haveP := false
	for _, item := range xmlquery.Find(doc, "//p") {
		text := item.InnerText()
		if isEmpty.MatchString(text) {
			continue
		}
		haveP = true
		txt.WriteString(cleanNewlines.ReplaceAllString(text, " "))
		txt.WriteString("\n\n")
	}

	if !haveP {
		// Some ebooks use <div> instead of <p>: take every div, strip
		// anything that still looks like markup and paragraphify it.
		// NOTE(review): "//div" also selects nested divs, so text inside
		// nested divs is presumably emitted more than once — confirm
		// whether that is acceptable.
		for _, item := range xmlquery.Find(doc, "//div") {
			text := item.InnerText()
			if isEmpty.MatchString(text) {
				continue
			}
			cleaned := cleanMarkup.ReplaceAllString(text, "")
			txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " "))
			txt.WriteString("\n\n")
		}
	}

	c.Body = strings.TrimSpace(txt.String())
	c.XML = content
	// Assign rather than only set, so a reused Content does not keep a
	// stale Empty == true from an earlier call.
	c.Empty = len(c.Body) == 0

	return nil
}