package epub
import (
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
)
var (
	// cleanenTitles matches XML/HTML character entities such as &amp;nbsp;
	// or &amp;amp; so they can be replaced with a space before parsing.
	// NOTE(review): the name looks like a typo for "cleanEntities" — it
	// matches entities in general, not just titles.
	cleanenTitles = regexp.MustCompile(`&[a-z]+;`)

	// isEmpty matches text that is nothing but whitespace; (?s) lets the
	// match span newlines. (The explicit space inside `[\s ]` is redundant
	// since `\s` already includes it, but is harmless.)
	isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)

	// cleanNewlines matches any run of whitespace (including \r\n) so a
	// paragraph's internal line breaks can be collapsed to one space.
	cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)

	// cleanSVG matches inline <svg>…</svg> elements and CDATA sections,
	// which carry no extractable text and are stripped before parsing.
	cleanSVG = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)

	// cleanMarkup matches any single XML/HTML tag, used to strip leftover
	// markup from <div> contents.
	cleanMarkup = regexp.MustCompile(`<[^<>]+>`)

	// cleanMobiPageBreaks matches Mobipocket page-break tags; their "mbp"
	// namespace is typically undeclared and makes XML parsing fail.
	cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
)
// Content holds the text content extracted from a single nav-point
// (chapter/section) document of an ebook.
type Content struct {
	// Src is the content document's location, taken from the nav-point's
	// src attribute.
	Src string `xml:"src,attr" json:"src"`

	// Empty reports that no text could be extracted (Body is "").
	Empty bool

	// Body is the extracted plain text; paragraphs are separated by
	// blank lines.
	Body string

	// Title is the document title (text of the last <title> element),
	// filled in by Extract if not already set.
	Title string

	// XML is the raw, unmodified input passed to Extract.
	XML []byte
}
// parse XML, look for title and <p>.*</p> stuff
|
2026-01-05 08:32:09 +01:00
|
|
|
|
func (c *Content) Extract(content []byte) error {
|
|
|
|
|
|
rawXML := cleanSVG.ReplaceAllString(
|
|
|
|
|
|
cleanenTitles.ReplaceAllString(string(content), " "), "")
|
|
|
|
|
|
|
|
|
|
|
|
var doc *xmlquery.Node
|
|
|
|
|
|
var err error
|
|
|
|
|
|
|
|
|
|
|
|
doc, err = xmlquery.Parse(strings.NewReader(rawXML))
|
2025-10-15 14:36:43 +02:00
|
|
|
|
if err != nil {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if strings.Contains(err.Error(), `namespace mbp is missing`) {
|
|
|
|
|
|
fixedmbp := strings.NewReader(
|
|
|
|
|
|
cleanMobiPageBreaks.ReplaceAllString(
|
|
|
|
|
|
rawXML, `<span style="page-break-after: always" />`))
|
|
|
|
|
|
|
|
|
|
|
|
doc, err = xmlquery.Parse(fixedmbp)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-10-19 22:30:13 +02:00
|
|
|
|
if c.Title == "" {
|
|
|
|
|
|
// extract the title
|
|
|
|
|
|
for _, item := range xmlquery.Find(doc, "//title") {
|
|
|
|
|
|
c.Title = strings.TrimSpace(item.InnerText())
|
|
|
|
|
|
}
|
2025-10-16 18:57:05 +02:00
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
|
2025-10-16 18:57:05 +02:00
|
|
|
|
// extract all paragraphs, ignore any formatting and re-fill the
|
2025-10-19 22:30:13 +02:00
|
|
|
|
// paragraph, that is, we replace all newlines inside with one
|
2025-10-16 18:57:05 +02:00
|
|
|
|
// space.
|
|
|
|
|
|
txt := strings.Builder{}
|
2025-10-19 22:30:13 +02:00
|
|
|
|
var have_p bool
|
2025-10-16 18:57:05 +02:00
|
|
|
|
for _, item := range xmlquery.Find(doc, "//p") {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if !isEmpty.MatchString(item.InnerText()) {
|
2025-10-19 22:30:13 +02:00
|
|
|
|
have_p = true
|
2026-01-05 08:32:09 +01:00
|
|
|
|
txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
2025-10-16 18:57:05 +02:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
|
2025-10-19 22:30:13 +02:00
|
|
|
|
if !have_p {
|
|
|
|
|
|
// try <div></div>, which some ebooks use, so get all divs,
|
|
|
|
|
|
// remove markup and paragraphify the parts
|
|
|
|
|
|
for _, item := range xmlquery.Find(doc, "//div") {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if !isEmpty.MatchString(item.InnerText()) {
|
|
|
|
|
|
cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
|
|
|
|
|
|
txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
2025-10-19 22:30:13 +02:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-10-16 18:57:05 +02:00
|
|
|
|
c.Body = strings.TrimSpace(txt.String())
|
2025-10-15 14:36:43 +02:00
|
|
|
|
c.XML = content
|
|
|
|
|
|
|
|
|
|
|
|
if len(c.Body) == 0 {
|
|
|
|
|
|
c.Empty = true
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}