mirror of
https://codeberg.org/scip/epuppy.git
synced 2026-02-04 09:40:57 +01:00
fix #3:
- clean mobi page breaks - reorganize the code a little - avoid duplicate sections
This commit is contained in:
@@ -8,11 +8,12 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
|
||||
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
||||
newlines = regexp.MustCompile(`[\r\n\s]+`)
|
||||
cleansvg = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
|
||||
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
|
||||
cleanenTitles = regexp.MustCompile(`&[a-z]+;`)
|
||||
isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
||||
cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)
|
||||
cleanSVG = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
|
||||
cleanMarkup = regexp.MustCompile(`<[^<>]+>`)
|
||||
cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
|
||||
)
|
||||
|
||||
// Content nav-point content
|
||||
@@ -25,13 +26,28 @@ type Content struct {
|
||||
}
|
||||
|
||||
// parse XML, look for title and <p>.*</p> stuff
|
||||
func (c *Content) String(content []byte) error {
|
||||
doc, err := xmlquery.Parse(
|
||||
strings.NewReader(
|
||||
cleansvg.ReplaceAllString(
|
||||
cleanentitles.ReplaceAllString(string(content), " "), "")))
|
||||
func (c *Content) Extract(content []byte) error {
|
||||
rawXML := cleanSVG.ReplaceAllString(
|
||||
cleanenTitles.ReplaceAllString(string(content), " "), "")
|
||||
|
||||
var doc *xmlquery.Node
|
||||
var err error
|
||||
|
||||
doc, err = xmlquery.Parse(strings.NewReader(rawXML))
|
||||
if err != nil {
|
||||
return err
|
||||
if strings.Contains(err.Error(), `namespace mbp is missing`) {
|
||||
fixedmbp := strings.NewReader(
|
||||
cleanMobiPageBreaks.ReplaceAllString(
|
||||
rawXML, `<span style="page-break-after: always" />`))
|
||||
|
||||
doc, err = xmlquery.Parse(fixedmbp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if c.Title == "" {
|
||||
@@ -47,9 +63,9 @@ func (c *Content) String(content []byte) error {
|
||||
txt := strings.Builder{}
|
||||
var have_p bool
|
||||
for _, item := range xmlquery.Find(doc, "//p") {
|
||||
if !empty.MatchString(item.InnerText()) {
|
||||
if !isEmpty.MatchString(item.InnerText()) {
|
||||
have_p = true
|
||||
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
||||
txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,9 +73,9 @@ func (c *Content) String(content []byte) error {
|
||||
// try <div></div>, which some ebooks use, so get all divs,
|
||||
// remove markup and paragraphify the parts
|
||||
for _, item := range xmlquery.Find(doc, "//div") {
|
||||
if !empty.MatchString(item.InnerText()) {
|
||||
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
|
||||
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
||||
if !isEmpty.MatchString(item.InnerText()) {
|
||||
cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
|
||||
txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user