- clean mobi page breaks
- reorganize the code a little
- avoid duplicate sections
2026-01-05 08:32:09 +01:00
parent 02c99da8e9
commit 372a7b1b00
3 changed files with 52 additions and 18 deletions

View File

@@ -96,6 +96,16 @@ Options:
-v --version show program version
```
## Reading mobi files
`epuppy` doesn't support mobi files, but you can install
[mobitool](https://github.com/bfabiszewski/libmobi/) and use it to
convert mobi files to epub. The Ubuntu package is `libmobi-tools`. To convert, execute:
```default
mobitool -e somebook.mobi
```
## Installation
The tool does not have any dependencies. Just download the binary for

View File

@@ -8,11 +8,12 @@ import (
)
var (
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
newlines = regexp.MustCompile(`[\r\n\s]+`)
cleansvg = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
cleanenTitles = regexp.MustCompile(`&[a-z]+;`)
isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)
cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)
cleanSVG = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
cleanMarkup = regexp.MustCompile(`<[^<>]+>`)
cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
)
// Content represents a nav point's content
@@ -25,15 +26,30 @@ type Content struct {
}
// parse the XML, extracting the title and the text of <p> elements
func (c *Content) String(content []byte) error {
doc, err := xmlquery.Parse(
strings.NewReader(
cleansvg.ReplaceAllString(
cleanentitles.ReplaceAllString(string(content), " "), "")))
func (c *Content) Extract(content []byte) error {
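// strip HTML entities and embedded SVG/CDATA before parsing the XML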
rawXML := cleanSVG.ReplaceAllString(
cleanenTitles.ReplaceAllString(string(content), " "), "")
var doc *xmlquery.Node
var err error
doc, err = xmlquery.Parse(strings.NewReader(rawXML))
if err != nil {
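// mobi-derived XHTML uses <mbp:pagebreak/> without declaring the mbp
// namespace, which xmlquery rejects; rewrite the page breaks to a
// plain span and parse again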
if strings.Contains(err.Error(), `namespace mbp is missing`) {
fixedmbp := strings.NewReader(
cleanMobiPageBreaks.ReplaceAllString(
rawXML, `<span style="page-break-after: always" />`))
doc, err = xmlquery.Parse(fixedmbp)
if err != nil {
return err
}
} else {
return err
}
}
if c.Title == "" {
// extract the title
for _, item := range xmlquery.Find(doc, "//title") {
@@ -47,9 +63,9 @@ func (c *Content) String(content []byte) error {
txt := strings.Builder{}
var have_p bool
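// collect every non-empty <p> element as its own paragraph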
for _, item := range xmlquery.Find(doc, "//p") {
if !empty.MatchString(item.InnerText()) {
if !isEmpty.MatchString(item.InnerText()) {
have_p = true
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
}
}
@@ -57,9 +73,9 @@ func (c *Content) String(content []byte) error {
// try <div></div>, which some ebooks use, so get all divs,
// remove markup and paragraphify the parts
for _, item := range xmlquery.Find(doc, "//div") {
if !empty.MatchString(item.InnerText()) {
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
if !isEmpty.MatchString(item.InnerText()) {
cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
}
}
}
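As a side note (not part of the diff): the page-break cleanup is just a regexp substitution, so it can be exercised in isolation. A minimal sketch, assuming a made-up input fragment; the pattern and the replacement string are the ones introduced above:

```go
package main

import (
	"fmt"
	"regexp"
)

// same pattern as cleanMobiPageBreaks in the diff above
var cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)

func main() {
	// hypothetical mobi-derived fragment; the mbp namespace is never
	// declared, so an XML parser would reject it as-is
	in := `<p>End of chapter one.</p><mbp:pagebreak/><p>Chapter two.</p>`
	out := cleanMobiPageBreaks.ReplaceAllString(
		in, `<span style="page-break-after: always" />`)
	fmt.Println(out)
}
```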

View File

@@ -112,6 +112,8 @@ func (bk *Book) getSections() error {
// we have ncx points from the TOC, try those
if len(bk.Ncx.Points) > 0 {
known := map[string]int{}
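// known tracks files that already produced a section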
for _, block := range bk.Ncx.Points {
sect := Section{
File: "OEBPS/" + block.Content.Src,
@@ -128,7 +130,13 @@ func (bk *Book) getSections() error {
}
}
if _, haveFile := known[sect.File]; !haveFile {
// Sometimes epubs have many sections that all point to
// the same file. To avoid duplicate content we skip
// sections whose file we have already seen.
sections = append(sections, sect)
known[sect.File] = 1
}
}
if len(sections) < manifestcount {
@@ -189,7 +197,7 @@ func (bk *Book) readSectionContent() error {
ct := Content{Src: section.File, Title: section.Title}
if types.MatchString(section.MediaType) {
if err := ct.String(content); err != nil {
if err := ct.Extract(content); err != nil {
return err
}
}
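For reference, a minimal standalone sketch of the dedup pattern used in `getSections` above; the `Section` type and the sample data here are stand-ins for illustration:

```go
package main

import "fmt"

// Section is a stand-in for the real type; only File matters here.
type Section struct {
	File  string
	Title string
}

func main() {
	points := []Section{
		{File: "OEBPS/ch1.xhtml", Title: "Chapter 1"},
		{File: "OEBPS/ch1.xhtml", Title: "Chapter 1 (again)"},
		{File: "OEBPS/ch2.xhtml", Title: "Chapter 2"},
	}

	known := map[string]int{}
	var sections []Section
	for _, sect := range points {
		// keep only the first section that points at a given file
		if _, haveFile := known[sect.File]; !haveFile {
			sections = append(sections, sect)
			known[sect.File] = 1
		}
	}
	fmt.Println(len(sections)) // 2
}
```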