bump version

fix #3 :
- clean mobi page breaks - reorganize the code a little - avoid duplicate sections
2026-07-10 11:44:31 +02:00 · 2026-01-05 09:15:31 +01:00 · 2026-01-05 08:32:09 +01:00
4 changed files with 53 additions and 19 deletions
--- a/README.md
+++ b/README.md
@@ -96,6 +96,16 @@ Options:
 -v --version             show program version
 ```
 ## Reading mobi files
 `epuppy`   doesn't   support  mobi   files,   but   you  can   install
 [mobitool](https://github.com/bfabiszewski/libmobi/)  and  use  it  to
 convert mobi files to epub. The Ubuntu package is `libmobi-tools`. To convert, execute:
 ```default
 mobitool -e somebook.mobi
 ```
 ## Installation
 The tool does not have any dependencies.  Just download the binary for
--- a/cmd/config.go
+++ b/cmd/config.go
@@ -32,7 +32,7 @@ import (
 )
 const (
-	Version string = `v0.0.7`
+	Version string = `v0.0.8`
 	Usage   string = `This is epuppy, a terminal ui ebook viewer.
 Usage: epuppy [options] <epub file>
--- a/pkg/epub/content.go
+++ b/pkg/epub/content.go
@@ -8,11 +8,12 @@ import (
 )
 var (
-	cleanentitles = regexp.MustCompile(`&[a-z]+;`)
+	cleanenTitles       = regexp.MustCompile(`&[a-z]+;`)
-	empty         = regexp.MustCompile(`(?s)^[\s ]*$`)
+	isEmpty             = regexp.MustCompile(`(?s)^[\s ]*$`)
-	newlines      = regexp.MustCompile(`[\r\n\s]+`)
+	cleanNewlines       = regexp.MustCompile(`[\r\n\s]+`)
-	cleansvg      = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
+	cleanSVG            = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)
-	cleanmarkup   = regexp.MustCompile(`<[^<>]+>`)
+	cleanMarkup         = regexp.MustCompile(`<[^<>]+>`)
 	cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
 )
 // Content nav-point content
@@ -25,13 +26,28 @@ type Content struct {
 }
 // parse XML, look for title and <p>.*</p> stuff
-func (c *Content) String(content []byte) error {
+func (c *Content) Extract(content []byte) error {
-	doc, err := xmlquery.Parse(
+	rawXML := cleanSVG.ReplaceAllString(
-		strings.NewReader(
+		cleanenTitles.ReplaceAllString(string(content), " "), "")
-			cleansvg.ReplaceAllString(
+
-				cleanentitles.ReplaceAllString(string(content), " "), "")))
+	var doc *xmlquery.Node
 	var err error
 	doc, err = xmlquery.Parse(strings.NewReader(rawXML))
 	if err != nil {
-		return err
+		if strings.Contains(err.Error(), `namespace mbp is missing`) {
 			fixedmbp := strings.NewReader(
 				cleanMobiPageBreaks.ReplaceAllString(
 					rawXML, `<span style="page-break-after: always" />`))
 			doc, err = xmlquery.Parse(fixedmbp)
 			if err != nil {
 				return err
 			}
 		} else {
 			return err
 		}
 	}
 	if c.Title == "" {
@@ -47,9 +63,9 @@ func (c *Content) String(content []byte) error {
 	txt := strings.Builder{}
 	var have_p bool
 	for _, item := range xmlquery.Find(doc, "//p") {
-		if !empty.MatchString(item.InnerText()) {
+		if !isEmpty.MatchString(item.InnerText()) {
 			have_p = true
-			txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
+			txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
 		}
 	}
@@ -57,9 +73,9 @@ func (c *Content) String(content []byte) error {
 		// try  <div></div>, which some  ebooks use, so get  all divs,
 		// remove markup and paragraphify the parts
 		for _, item := range xmlquery.Find(doc, "//div") {
-			if !empty.MatchString(item.InnerText()) {
+			if !isEmpty.MatchString(item.InnerText()) {
-				cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
+				cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
-				txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
+				txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
 			}
 		}
 	}
--- a/pkg/epub/open.go
+++ b/pkg/epub/open.go
@@ -112,6 +112,8 @@ func (bk *Book) getSections() error {
 	// we have ncx points from the TOC, try those
 	if len(bk.Ncx.Points) > 0 {
 		known := map[string]int{}
 		for _, block := range bk.Ncx.Points {
 			sect := Section{
 				File:  "OEBPS/" + block.Content.Src,
@@ -128,7 +130,13 @@ func (bk *Book) getSections() error {
 				}
 			}
-			sections = append(sections, sect)
+			if _, haveFile := known[sect.File]; !haveFile {
 				// sometimes  epubs  have  many sections but  they all
 				// point to the same  file. To avoid duplicate content
 				// we ignore sections (thus files) we have already seen.
 				sections = append(sections, sect)
 				known[sect.File] = 1
 			}
 		}
 		if len(sections) < manifestcount {
@@ -189,7 +197,7 @@ func (bk *Book) readSectionContent() error {
 		ct := Content{Src: section.File, Title: section.Title}
 		if types.MatchString(section.MediaType) {
-			if err := ct.String(content); err != nil {
+			if err := ct.Extract(content); err != nil {
 				return err
 			}
 		}
Author	SHA1	Message	Date
Thomas von Dein	ecbfba8809	bump version	2026-01-05 09:15:31 +01:00
Thomas von Dein	372a7b1b00	fix #3 : - clean mobi page breaks - reorganize the code a little - avoid duplicate sections	2026-01-05 08:32:09 +01:00