package epub
import (
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
)
var (
	// cleanenTitles matches XML/HTML character entities such as &amp;nbsp;
	// or &amp;amp; so they can be replaced with a space before parsing.
	// NOTE(review): the name looks like a typo for "cleanEntities" — it
	// matches entities in general, not just titles.
	cleanenTitles = regexp.MustCompile(`&[a-z]+;`)

	// isEmpty matches text that is nothing but whitespace; (?s) lets the
	// match span newlines. (The explicit space inside `[\s ]` is redundant
	// since `\s` already includes it, but is harmless.)
	isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)

	// cleanNewlines matches any run of whitespace (including \r\n) so a
	// paragraph's internal line breaks can be collapsed to one space.
	cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)

	// cleanSVG matches inline <svg>…</svg> elements and CDATA sections,
	// which carry no extractable text and are stripped before parsing.
	cleanSVG = regexp.MustCompile(`(<svg.+</svg>|<!\[CDATA\[.+\]\]>)`)

	// cleanMarkup matches any single XML/HTML tag, used to strip leftover
	// markup from <div> contents.
	cleanMarkup = regexp.MustCompile(`<[^<>]+>`)

	// cleanMobiPageBreaks matches Mobipocket page-break tags; their "mbp"
	// namespace is typically undeclared and makes XML parsing fail.
	cleanMobiPageBreaks = regexp.MustCompile(`<mbp:pagebreak/>`)
)
// Content holds the text content extracted from a single nav-point
// (chapter/section) document of an ebook.
type Content struct {
	// Src is the content document's location, taken from the nav-point's
	// src attribute.
	Src string `xml:"src,attr" json:"src"`

	// Empty reports that no text could be extracted (Body is "").
	Empty bool

	// Body is the extracted plain text; paragraphs are separated by
	// blank lines.
	Body string

	// Title is the document title (text of the last <title> element),
	// filled in by Extract if not already set.
	Title string

	// XML is the raw, unmodified input passed to Extract.
	XML []byte
}
// parse XML, look for title and <p>.*</p> stuff
|
2026-01-05 08:32:09 +01:00
|
|
|
|
func (c *Content) Extract(content []byte) error {
|
|
|
|
|
|
rawXML := cleanSVG.ReplaceAllString(
|
|
|
|
|
|
cleanenTitles.ReplaceAllString(string(content), " "), "")
|
|
|
|
|
|
|
|
|
|
|
|
var doc *xmlquery.Node
|
|
|
|
|
|
var err error
|
|
|
|
|
|
|
|
|
|
|
|
doc, err = xmlquery.Parse(strings.NewReader(rawXML))
|
2025-10-15 14:36:43 +02:00
|
|
|
|
if err != nil {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if strings.Contains(err.Error(), `namespace mbp is missing`) {
|
|
|
|
|
|
fixedmbp := strings.NewReader(
|
|
|
|
|
|
cleanMobiPageBreaks.ReplaceAllString(
|
|
|
|
|
|
rawXML, `<span style="page-break-after: always" />`))
|
|
|
|
|
|
|
|
|
|
|
|
doc, err = xmlquery.Parse(fixedmbp)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-10-19 22:30:13 +02:00
|
|
|
|
if c.Title == "" {
|
|
|
|
|
|
// extract the title
|
|
|
|
|
|
for _, item := range xmlquery.Find(doc, "//title") {
|
|
|
|
|
|
c.Title = strings.TrimSpace(item.InnerText())
|
|
|
|
|
|
}
|
2025-10-16 18:57:05 +02:00
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
|
2025-10-16 18:57:05 +02:00
|
|
|
|
// extract all paragraphs, ignore any formatting and re-fill the
|
2025-10-19 22:30:13 +02:00
|
|
|
|
// paragraph, that is, we replace all newlines inside with one
|
2025-10-16 18:57:05 +02:00
|
|
|
|
// space.
|
|
|
|
|
|
txt := strings.Builder{}
|
2025-10-19 22:30:13 +02:00
|
|
|
|
var have_p bool
|
2025-10-16 18:57:05 +02:00
|
|
|
|
for _, item := range xmlquery.Find(doc, "//p") {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if !isEmpty.MatchString(item.InnerText()) {
|
2025-10-19 22:30:13 +02:00
|
|
|
|
have_p = true
|
2026-01-05 08:32:09 +01:00
|
|
|
|
txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
2025-10-16 18:57:05 +02:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-10-15 14:36:43 +02:00
|
|
|
|
|
2025-10-19 22:30:13 +02:00
|
|
|
|
if !have_p {
|
|
|
|
|
|
// try <div></div>, which some ebooks use, so get all divs,
|
|
|
|
|
|
// remove markup and paragraphify the parts
|
|
|
|
|
|
for _, item := range xmlquery.Find(doc, "//div") {
|
2026-01-05 08:32:09 +01:00
|
|
|
|
if !isEmpty.MatchString(item.InnerText()) {
|
|
|
|
|
|
cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "")
|
|
|
|
|
|
txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
2025-10-19 22:30:13 +02:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-10-16 18:57:05 +02:00
|
|
|
|
c.Body = strings.TrimSpace(txt.String())
|
2025-10-15 14:36:43 +02:00
|
|
|
|
c.XML = content
|
|
|
|
|
|
|
|
|
|
|
|
if len(c.Body) == 0 {
|
|
|
|
|
|
c.Empty = true
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}