Fix more parser failures (#10)

* Stabilize section parsing; it now seems to read all ebooks I tested with.
* Refactor Open() into smaller functions.
* Bump version.
2026-02-04 01:30:58 +01:00 · 2025-10-20 18:54:49 +00:00
parent cb671b7401
commit f524083210
4 changed files with 144 additions and 41 deletions
--- a/cmd/config.go
+++ b/cmd/config.go
@@ -32,7 +32,7 @@ import (
 )

 const (
-	Version string = `v0.0.5`
+	Version string = `v0.0.6`
 	Usage   string = `This is epuppy, a terminal ui ebook viewer.

 Usage: epuppy [options] <epub file>
--- a/cmd/view.go
+++ b/cmd/view.go
@@ -59,8 +59,11 @@ func ViewEpub(conf *Config) (int, error) {
 	}

 	if conf.Debug {
+		repr.Println("book.Files()")
 		repr.Println(book.Files())
 		repr.Println(book.Ncx)
+		repr.Println(book.Sections)
+		repr.Println(book.Opf.Manifest)
 	}

 	buf := strings.Builder{}
@@ -84,7 +87,7 @@ func ViewEpub(conf *Config) (int, error) {
 		return fmt.Println(buf.String())
 	}

-	if conf.Debug {
+	if conf.Debug || conf.XML {
 		return 0, nil
 	}

--- a/pkg/epub/book.go
+++ b/pkg/epub/book.go
@@ -9,6 +9,11 @@ import (
 	"path"
 )

+// a section in the book
+type Section struct {
+	File, Title, MediaType string
+}
+
 // Book epub book
 type Book struct {
 	Ncx            Ncx       `json:"ncx"`
@@ -20,6 +25,8 @@ type Book struct {
 	CoverImage     []byte
 	CoverFile      string
 	CoverMediaType string
+	Sections       []Section
+	dumpxml        bool
 }

 // Open open resource file
--- a/pkg/epub/open.go
+++ b/pkg/epub/open.go
@@ -4,48 +4,87 @@ import (
 	"archive/zip"
 	"fmt"
 	"log"
-	"os"
+	"path/filepath"
+	"regexp"
 	"strings"
-
-	"github.com/alecthomas/repr"
 )

-// Open open a epub file
+var (
+	// to find content
+	types = regexp.MustCompile(`application/(xml|html|xhtml|htm)`)
+
+	// cleanup regexes
+	deanchor = regexp.MustCompile(`#.*$`)
+	cleanext = regexp.MustCompile(`^\.`)
+)
+
+// Open opens an epub file and returns the filled Book structure
 func Open(fn string, dumpxml bool) (*Book, error) {
+	bk, err := openFile(fn, dumpxml)
+	if err != nil {
+		return bk, err
+	}
+
+	defer func() {
+		if err := bk.fd.Close(); err != nil {
+			log.Fatal(err)
+		}
+	}()
+
+	if err := bk.getManifest(); err != nil {
+		return bk, err
+	}
+
+	if err := bk.getSections(); err != nil {
+		return bk, err
+	}
+
+	if err := bk.readSectionContent(); err != nil {
+		return bk, err
+	}
+
+	return bk, nil
+}
+
+// load the epub zip file
+func openFile(fn string, dumpxml bool) (*Book, error) {
 	fd, err := zip.OpenReader(fn)
 	if err != nil {
 		return nil, err
 	}

-	defer func() {
-		if err := fd.Close(); err != nil {
-			log.Fatal(err)
-		}
-	}()
+	bk := &Book{fd: fd, dumpxml: dumpxml}

-	bk := Book{fd: fd}
+	return bk, nil
+}
+
+// load the manifest
+func (bk *Book) getManifest() error {
 	mt, err := bk.readBytes("mimetype")
 	if err != nil {
-		return &bk, err
+		return err
 	}

 	bk.Mimetype = string(mt)

+	// contains the root path
 	err = bk.readXML("META-INF/container.xml", &bk.Container)
 	if err != nil {
-		return &bk, err
+		return err
 	}

+	// contains the OPF data
 	err = bk.readXML(bk.Container.Rootfile.Path, &bk.Opf)
 	if err != nil {
-		return &bk, err
+		return err
 	}

+	// look for TOC (might be incomplete, see below!)
 	for _, mf := range bk.Opf.Manifest {
 		if mf.ID == bk.Opf.Spine.Toc {
 			err = bk.readXML(bk.filename(mf.Href), &bk.Ncx)
 			if err != nil {
-				return &bk, err
+				return err
 			}
 		}

@@ -55,58 +94,112 @@ func Open(fn string, dumpxml bool) (*Book, error) {
 		}
 	}

-	type section struct {
-		file, title string
+	return nil
+}
+
+// extract the readable sections of the epub
+func (bk *Book) getSections() error {
+	// to store our final content sections
+	sections := []Section{}
+
+	// count the content items in the raw manifest
+	var manifestcount int
+	for _, item := range bk.Opf.Manifest {
+		if types.MatchString(item.MediaType) {
+			manifestcount++
+		}
 	}

-	sections := []section{}
-
+	// we have ncx points from the TOC, try those
 	if len(bk.Ncx.Points) > 0 {
 		for _, block := range bk.Ncx.Points {
-			sections = append(sections,
-				section{
-					file:  "OEBPS/" + block.Content.Src,
-					title: block.Text,
-				})
+			sect := Section{
+				File:  "OEBPS/" + block.Content.Src,
+				Title: block.Text,
+			}
+
+			srcfile := deanchor.ReplaceAllString(block.Content.Src, "")
+
+			for _, file := range bk.Files() {
+				if strings.Contains(file, srcfile) {
+					sect.File = file
+					sect.MediaType = "application/" + cleanext.ReplaceAllString(filepath.Ext(file), "")
+					break
+				}
+			}
+
+			sections = append(sections, sect)
+		}
+
+		if len(sections) < manifestcount {
+			// TOC  was incomplete, restart  from scratch but  use the
+			// OPF Manifest directly
+
+			sections = []Section{}
+
+			for _, item := range bk.Opf.Manifest {
+				if types.MatchString(item.MediaType) {
+					sect := Section{
+						File:      "OEBPS/" + item.Href,
+						MediaType: item.MediaType,
+					}
+
+					srcfile := deanchor.ReplaceAllString(item.Href, "")
+
+					for _, file := range bk.Files() {
+						if strings.Contains(file, srcfile) {
+							sect.File = file
+							break
+						}
+					}
+
+					sections = append(sections, sect)
+				}
+			}
 		}
 	} else {
+		// no TOC, just pull in the files directly
 		for _, file := range bk.Files() {
 			sections = append(sections,
-				section{
-					file: file,
+				Section{
+					File:      file,
+					MediaType: "application/" + cleanext.ReplaceAllString(filepath.Ext(file), ""),
 				})
 		}
 	}

-	for _, section := range sections {
-		content, err := bk.readBytes(section.file)
+	// final sections to keep
+	bk.Sections = sections
+
+	return nil
+}
+
+func (bk *Book) readSectionContent() error {
+	// now read in the actual xml contents
+	for _, section := range bk.Sections {
+		content, err := bk.readBytes(section.File)
 		if err != nil {
-			return &bk, err
+			return err
 		}

-		if strings.Contains(section.file, bk.CoverFile) {
+		if strings.Contains(section.File, bk.CoverFile) {
 			bk.CoverImage = content
 		}

-		ct := Content{Src: section.file, Title: section.title}
+		ct := Content{Src: section.File, Title: section.Title}

-		if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
+		if types.MatchString(section.MediaType) {
 			if err := ct.String(content); err != nil {
-				return &bk, err
+				return err
 			}
 		}

-		if dumpxml {
+		if bk.dumpxml {
 			fmt.Println(string(ct.XML))
 		}

 		bk.Content = append(bk.Content, ct)
 	}

-	if dumpxml {
-		repr.Println(sections)
-		os.Exit(0)
-	}
-
-	return &bk, nil
+	return nil
 }