From f524083210033e4eeace175260fa6c248c380122 Mon Sep 17 00:00:00 2001 From: "T.v.Dein" Date: Mon, 20 Oct 2025 18:54:49 +0000 Subject: [PATCH] Fix more parser failures (#10) * stabilize section parsing, now seems to read all ebooks I tested with * refactored Open() into smaller funcs * bump version --- cmd/config.go | 2 +- cmd/view.go | 5 +- pkg/epub/book.go | 7 ++ pkg/epub/open.go | 171 ++++++++++++++++++++++++++++++++++++----------- 4 files changed, 144 insertions(+), 41 deletions(-) diff --git a/cmd/config.go b/cmd/config.go index 31eda4b..164b6c7 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -32,7 +32,7 @@ import ( ) const ( - Version string = `v0.0.5` + Version string = `v0.0.6` Usage string = `This is epuppy, a terminal ui ebook viewer. Usage: epuppy [options] diff --git a/cmd/view.go b/cmd/view.go index eac636d..d5c74f7 100644 --- a/cmd/view.go +++ b/cmd/view.go @@ -59,8 +59,11 @@ func ViewEpub(conf *Config) (int, error) { } if conf.Debug { + repr.Println("book.Files()") repr.Println(book.Files()) repr.Println(book.Ncx) + repr.Println(book.Sections) + repr.Println(book.Opf.Manifest) } buf := strings.Builder{} @@ -84,7 +87,7 @@ func ViewEpub(conf *Config) (int, error) { return fmt.Println(buf.String()) } - if conf.Debug { + if conf.Debug || conf.XML { return 0, nil } diff --git a/pkg/epub/book.go b/pkg/epub/book.go index 34b90c6..f235562 100644 --- a/pkg/epub/book.go +++ b/pkg/epub/book.go @@ -9,6 +9,11 @@ import ( "path" ) +// a section in the book +type Section struct { + File, Title, MediaType string +} + // Book epub book type Book struct { Ncx Ncx `json:"ncx"` @@ -20,6 +25,8 @@ type Book struct { CoverImage []byte CoverFile string CoverMediaType string + Sections []Section + dumpxml bool } // Open open resource file diff --git a/pkg/epub/open.go b/pkg/epub/open.go index 2d4d925..ff5441f 100644 --- a/pkg/epub/open.go +++ b/pkg/epub/open.go @@ -4,48 +4,87 @@ import ( "archive/zip" "fmt" "log" - "os" + "path/filepath" + "regexp" "strings" - - "github.com/alecthomas/repr" ) -// Open open a epub file +var ( + // to find content + types = regexp.MustCompile(`application/(xml|html|xhtml|htm)`) + + // cleanup regexes + deanchor = regexp.MustCompile(`#.*$`) + cleanext = regexp.MustCompile(`^\.`) +) + +// Open open a epub file and return the filled Book structure func Open(fn string, dumpxml bool) (*Book, error) { + bk, err := openFile(fn, dumpxml) + if err != nil { + return bk, err + } + + defer func() { + if err := bk.fd.Close(); err != nil { + log.Fatal(err) + } + }() + + if err := bk.getManifest(); err != nil { + return bk, err + } + + if err := bk.getSections(); err != nil { + return bk, err + } + + if err := bk.readSectionContent(); err != nil { + return bk, err + } + + return bk, nil +} + +// load the epub zip file +func openFile(fn string, dumpxml bool) (*Book, error) { fd, err := zip.OpenReader(fn) if err != nil { return nil, err } - defer func() { - if err := fd.Close(); err != nil { - log.Fatal(err) - } - }() + bk := &Book{fd: fd, dumpxml: dumpxml} - bk := Book{fd: fd} + return bk, nil +} + +// load the manifest +func (bk *Book) getManifest() error { mt, err := bk.readBytes("mimetype") if err != nil { - return &bk, err + return err } bk.Mimetype = string(mt) + // contains the root path err = bk.readXML("META-INF/container.xml", &bk.Container) if err != nil { - return &bk, err + return err } + // contains the OPF data err = bk.readXML(bk.Container.Rootfile.Path, &bk.Opf) if err != nil { - return &bk, err + return err } + // look for TOC (might be incomplete, see below!) for _, mf := range bk.Opf.Manifest { if mf.ID == bk.Opf.Spine.Toc { err = bk.readXML(bk.filename(mf.Href), &bk.Ncx) if err != nil { - return &bk, err + return err } } @@ -55,58 +94,112 @@ func Open(fn string, dumpxml bool) (*Book, error) { } } - type section struct { - file, title string + return nil +} + +// extract the readable sections of the epub +func (bk *Book) getSections() error { + // to store our final content sections + sections := []Section{} + + // count the content items in the raw manifest + var manifestcount int + for _, item := range bk.Opf.Manifest { + if types.MatchString(item.MediaType) { + manifestcount++ + } } - sections := []section{} - + // we have ncx points from the TOC, try those if len(bk.Ncx.Points) > 0 { for _, block := range bk.Ncx.Points { - sections = append(sections, - section{ - file: "OEBPS/" + block.Content.Src, - title: block.Text, - }) + sect := Section{ + File: "OEBPS/" + block.Content.Src, + Title: block.Text, + } + + srcfile := deanchor.ReplaceAllString(block.Content.Src, "") + + for _, file := range bk.Files() { + if strings.Contains(file, srcfile) { + sect.File = file + sect.MediaType = "application/" + cleanext.ReplaceAllString(filepath.Ext(file), "") + break + } + } + + sections = append(sections, sect) + } + + if len(sections) < manifestcount { + // TOC was incomplete, restart from scratch but use the + // OPF Manifest directly + + sections = []Section{} + + for _, item := range bk.Opf.Manifest { + if types.MatchString(item.MediaType) { + sect := Section{ + File: "OEBPS/" + item.Href, + MediaType: item.MediaType, + } + + srcfile := deanchor.ReplaceAllString(item.Href, "") + + for _, file := range bk.Files() { + if strings.Contains(file, srcfile) { + sect.File = file + break + } + } + + sections = append(sections, sect) + } + } } } else { + // no TOC, just pull in the files directly for _, file := range bk.Files() { sections = append(sections, - section{ - file: file, + Section{ + File: file, + MediaType: "application/" + cleanext.ReplaceAllString(filepath.Ext(file), ""), }) } } - for _, section := range sections { - content, err := bk.readBytes(section.file) + // final sections to keep + bk.Sections = sections + + return nil +} + +func (bk *Book) readSectionContent() error { + // now read in the actual xml contents + for _, section := range bk.Sections { + content, err := bk.readBytes(section.File) if err != nil { - return &bk, err + return err } - if strings.Contains(section.file, bk.CoverFile) { + if strings.Contains(section.File, bk.CoverFile) { bk.CoverImage = content } - ct := Content{Src: section.file, Title: section.title} + ct := Content{Src: section.File, Title: section.Title} - if strings.Contains(string(content), "