Fix more parser failures (#10)

* stabilize section parsing, now seems to read all ebooks I tested with
* refactored Open() into smaller funcs
* bump version
This commit is contained in:
T.v.Dein
2025-10-20 18:54:49 +00:00
committed by GitHub
parent cb671b7401
commit f524083210
4 changed files with 144 additions and 41 deletions

View File

@@ -32,7 +32,7 @@ import (
)
const (
Version string = `v0.0.5`
Version string = `v0.0.6`
Usage string = `This is epuppy, a terminal ui ebook viewer.
Usage: epuppy [options] <epub file>

View File

@@ -59,8 +59,11 @@ func ViewEpub(conf *Config) (int, error) {
}
if conf.Debug {
repr.Println("book.Files()")
repr.Println(book.Files())
repr.Println(book.Ncx)
repr.Println(book.Sections)
repr.Println(book.Opf.Manifest)
}
buf := strings.Builder{}
@@ -84,7 +87,7 @@ func ViewEpub(conf *Config) (int, error) {
return fmt.Println(buf.String())
}
if conf.Debug {
if conf.Debug || conf.XML {
return 0, nil
}

View File

@@ -9,6 +9,11 @@ import (
"path"
)
// Section is a section in the book.
type Section struct {
	// File is the name of the file holding the section.
	File string
	// Title is the section's title; it may be empty.
	Title string
	// MediaType is the media type associated with File.
	MediaType string
}
// Book epub book
type Book struct {
Ncx Ncx `json:"ncx"`
@@ -20,6 +25,8 @@ type Book struct {
CoverImage []byte
CoverFile string
CoverMediaType string
Sections []Section
dumpxml bool
}
// Open open resource file

View File

@@ -4,48 +4,87 @@ import (
"archive/zip"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/alecthomas/repr"
)
// Regular expressions shared by the section parsing code; compiled once
// at package initialisation.
var (
	// cleanext strips the leading dot of a file extension (".xhtml" -> "xhtml").
	cleanext = regexp.MustCompile(`^\.`)

	// deanchor strips everything from the first '#' to the end of the line,
	// i.e. the fragment part of a reference.
	deanchor = regexp.MustCompile(`#.*$`)

	// types matches the application/* media types that are treated as
	// readable content.
	types = regexp.MustCompile(`application/(xml|html|xhtml|htm)`)
)
// Open opens the epub file fn and returns the filled Book structure.
//
// The archive is opened via openFile, then the manifest, the section
// list and the section contents are loaded in that order. The first
// failing step aborts loading; the Book is returned together with the
// error. The underlying archive reader is closed before Open returns.
//
// NOTE(review): a failure to close the archive terminates the process
// through log.Fatal instead of being reported to the caller.
func Open(fn string, dumpxml bool) (*Book, error) {
	book, err := openFile(fn, dumpxml)
	if err != nil {
		return book, err
	}

	defer func() {
		if cerr := book.fd.Close(); cerr != nil {
			log.Fatal(cerr)
		}
	}()

	// Loading stages, executed strictly in this order.
	stages := []func() error{
		book.getManifest,
		book.getSections,
		book.readSectionContent,
	}

	for _, stage := range stages {
		if err := stage(); err != nil {
			return book, err
		}
	}

	return book, nil
}
// openFile opens the epub zip archive fn and returns a Book that wraps
// the open reader and carries the dumpxml flag.
//
// The reader is deliberately left open: it is handed to the caller
// inside the Book, and Open closes it once loading has finished.
// Closing it here would return a Book whose archive can no longer be
// read.
//
// On failure the returned Book is nil and the error from
// zip.OpenReader is passed through unchanged.
func openFile(fn string, dumpxml bool) (*Book, error) {
	fd, err := zip.OpenReader(fn)
	if err != nil {
		return nil, err
	}

	return &Book{fd: fd, dumpxml: dumpxml}, nil
}
// load the manifest
func (bk *Book) getManifest() error {
mt, err := bk.readBytes("mimetype")
if err != nil {
return &bk, err
return err
}
bk.Mimetype = string(mt)
// contains the root path
err = bk.readXML("META-INF/container.xml", &bk.Container)
if err != nil {
return &bk, err
return err
}
// contains the OPF data
err = bk.readXML(bk.Container.Rootfile.Path, &bk.Opf)
if err != nil {
return &bk, err
return err
}
// look for TOC (might be incomplete, see below!)
for _, mf := range bk.Opf.Manifest {
if mf.ID == bk.Opf.Spine.Toc {
err = bk.readXML(bk.filename(mf.Href), &bk.Ncx)
if err != nil {
return &bk, err
return err
}
}
@@ -55,58 +94,112 @@ func Open(fn string, dumpxml bool) (*Book, error) {
}
}
type section struct {
file, title string
return nil
}
// extract the readable sections of the epub
func (bk *Book) getSections() error {
// to store our final content sections
sections := []Section{}
// count the content items in the raw manifest
var manifestcount int
for _, item := range bk.Opf.Manifest {
if types.MatchString(item.MediaType) {
manifestcount++
}
}
sections := []section{}
// we have ncx points from the TOC, try those
if len(bk.Ncx.Points) > 0 {
for _, block := range bk.Ncx.Points {
sections = append(sections,
section{
file: "OEBPS/" + block.Content.Src,
title: block.Text,
})
sect := Section{
File: "OEBPS/" + block.Content.Src,
Title: block.Text,
}
srcfile := deanchor.ReplaceAllString(block.Content.Src, "")
for _, file := range bk.Files() {
if strings.Contains(file, srcfile) {
sect.File = file
sect.MediaType = "application/" + cleanext.ReplaceAllString(filepath.Ext(file), "")
break
}
}
sections = append(sections, sect)
}
if len(sections) < manifestcount {
// TOC was incomplete, restart from scratch but use the
// OPF Manifest directly
sections = []Section{}
for _, item := range bk.Opf.Manifest {
if types.MatchString(item.MediaType) {
sect := Section{
File: "OEBPS/" + item.Href,
MediaType: item.MediaType,
}
srcfile := deanchor.ReplaceAllString(item.Href, "")
for _, file := range bk.Files() {
if strings.Contains(file, srcfile) {
sect.File = file
break
}
}
sections = append(sections, sect)
}
}
}
} else {
// no TOC, just pull in the files directly
for _, file := range bk.Files() {
sections = append(sections,
section{
file: file,
Section{
File: file,
MediaType: "application/" + cleanext.ReplaceAllString(filepath.Ext(file), ""),
})
}
}
for _, section := range sections {
content, err := bk.readBytes(section.file)
// final sections to keep
bk.Sections = sections
return nil
}
func (bk *Book) readSectionContent() error {
// now read in the actual xml contents
for _, section := range bk.Sections {
content, err := bk.readBytes(section.File)
if err != nil {
return &bk, err
return err
}
if strings.Contains(section.file, bk.CoverFile) {
if strings.Contains(section.File, bk.CoverFile) {
bk.CoverImage = content
}
ct := Content{Src: section.file, Title: section.title}
ct := Content{Src: section.File, Title: section.Title}
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
if types.MatchString(section.MediaType) {
if err := ct.String(content); err != nil {
return &bk, err
return err
}
}
if dumpxml {
if bk.dumpxml {
fmt.Println(string(ct.XML))
}
bk.Content = append(bk.Content, ct)
}
if dumpxml {
repr.Println(sections)
os.Exit(0)
}
return &bk, nil
return nil
}