mirror of
https://codeberg.org/scip/epuppy.git
synced 2025-12-16 20:11:00 +01:00
Fix crash and add support for content in <div> (#9)
* fix #8: better regex to remove HTML entities
* add OPF and NCX debug output
* make EPUB content retrieval more flexible
* fix EPUB content retrieval: also support HTML files with <div>
This commit is contained in:
10
cmd/view.go
10
cmd/view.go
@@ -22,6 +22,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/alecthomas/repr"
|
||||
"github.com/tlinden/epuppy/pkg/epub"
|
||||
)
|
||||
|
||||
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if conf.Debug {
|
||||
repr.Println(book.Files())
|
||||
repr.Println(book.Ncx)
|
||||
}
|
||||
|
||||
buf := strings.Builder{}
|
||||
head := strings.Builder{}
|
||||
|
||||
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
|
||||
return fmt.Println(buf.String())
|
||||
}
|
||||
|
||||
if conf.Debug {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return Pager(&Ebook{
|
||||
Config: conf,
|
||||
Title: head.String(),
|
||||
|
||||
@@ -8,9 +8,10 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
cleanentitles = regexp.MustCompile(`&.+;`)
|
||||
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
|
||||
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
||||
newlines = regexp.MustCompile(`[\r\n]+`)
|
||||
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
|
||||
)
|
||||
|
||||
// Content holds the content of a nav-point.
|
||||
@@ -22,30 +23,45 @@ type Content struct {
|
||||
XML []byte
|
||||
}
|
||||
|
||||
func (c *Content) String(content []byte) error {
|
||||
// parse XML, look for title and <p>.*</p> stuff
|
||||
func (c *Content) String(content []byte) error {
|
||||
doc, err := xmlquery.Parse(
|
||||
strings.NewReader(
|
||||
cleanentitles.ReplaceAllString(string(content), " ")))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
return err
|
||||
}
|
||||
|
||||
if c.Title == "" {
|
||||
// extract the title
|
||||
for _, item := range xmlquery.Find(doc, "//title") {
|
||||
c.Title = strings.TrimSpace(item.InnerText())
|
||||
}
|
||||
}
|
||||
|
||||
// extract all paragraphs, ignore any formatting and re-fill the
|
||||
// paragraph, that is, we replaces all newlines inside with one
|
||||
// paragraph, that is, we replace all newlines inside with one
|
||||
// space.
|
||||
txt := strings.Builder{}
|
||||
var have_p bool
|
||||
for _, item := range xmlquery.Find(doc, "//p") {
|
||||
if !empty.MatchString(item.InnerText()) {
|
||||
have_p = true
|
||||
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
if !have_p {
|
||||
// try <div></div>, which some ebooks use, so get all divs,
|
||||
// remove markup and paragraphify the parts
|
||||
for _, item := range xmlquery.Find(doc, "//div") {
|
||||
if !empty.MatchString(item.InnerText()) {
|
||||
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
|
||||
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.Body = strings.TrimSpace(txt.String())
|
||||
c.XML = content
|
||||
|
||||
|
||||
@@ -6,6 +6,8 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/alecthomas/repr"
|
||||
)
|
||||
|
||||
// Open opens an epub file.
|
||||
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
|
||||
}
|
||||
}
|
||||
|
||||
type section struct {
|
||||
file, title string
|
||||
}
|
||||
|
||||
sections := []section{}
|
||||
|
||||
if len(bk.Ncx.Points) > 0 {
|
||||
for _, block := range bk.Ncx.Points {
|
||||
sections = append(sections,
|
||||
section{
|
||||
file: "OEBPS/" + block.Content.Src,
|
||||
title: block.Text,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
for _, file := range bk.Files() {
|
||||
content, err := bk.readBytes(file)
|
||||
sections = append(sections,
|
||||
section{
|
||||
file: file,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for _, section := range sections {
|
||||
content, err := bk.readBytes(section.file)
|
||||
if err != nil {
|
||||
return &bk, err
|
||||
}
|
||||
|
||||
ct := Content{Src: file}
|
||||
if strings.Contains(string(content), "<?xml") {
|
||||
if strings.Contains(section.file, bk.CoverFile) {
|
||||
bk.CoverImage = content
|
||||
}
|
||||
|
||||
ct := Content{Src: section.file, Title: section.title}
|
||||
|
||||
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
|
||||
if err := ct.String(content); err != nil {
|
||||
return &bk, err
|
||||
}
|
||||
|
||||
bk.Content = append(bk.Content, ct)
|
||||
}
|
||||
|
||||
if dumpxml {
|
||||
fmt.Println(string(ct.XML))
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(file, bk.CoverFile) {
|
||||
bk.CoverImage = content
|
||||
}
|
||||
|
||||
bk.Content = append(bk.Content, ct)
|
||||
}
|
||||
|
||||
if dumpxml {
|
||||
repr.Println(sections)
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user