mirror of
https://codeberg.org/scip/epuppy.git
synced 2025-12-17 12:31:02 +01:00
Fix crash and add support for content in <div> (#9)
* fix #8: better regex to remove html entities
* add opf and ncx debug
* make epub content retrieval more flexible
* fix epub content retrieval: also support html files with <div>
This commit is contained in:
10
cmd/view.go
10
cmd/view.go
@@ -22,6 +22,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/alecthomas/repr"
|
||||||
"github.com/tlinden/epuppy/pkg/epub"
|
"github.com/tlinden/epuppy/pkg/epub"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -57,6 +58,11 @@ func ViewEpub(conf *Config) (int, error) {
|
|||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if conf.Debug {
|
||||||
|
repr.Println(book.Files())
|
||||||
|
repr.Println(book.Ncx)
|
||||||
|
}
|
||||||
|
|
||||||
buf := strings.Builder{}
|
buf := strings.Builder{}
|
||||||
head := strings.Builder{}
|
head := strings.Builder{}
|
||||||
|
|
||||||
@@ -78,6 +84,10 @@ func ViewEpub(conf *Config) (int, error) {
|
|||||||
return fmt.Println(buf.String())
|
return fmt.Println(buf.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if conf.Debug {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
return Pager(&Ebook{
|
return Pager(&Ebook{
|
||||||
Config: conf,
|
Config: conf,
|
||||||
Title: head.String(),
|
Title: head.String(),
|
||||||
|
|||||||
@@ -8,9 +8,10 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
cleanentitles = regexp.MustCompile(`&.+;`)
|
cleanentitles = regexp.MustCompile(`&[a-z]+;`)
|
||||||
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
empty = regexp.MustCompile(`(?s)^[\s ]*$`)
|
||||||
newlines = regexp.MustCompile(`[\r\n]+`)
|
newlines = regexp.MustCompile(`[\r\n]+`)
|
||||||
|
cleanmarkup = regexp.MustCompile(`<[^<>]+>`)
|
||||||
)
|
)
|
||||||
|
|
||||||
// Content nav-point content
|
// Content nav-point content
|
||||||
@@ -22,30 +23,45 @@ type Content struct {
|
|||||||
XML []byte
|
XML []byte
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Content) String(content []byte) error {
|
|
||||||
// parse XML, look for title and <p>.*</p> stuff
|
// parse XML, look for title and <p>.*</p> stuff
|
||||||
|
func (c *Content) String(content []byte) error {
|
||||||
doc, err := xmlquery.Parse(
|
doc, err := xmlquery.Parse(
|
||||||
strings.NewReader(
|
strings.NewReader(
|
||||||
cleanentitles.ReplaceAllString(string(content), " ")))
|
cleanentitles.ReplaceAllString(string(content), " ")))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if c.Title == "" {
|
||||||
// extract the title
|
// extract the title
|
||||||
for _, item := range xmlquery.Find(doc, "//title") {
|
for _, item := range xmlquery.Find(doc, "//title") {
|
||||||
c.Title = strings.TrimSpace(item.InnerText())
|
c.Title = strings.TrimSpace(item.InnerText())
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// extract all paragraphs, ignore any formatting and re-fill the
|
// extract all paragraphs, ignore any formatting and re-fill the
|
||||||
// paragraph, that is, we replaces all newlines inside with one
|
// paragraph, that is, we replace all newlines inside with one
|
||||||
// space.
|
// space.
|
||||||
txt := strings.Builder{}
|
txt := strings.Builder{}
|
||||||
|
var have_p bool
|
||||||
for _, item := range xmlquery.Find(doc, "//p") {
|
for _, item := range xmlquery.Find(doc, "//p") {
|
||||||
if !empty.MatchString(item.InnerText()) {
|
if !empty.MatchString(item.InnerText()) {
|
||||||
|
have_p = true
|
||||||
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !have_p {
|
||||||
|
// try <div></div>, which some ebooks use, so get all divs,
|
||||||
|
// remove markup and paragraphify the parts
|
||||||
|
for _, item := range xmlquery.Find(doc, "//div") {
|
||||||
|
if !empty.MatchString(item.InnerText()) {
|
||||||
|
cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "")
|
||||||
|
txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
c.Body = strings.TrimSpace(txt.String())
|
c.Body = strings.TrimSpace(txt.String())
|
||||||
c.XML = content
|
c.XML = content
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/alecthomas/repr"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Open open a epub file
|
// Open open a epub file
|
||||||
@@ -53,32 +55,56 @@ func Open(fn string, dumpxml bool) (*Book, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type section struct {
|
||||||
|
file, title string
|
||||||
|
}
|
||||||
|
|
||||||
|
sections := []section{}
|
||||||
|
|
||||||
|
if len(bk.Ncx.Points) > 0 {
|
||||||
|
for _, block := range bk.Ncx.Points {
|
||||||
|
sections = append(sections,
|
||||||
|
section{
|
||||||
|
file: "OEBPS/" + block.Content.Src,
|
||||||
|
title: block.Text,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
} else {
|
||||||
for _, file := range bk.Files() {
|
for _, file := range bk.Files() {
|
||||||
content, err := bk.readBytes(file)
|
sections = append(sections,
|
||||||
|
section{
|
||||||
|
file: file,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, section := range sections {
|
||||||
|
content, err := bk.readBytes(section.file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &bk, err
|
return &bk, err
|
||||||
}
|
}
|
||||||
|
|
||||||
ct := Content{Src: file}
|
if strings.Contains(section.file, bk.CoverFile) {
|
||||||
if strings.Contains(string(content), "<?xml") {
|
bk.CoverImage = content
|
||||||
|
}
|
||||||
|
|
||||||
|
ct := Content{Src: section.file, Title: section.title}
|
||||||
|
|
||||||
|
if strings.Contains(string(content), "<?xml") || strings.Contains(string(content), "<!DOCTYPE") {
|
||||||
if err := ct.String(content); err != nil {
|
if err := ct.String(content); err != nil {
|
||||||
return &bk, err
|
return &bk, err
|
||||||
}
|
}
|
||||||
|
}
|
||||||
bk.Content = append(bk.Content, ct)
|
|
||||||
|
|
||||||
if dumpxml {
|
if dumpxml {
|
||||||
fmt.Println(string(ct.XML))
|
fmt.Println(string(ct.XML))
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if strings.Contains(file, bk.CoverFile) {
|
|
||||||
bk.CoverImage = content
|
|
||||||
}
|
|
||||||
|
|
||||||
|
bk.Content = append(bk.Content, ct)
|
||||||
}
|
}
|
||||||
|
|
||||||
if dumpxml {
|
if dumpxml {
|
||||||
|
repr.Println(sections)
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user