diff --git a/README.md b/README.md index 25fbc9e..d1dab9f 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,16 @@ Options: -v --version show program version ``` +## Reading mobi files + +`epuppy` doesn't support mobi files, but you can install +[mobitool](https://github.com/bfabiszewski/libmobi/) and use it to +convert mobi files to epub. The Ubuntu package is `libmobi-tools`. To convert, execute: + +```default +mobitool -e somebook.mobi +``` + ## Installation The tool does not have any dependencies. Just download the binary for diff --git a/pkg/epub/content.go b/pkg/epub/content.go index 8336b73..5048499 100644 --- a/pkg/epub/content.go +++ b/pkg/epub/content.go @@ -8,11 +8,12 @@ import ( ) var ( - cleanentitles = regexp.MustCompile(`&[a-z]+;`) - empty = regexp.MustCompile(`(?s)^[\s ]*$`) - newlines = regexp.MustCompile(`[\r\n\s]+`) - cleansvg = regexp.MustCompile(`(|)`) - cleanmarkup = regexp.MustCompile(`<[^<>]+>`) + cleanenTitles = regexp.MustCompile(`&[a-z]+;`) + isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`) + cleanNewlines = regexp.MustCompile(`[\r\n\s]+`) + cleanSVG = regexp.MustCompile(`(|)`) + cleanMarkup = regexp.MustCompile(`<[^<>]+>`) + cleanMobiPageBreaks = regexp.MustCompile(``) ) // Content nav-point content @@ -25,13 +26,28 @@ type Content struct { } // parse XML, look for title and

.*

stuff -func (c *Content) String(content []byte) error { - doc, err := xmlquery.Parse( - strings.NewReader( - cleansvg.ReplaceAllString( - cleanentitles.ReplaceAllString(string(content), " "), ""))) +func (c *Content) Extract(content []byte) error { + rawXML := cleanSVG.ReplaceAllString( + cleanenTitles.ReplaceAllString(string(content), " "), "") + + var doc *xmlquery.Node + var err error + + doc, err = xmlquery.Parse(strings.NewReader(rawXML)) if err != nil { - return err + if strings.Contains(err.Error(), `namespace mbp is missing`) { + fixedmbp := strings.NewReader( + cleanMobiPageBreaks.ReplaceAllString( + rawXML, ``)) + + doc, err = xmlquery.Parse(fixedmbp) + if err != nil { + return err + } + + } else { + return err + } } if c.Title == "" { @@ -47,9 +63,9 @@ func (c *Content) String(content []byte) error { txt := strings.Builder{} var have_p bool for _, item := range xmlquery.Find(doc, "//p") { - if !empty.MatchString(item.InnerText()) { + if !isEmpty.MatchString(item.InnerText()) { have_p = true - txt.WriteString(newlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") + txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") } } @@ -57,9 +73,9 @@ func (c *Content) String(content []byte) error { // try
, which some ebooks use, so get all divs, // remove markup and paragraphify the parts for _, item := range xmlquery.Find(doc, "//div") { - if !empty.MatchString(item.InnerText()) { - cleaned := cleanmarkup.ReplaceAllString(item.InnerText(), "") - txt.WriteString(newlines.ReplaceAllString(cleaned, " ") + "\n\n") + if !isEmpty.MatchString(item.InnerText()) { + cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "") + txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n") } } } diff --git a/pkg/epub/open.go b/pkg/epub/open.go index ff5441f..1462131 100644 --- a/pkg/epub/open.go +++ b/pkg/epub/open.go @@ -112,6 +112,8 @@ func (bk *Book) getSections() error { // we have ncx points from the TOC, try those if len(bk.Ncx.Points) > 0 { + known := map[string]int{} + for _, block := range bk.Ncx.Points { sect := Section{ File: "OEBPS/" + block.Content.Src, @@ -128,7 +130,13 @@ func (bk *Book) getSections() error { } } - sections = append(sections, sect) + if _, haveFile := known[sect.File]; !haveFile { + // sometimes epub's have many sections but they all + // point to the same file. To avoid duplicate content + // we ignore sections (thus files) we have already seen. + sections = append(sections, sect) + known[sect.File] = 1 + } } if len(sections) < manifestcount { @@ -189,7 +197,7 @@ func (bk *Book) readSectionContent() error { ct := Content{Src: section.File, Title: section.Title} if types.MatchString(section.MediaType) { - if err := ct.String(content); err != nil { + if err := ct.Extract(content); err != nil { return err } }