package epub import ( "archive/zip" "fmt" "log" "path/filepath" "regexp" "strings" ) var ( // to find content types = regexp.MustCompile(`application/(xml|html|xhtml|htm)`) // cleanup regexes deanchor = regexp.MustCompile(`#.*$`) cleanext = regexp.MustCompile(`^\.`) ) // Open open a epub file and return the filled Book structure func Open(fn string, dumpxml bool) (*Book, error) { bk, err := openFile(fn, dumpxml) if err != nil { return bk, err } defer func() { if err := bk.fd.Close(); err != nil { log.Fatal(err) } }() if err := bk.getManifest(); err != nil { return bk, err } if err := bk.getSections(); err != nil { return bk, err } if err := bk.readSectionContent(); err != nil { return bk, err } return bk, nil } // load the epub zip file func openFile(fn string, dumpxml bool) (*Book, error) { fd, err := zip.OpenReader(fn) if err != nil { return nil, err } bk := &Book{fd: fd, dumpxml: dumpxml} return bk, nil } // load the manifest func (bk *Book) getManifest() error { mt, err := bk.readBytes("mimetype") if err != nil { return err } bk.Mimetype = string(mt) // contains the root path err = bk.readXML("META-INF/container.xml", &bk.Container) if err != nil { return err } // contains the OPF data err = bk.readXML(bk.Container.Rootfile.Path, &bk.Opf) if err != nil { return err } // look for TOC (might be incomplete, see below!) for _, mf := range bk.Opf.Manifest { if mf.ID == bk.Opf.Spine.Toc { err = bk.readXML(bk.filename(mf.Href), &bk.Ncx) if err != nil { return err } } if mf.ID == "cover-image" { bk.CoverFile = mf.Href bk.CoverMediaType = mf.MediaType } } return nil } // extract the readable sections of the epub func (bk *Book) getSections() error { // to store our final content sections sections := []Section{} // count the content items in the raw manifest var manifestcount int for _, item := range bk.Opf.Manifest { if types.MatchString(item.MediaType) { manifestcount++ } } // we have ncx points from the TOC, try those if len(bk.Ncx.Points) > 0 { for _, block := range bk.Ncx.Points { sect := Section{ File: "OEBPS/" + block.Content.Src, Title: block.Text, } srcfile := deanchor.ReplaceAllString(block.Content.Src, "") for _, file := range bk.Files() { if strings.Contains(file, srcfile) { sect.File = file sect.MediaType = "application/" + cleanext.ReplaceAllString(filepath.Ext(file), "") break } } sections = append(sections, sect) } if len(sections) < manifestcount { // TOC was incomplete, restart from scratch but use the // OPF Manifest directly sections = []Section{} for _, item := range bk.Opf.Manifest { if types.MatchString(item.MediaType) { sect := Section{ File: "OEBPS/" + item.Href, MediaType: item.MediaType, } srcfile := deanchor.ReplaceAllString(item.Href, "") for _, file := range bk.Files() { if strings.Contains(file, srcfile) { sect.File = file break } } sections = append(sections, sect) } } } } else { // no TOC, just pull in the files directly for _, file := range bk.Files() { sections = append(sections, Section{ File: file, MediaType: "application/" + cleanext.ReplaceAllString(filepath.Ext(file), ""), }) } } // final sections to keep bk.Sections = sections return nil } func (bk *Book) readSectionContent() error { // now read in the actual xml contents for _, section := range bk.Sections { content, err := bk.readBytes(section.File) if err != nil { return err } if strings.Contains(section.File, bk.CoverFile) { bk.CoverImage = content } ct := Content{Src: section.File, Title: section.Title} if types.MatchString(section.MediaType) { if err := ct.String(content); err != nil { return err } } if bk.dumpxml { fmt.Println(string(ct.XML)) } bk.Content = append(bk.Content, ct) } return nil }