package epub

import (
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
)

// Patterns used by Content.Extract. They are compiled once at package
// scope so that Extract never compiles a regexp per call.
var (
	// cleanenTitles matches lowercase named character references such as
	// "&amp;" or "&nbsp;". Extract replaces every match with a single space
	// before the document is parsed.
	cleanenTitles = regexp.MustCompile(`&[a-z]+;`)

	// isEmpty matches text that consists of whitespace only (or nothing).
	// NOTE(review): the literal blank inside the class is redundant with \s
	// as written; it may originally have been a different whitespace
	// character (e.g. a non-breaking space) - confirm.
	isEmpty = regexp.MustCompile(`(?s)^[\s ]*$`)

	// cleanNewlines matches any run of whitespace, including line breaks.
	// Extract collapses each run into one space to re-fill a paragraph.
	cleanNewlines = regexp.MustCompile(`[\r\n\s]+`)

	// cleanSVG is applied to the raw input and every match is deleted.
	// NOTE(review): both alternatives of this pattern are empty, so it only
	// ever matches the empty string and deleting its matches changes
	// nothing. The pattern looks truncated - confirm the intended
	// expression.
	cleanSVG = regexp.MustCompile(`(|)`)

	// cleanMarkup matches a single tag-like sequence: "<", then one or more
	// characters other than angle brackets, then ">".
	cleanMarkup = regexp.MustCompile(`<[^<>]+>`)

	// cleanMobiPageBreaks is applied when parsing fails with a missing "mbp"
	// namespace error; its matches are deleted before the parse is retried.
	// NOTE(review): the pattern is empty, so the retry currently parses the
	// same text again. The pattern looks truncated - confirm the intended
	// expression.
	cleanMobiPageBreaks = regexp.MustCompile(``)
)

// Content is the content reference of a nav-point together with the text
// that Extract pulls out of the referenced document.
type Content struct {
	// Src is read from the "src" XML attribute.
	Src string `xml:"src,attr" json:"src"`

	// Empty is set by Extract when no body text could be extracted.
	Empty bool

	// Body is the extracted plain text, one paragraph per blank-line
	// separated chunk, as assembled by Extract.
	Body string

	// Title is the document title. Extract fills it from the <title>
	// element only when it is still empty.
	Title string

	// XML holds the unmodified input handed to Extract.
	XML []byte
}

.*

stuff func (c *Content) Extract(content []byte) error { rawXML := cleanSVG.ReplaceAllString( cleanenTitles.ReplaceAllString(string(content), " "), "") var doc *xmlquery.Node var err error doc, err = xmlquery.Parse(strings.NewReader(rawXML)) if err != nil { if strings.Contains(err.Error(), `namespace mbp is missing`) { fixedmbp := strings.NewReader( cleanMobiPageBreaks.ReplaceAllString( rawXML, ``)) doc, err = xmlquery.Parse(fixedmbp) if err != nil { return err } } else { return err } } if c.Title == "" { // extract the title for _, item := range xmlquery.Find(doc, "//title") { c.Title = strings.TrimSpace(item.InnerText()) } } // extract all paragraphs, ignore any formatting and re-fill the // paragraph, that is, we replace all newlines inside with one // space. txt := strings.Builder{} var have_p bool for _, item := range xmlquery.Find(doc, "//p") { if !isEmpty.MatchString(item.InnerText()) { have_p = true txt.WriteString(cleanNewlines.ReplaceAllString(item.InnerText(), " ") + "\n\n") } } if !have_p { // try
, which some ebooks use, so get all divs, // remove markup and paragraphify the parts for _, item := range xmlquery.Find(doc, "//div") { if !isEmpty.MatchString(item.InnerText()) { cleaned := cleanMarkup.ReplaceAllString(item.InnerText(), "") txt.WriteString(cleanNewlines.ReplaceAllString(cleaned, " ") + "\n\n") } } } c.Body = strings.TrimSpace(txt.String()) c.XML = content if len(c.Body) == 0 { c.Empty = true } return nil }