From 3fd75fa53d7004a356622931200cc5bdda4c87b2 Mon Sep 17 00:00:00 2001 From: "T.v.Dein" Date: Tue, 16 Jan 2024 19:27:46 +0100 Subject: [PATCH] refactored out http fetching code into Fetcher{}/fetch.go --- fetch.go | 75 ++++++++++++++++++++++++++++++++++++++++++++++ http.go | 7 ++++- main.go | 9 ++---- scrape.go | 90 +++++++++++++------------------------------------------ 4 files changed, 104 insertions(+), 77 deletions(-) create mode 100644 fetch.go diff --git a/fetch.go b/fetch.go new file mode 100644 index 0000000..9544fce --- /dev/null +++ b/fetch.go @@ -0,0 +1,75 @@ +/* +Copyright © 2023-2024 Thomas von Dein + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package main + +import ( + "errors" + "io" + "log/slog" + "net/http" +) + +// Fetcher is a convenient wrapper to fetch some web content +type Fetcher struct { + Config *Config + Client *http.Client + Useragent string // FIXME: make configurable +} + +func NewFetcher(c *Config) *Fetcher { + return &Fetcher{ + Client: &http.Client{Transport: &loggingTransport{}}, // implemented in http.go + Useragent: Useragent, // default in config.go + Config: c, + } +} + +func (f *Fetcher) Get(uri string) (io.ReadCloser, error) { + req, err := http.NewRequest("GET", uri, nil) + if err != nil { + return nil, err + } + + req.Header.Set("User-Agent", f.Useragent) + + res, err := f.Client.Do(req) + if err != nil { + return nil, err + } + + if res.StatusCode != 200 { + return nil, errors.New("could not get page via HTTP") + } + + return res.Body, nil +} + +// Getimage fetches an image; if Config.IgnoreErrors is set, a failed fetch is logged and (nil, nil) is returned +func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) { + slog.Debug("fetching ad image", "uri", uri) + body, err := f.Get(uri) + if err != nil { + if f.Config.IgnoreErrors { + slog.Info("Failed to download image, error ignored", "error", err.Error()) + return nil, nil + } + return nil, err + } + + return body, nil +} diff --git a/http.go b/http.go index c68f0cd..0441101 100644 --- a/http.go +++ b/http.go @@ -27,6 +27,9 @@ import ( "time" ) +// I add an artificial "ID" to each HTTP request and the corresponding +// response for debugging purposes so that the pair of them can be +// more easily associated in debug output var letters = []rune("ABCDEF0123456789") func getid() string { @@ -37,8 +40,10 @@ func getid() string { return string(b) } +// retry after HTTP 50x errors or err!=nil const RetryCount = 3 +// used to inject debug log and implement retries type loggingTransport struct{} // escalating timeout, $retry^2 seconds @@ -75,7 +80,7 @@ func drainBody(resp *http.Response) { } } -// our logging transport with retries +// the actual logging transport with retries func (t *loggingTransport) RoundTrip(req 
*http.Request) (*http.Response, error) { // just requred for debugging id := getid() diff --git a/main.go b/main.go index 594694b..f88ad5a 100644 --- a/main.go +++ b/main.go @@ -22,7 +22,6 @@ import ( "fmt" "io" "log/slog" - "net/http" "os" "runtime/debug" @@ -102,8 +101,6 @@ func Main(w io.Writer) int { slog.SetDefault(debuglogger) } - // defaultlogger := log.Default() - // defaultlogger.SetOutput(w) slog.Debug("config", "conf", conf) // prepare output dir @@ -113,19 +110,19 @@ func Main(w io.Writer) int { } // used for all HTTP requests - client := &http.Client{Transport: &loggingTransport{}} + fetch := NewFetcher(conf) if len(conf.Adlinks) >= 1 { // directly backup ad listing[s] for _, uri := range conf.Adlinks { - err := ScrapeAd(conf, uri, client) + err := ScrapeAd(fetch, uri) if err != nil { return Die(err) } } } else if conf.User > 0 { // backup all ads of the given user (via config or cmdline) - err := ScrapeUser(conf, client) + err := ScrapeUser(fetch) if err != nil { return Die(err) } diff --git a/scrape.go b/scrape.go index 22d86a2..3766a94 100644 --- a/scrape.go +++ b/scrape.go @@ -20,9 +20,7 @@ package main import ( "errors" "fmt" - "io" "log/slog" - "net/http" "path/filepath" "strings" @@ -30,42 +28,21 @@ import ( "golang.org/x/sync/errgroup" ) -// fetch some web page content -func Get(uri string, client *http.Client) (io.ReadCloser, error) { - req, err := http.NewRequest("GET", uri, nil) - if err != nil { - return nil, err - } - - req.Header.Set("User-Agent", Useragent) - - res, err := client.Do(req) - if err != nil { - return nil, err - } - - if res.StatusCode != 200 { - return nil, errors.New("could not get page via HTTP") - } - - return res.Body, nil -} - // extract links from all ad listing pages (that is: use pagination) // and scrape every page -func ScrapeUser(conf *Config, client *http.Client) error { +func ScrapeUser(fetch *Fetcher) error { adlinks := []string{} - baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User) + 
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User) page := 1 uri := baseuri - slog.Info("fetching ad pages", "user", conf.User) + slog.Info("fetching ad pages", "user", fetch.Config.User) for { var index Index slog.Debug("fetching page", "uri", uri) - body, err := Get(uri, client) + body, err := fetch.Get(uri) if err != nil { return err } @@ -92,12 +69,12 @@ func ScrapeUser(conf *Config, client *http.Client) error { } for i, adlink := range adlinks { - err := ScrapeAd(conf, Baseuri+adlink, client) + err := ScrapeAd(fetch, Baseuri+adlink) if err != nil { return err } - if conf.Limit > 0 && i == conf.Limit-1 { + if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 { break } } @@ -106,7 +83,7 @@ func ScrapeUser(conf *Config, client *http.Client) error { } // scrape an ad. uri is the full uri of the ad, dir is the basedir -func ScrapeAd(c *Config, uri string, client *http.Client) error { +func ScrapeAd(fetch *Fetcher, uri string) error { ad := &Ad{} // extract slug and id from uri @@ -119,7 +96,7 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error { // get the ad slog.Debug("fetching ad page", "uri", uri) - body, err := Get(uri, client) + body, err := fetch.Get(uri) if err != nil { return err } @@ -143,26 +120,31 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error { slog.Debug("extracted ad listing", "ad", ad) // write listing - addir, err := WriteAd(c, ad) + addir, err := WriteAd(fetch.Config, ad) if err != nil { return err } - c.IncrAds() + fetch.Config.IncrAds() - return ScrapeImages(c, ad, addir, client) + return ScrapeImages(fetch, ad, addir) } -func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error { +func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error { // fetch images img := 1 g := new(errgroup.Group) for _, imguri := range ad.Images { imguri := imguri - file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) + file := filepath.Join(fetch.Config.Outdir, addir, 
fmt.Sprintf("%d.jpg", img)) g.Go(func() error { - err := Getimage(c, imguri, file, client) + body, err := fetch.Getimage(imguri) + if err != nil { + return err + } + + err = WriteImage(file, body) if err != nil { return err } @@ -176,39 +158,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error { return err } - c.IncrImgs(len(ad.Images)) + fetch.Config.IncrImgs(len(ad.Images)) return nil } - -// fetch an image -func Getimage(c *Config, uri, fileName string, client *http.Client) error { - slog.Debug("fetching ad image", "uri", uri) - req, err := http.NewRequest("GET", uri, nil) - if err != nil { - if c.IgnoreErrors { - slog.Info("Failed to download image, error ignored", "error", err.Error()) - } - return err - } - - req.Header.Set("User-Agent", Useragent) - - response, err := client.Do(req) - if err != nil { - return err - } - defer response.Body.Close() - - if response.StatusCode != 200 { - return errors.New("could not get image via HTTP") - } - - err = WriteImage(fileName, response.Body) - if err != nil { - return err - } - - slog.Info("wrote ad image", "image", fileName) - return nil -}