refactored out http fetching code into Fetcher{}/fetch.go

2025-12-16 20:11:01 +01:00 · 2024-01-16 19:27:46 +01:00
parent 78e5de61d2
commit 3fd75fa53d
4 changed files with 104 additions and 77 deletions
--- a/fetch.go
+++ b/fetch.go
@@ -0,0 +1,75 @@
+/*
+Copyright © 2023-2024 Thomas von Dein
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+	"errors"
+	"io"
+	"log/slog"
+	"net/http"
+)
+
+// Fetcher is a convenient wrapper to fetch some web content.
+//
+// It bundles everything the fetch methods need: Config is the program
+// configuration (Getimage consults its IgnoreErrors flag), Client is
+// the HTTP client that executes every request issued by Get, and
+// Useragent is the value Get sends in the User-Agent request header.
+type Fetcher struct {
+	Config    *Config
+	Client    *http.Client
+	Useragent string // FIXME: make configurable
+}
+
+// NewFetcher returns a Fetcher bound to the configuration c. Its HTTP
+// client routes all requests through loggingTransport and its
+// Useragent is initialised from the package-level default.
+func NewFetcher(c *Config) *Fetcher {
+	// loggingTransport is implemented in http.go
+	client := &http.Client{Transport: &loggingTransport{}}
+
+	fetcher := &Fetcher{Config: c, Client: client}
+	fetcher.Useragent = Useragent // default in config.go
+
+	return fetcher
+}
+
+// Get performs an HTTP GET request for uri, sending the Fetcher's
+// Useragent as the User-Agent header, and returns the response body.
+//
+// On success the caller owns the returned body and is responsible
+// for closing it. An error is returned if the request cannot be
+// built, if the client fails to execute it, or if the server answers
+// with any status other than 200. In the latter case the body is
+// closed here, since it is never handed out to the caller.
+func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
+	req, err := http.NewRequest(http.MethodGet, uri, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	req.Header.Set("User-Agent", f.Useragent)
+
+	res, err := f.Client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+
+	if res.StatusCode != http.StatusOK {
+		// Nobody else can close the body on this path, so do it
+		// here to avoid leaking the connection. The close error is
+		// deliberately dropped: the request is reported as failed
+		// either way.
+		_ = res.Body.Close()
+
+		return nil, errors.New("could not get page via HTTP")
+	}
+
+	return res.Body, nil
+}
+
+// Getimage fetches the image at uri via Get and returns its body.
+//
+// If the download fails and Config.IgnoreErrors is set, the failure
+// is only logged and both return values are nil, so callers have to
+// check the body for nil before using it. Without IgnoreErrors the
+// error from Get is returned unchanged.
+func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
+	slog.Debug("fetching ad image", "uri", uri)
+
+	body, err := f.Get(uri)
+	if err == nil {
+		return body, nil
+	}
+
+	if !f.Config.IgnoreErrors {
+		return nil, err
+	}
+
+	slog.Info("Failed to download image, error ignored", "error", err.Error())
+
+	return nil, nil
+}
--- a/http.go
+++ b/http.go
@@ -27,6 +27,9 @@ import (
 	"time"
 )

+// I add an artificial "ID" to each HTTP request and the corresponding
+// response for debugging purposes, so that the pair of them can be
+// associated more easily in debug output
 var letters = []rune("ABCDEF0123456789")

 func getid() string {
@@ -37,8 +40,10 @@ func getid() string {
 	return string(b)
 }

+// retry after HTTP 50x errors or err!=nil
 const RetryCount = 3

+// used to inject debug log and implement retries
 type loggingTransport struct{}

 // escalating timeout, $retry^2 seconds
@@ -75,7 +80,7 @@ func drainBody(resp *http.Response) {
 	}
 }

-// our logging transport with retries
+// the actual logging transport with retries
 func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
 	// just requred for debugging
 	id := getid()
--- a/main.go
+++ b/main.go
@@ -22,7 +22,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"net/http"
 	"os"
 	"runtime/debug"

@@ -102,8 +101,6 @@ func Main(w io.Writer) int {
 		slog.SetDefault(debuglogger)
 	}

-	// defaultlogger := log.Default()
-	// defaultlogger.SetOutput(w)
 	slog.Debug("config", "conf", conf)

 	// prepare output dir
@@ -113,19 +110,19 @@ func Main(w io.Writer) int {
 	}

 	// used for all HTTP requests
-	client := &http.Client{Transport: &loggingTransport{}}
+	fetch := NewFetcher(conf)

 	if len(conf.Adlinks) >= 1 {
 		// directly backup ad listing[s]
 		for _, uri := range conf.Adlinks {
-			err := ScrapeAd(conf, uri, client)
+			err := ScrapeAd(fetch, uri)
 			if err != nil {
 				return Die(err)
 			}
 		}
 	} else if conf.User > 0 {
 		// backup all ads of the given user (via config or cmdline)
-		err := ScrapeUser(conf, client)
+		err := ScrapeUser(fetch)
 		if err != nil {
 			return Die(err)
 		}
--- a/scrape.go
+++ b/scrape.go
@@ -20,9 +20,7 @@ package main
 import (
 	"errors"
 	"fmt"
-	"io"
 	"log/slog"
-	"net/http"
 	"path/filepath"
 	"strings"

@@ -30,42 +28,21 @@ import (
 	"golang.org/x/sync/errgroup"
 )

-// fetch some web page content
-func Get(uri string, client *http.Client) (io.ReadCloser, error) {
-	req, err := http.NewRequest("GET", uri, nil)
-	if err != nil {
-		return nil, err
-	}
-
-	req.Header.Set("User-Agent", Useragent)
-
-	res, err := client.Do(req)
-	if err != nil {
-		return nil, err
-	}
-
-	if res.StatusCode != 200 {
-		return nil, errors.New("could not get page via HTTP")
-	}
-
-	return res.Body, nil
-}
-
 // extract links from  all ad listing pages (that  is: use pagination)
 // and scrape every page
-func ScrapeUser(conf *Config, client *http.Client) error {
+func ScrapeUser(fetch *Fetcher) error {
 	adlinks := []string{}

-	baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
+	baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
 	page := 1
 	uri := baseuri

-	slog.Info("fetching ad pages", "user", conf.User)
+	slog.Info("fetching ad pages", "user", fetch.Config.User)

 	for {
 		var index Index
 		slog.Debug("fetching page", "uri", uri)
-		body, err := Get(uri, client)
+		body, err := fetch.Get(uri)
 		if err != nil {
 			return err
 		}
@@ -92,12 +69,12 @@ func ScrapeUser(conf *Config, client *http.Client) error {
 	}

 	for i, adlink := range adlinks {
-		err := ScrapeAd(conf, Baseuri+adlink, client)
+		err := ScrapeAd(fetch, Baseuri+adlink)
 		if err != nil {
 			return err
 		}

-		if conf.Limit > 0 && i == conf.Limit-1 {
+		if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
 			break
 		}
 	}
@@ -106,7 +83,7 @@ func ScrapeUser(conf *Config, client *http.Client) error {
 }

 // scrape an ad. uri is the full uri of the ad, dir is the basedir
-func ScrapeAd(c *Config, uri string, client *http.Client) error {
+func ScrapeAd(fetch *Fetcher, uri string) error {
 	ad := &Ad{}

 	// extract slug and id from uri
@@ -119,7 +96,7 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {

 	// get the ad
 	slog.Debug("fetching ad page", "uri", uri)
-	body, err := Get(uri, client)
+	body, err := fetch.Get(uri)
 	if err != nil {
 		return err
 	}
@@ -143,26 +120,31 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {
 	slog.Debug("extracted ad listing", "ad", ad)

 	// write listing
-	addir, err := WriteAd(c, ad)
+	addir, err := WriteAd(fetch.Config, ad)
 	if err != nil {
 		return err
 	}

-	c.IncrAds()
+	fetch.Config.IncrAds()

-	return ScrapeImages(c, ad, addir, client)
+	return ScrapeImages(fetch, ad, addir)
 }

-func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
+func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
 	// fetch images
 	img := 1
 	g := new(errgroup.Group)

 	for _, imguri := range ad.Images {
 		imguri := imguri
-		file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
+		file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
 		g.Go(func() error {
-			err := Getimage(c, imguri, file, client)
+			body, err := fetch.Getimage(imguri)
+			if err != nil {
+				return err
+			}
+
+			err = WriteImage(file, body)
 			if err != nil {
 				return err
 			}
@@ -176,39 +158,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
 		return err
 	}

-	c.IncrImgs(len(ad.Images))
+	fetch.Config.IncrImgs(len(ad.Images))

 	return nil
 }
-
-// fetch an image
-func Getimage(c *Config, uri, fileName string, client *http.Client) error {
-	slog.Debug("fetching ad image", "uri", uri)
-	req, err := http.NewRequest("GET", uri, nil)
-	if err != nil {
-		if c.IgnoreErrors {
-			slog.Info("Failed to download image, error ignored", "error", err.Error())
-		}
-		return err
-	}
-
-	req.Header.Set("User-Agent", Useragent)
-
-	response, err := client.Do(req)
-	if err != nil {
-		return err
-	}
-	defer response.Body.Close()
-
-	if response.StatusCode != 200 {
-		return errors.New("could not get image via HTTP")
-	}
-
-	err = WriteImage(fileName, response.Body)
-	if err != nil {
-		return err
-	}
-
-	slog.Info("wrote ad image", "image", fileName)
-	return nil
-}