diff --git a/README.md b/README.md index c1ac680..2c7af18 100644 --- a/README.md +++ b/README.md @@ -99,14 +99,16 @@ To install after building either copy the binary or execute `sudo make install`. ``` Usage: kleingebaeck [-dvVhmoc] [,...] Options: ---user -u Backup ads from user with uid . ---debug -d Enable debug output. ---verbose -v Enable verbose output. ---outdir -o Set output dir (default: current directory) ---limit -l Limit the ads to download to , default: load all. ---config -c Use config file (default: ~/.kleingebaeck). ---manual -m Show manual. ---help -h Show usage. +-u --user Backup ads from user with uid . +-d --debug Enable debug output. +-v --verbose Enable verbose output. +-o --outdir Set output dir (default: current directory) +-l --limit Limit the ads to download to , default: load all. +-c --config Use config file (default: ~/.kleingebaeck). + --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. +-m --manual Show manual. +-h --help Show usage. +-V --version Show program version. If one or more 's are specified, only backup those, otherwise backup all ads of the given user. diff --git a/config.go b/config.go index b240eee..81d3980 100644 --- a/config.go +++ b/config.go @@ -33,7 +33,7 @@ import ( ) const ( - VERSION string = "0.1.1" + VERSION string = "0.1.2" Baseuri string = "https://www.kleinanzeigen.de" Listuri string = "/s-bestandsliste.html" Defaultdir string = "." @@ -51,15 +51,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. Usage: kleingebaeck [-dvVhmoclu] [,...] Options: ---user -u Backup ads from user with uid . ---debug -d Enable debug output. ---verbose -v Enable verbose output. ---outdir -o Set output dir (default: current directory) ---limit -l Limit the ads to download to , default: load all. ---config -c Use config file (default: ~/.kleingebaeck). ---manual -m Show manual. ---help -h Show usage. ---version -V Show program version. +-u --user Backup ads from user with uid . 
+-d --debug Enable debug output. +-v --verbose Enable verbose output. +-o --outdir Set output dir (default: current directory) +-l --limit Limit the ads to download to , default: load all. +-c --config Use config file (default: ~/.kleingebaeck). + --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. +-m --manual Show manual. +-h --help Show usage. +-V --version Show program version. If one or more ad listing url's are specified, only backup those, otherwise backup all ads of the given user.` @@ -76,6 +77,7 @@ type Config struct { Adnametemplate string `koanf:"adnametemplate"` Loglevel string `koanf:"loglevel"` Limit int `koanf:"limit"` + IgnoreErrors bool `koanf:"ignoreerrors"` Adlinks []string StatsCountAds int StatsCountImages int diff --git a/http.go b/http.go index 8b4d608..c68f0cd 100644 --- a/http.go +++ b/http.go @@ -15,22 +15,18 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -// FIXME: we could also incorporate -// https://github.com/kdkumawat/golang/blob/main/http-retry/http/retry-client.go - package main import ( - "fmt" + "bytes" + "io" "log/slog" + "math" "math/rand" "net/http" - "net/http/httputil" - "os" + "time" ) -type loggingTransport struct{} - var letters = []rune("ABCDEF0123456789") func getid() string { @@ -41,25 +37,87 @@ func getid() string { return string(b) } -func (s *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) { - resp, err := http.DefaultTransport.RoundTrip(req) +const RetryCount = 3 +type loggingTransport struct{} + +// escalating timeout, 2^$retry seconds (exponential backoff) +func backoff(retries int) time.Duration { + return time.Duration(math.Pow(2, float64(retries))) * time.Second +} + +// only retry in case of errors or certain non 200 HTTP codes +func shouldRetry(err error, resp *http.Response) bool { + if err != nil { + return true + } + + if resp.StatusCode == http.StatusBadGateway || + resp.StatusCode == http.StatusServiceUnavailable || +
resp.StatusCode == http.StatusGatewayTimeout { + return true + } + + return false +} + +// Body needs to be drained and closed, otherwise the underlying connection can't be reused +func drainBody(resp *http.Response) { + if resp != nil { + if resp.Body != nil { + _, err := io.Copy(io.Discard, resp.Body) + if err != nil { + // unable to copy data? uff! + panic(err) + } + resp.Body.Close() + } + } +} + +// our logging transport with retries +func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) { // just requred for debugging id := getid() + + // clone the request body, put into request on retry + var bodyBytes []byte + if req.Body != nil { + bodyBytes, _ = io.ReadAll(req.Body) + req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes)) + } + slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host) - slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode, "contentlength", resp.ContentLength) - if len(os.Getenv("DEBUGHTTP")) > 0 { - fmt.Println("DEBUGHTTP Request ===>") - bytes, _ := httputil.DumpRequestOut(req, true) - fmt.Printf("%s\n", bytes) + // first try + resp, err := http.DefaultTransport.RoundTrip(req) + if err == nil { + slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode, + "contentlength", resp.ContentLength) + } - fmt.Println("<=== DEBUGHTTP Response") - for header, value := range resp.Header { - fmt.Printf("%s: %s\n", header, value) + // enter retry check and loop; if the first request was successful, the loop is skipped immediately + retries := 0 + for shouldRetry(err, resp) && retries < RetryCount { + time.Sleep(backoff(retries)) + + // consume any response to reuse the connection.
+ drainBody(resp) + + // clone the request body again + if req.Body != nil { + req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes)) } - fmt.Printf("Status: %s %s\nContent-Length: %d\n\n\n", resp.Proto, resp.Status, resp.ContentLength) + // actual retry + resp, err = http.DefaultTransport.RoundTrip(req) + + if err == nil { + slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode, + "contentlength", resp.ContentLength, "retry", retries) + } + + retries++ } return resp, err diff --git a/kleingebaeck.1 b/kleingebaeck.1 index bdc3961..c763139 100644 --- a/kleingebaeck.1 +++ b/kleingebaeck.1 @@ -133,7 +133,7 @@ .\" ======================================================================== .\" .IX Title "KLEINGEBAECK 1" -.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands" +.TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l @@ -142,18 +142,19 @@ kleingebaeck \- kleinanzeigen.de backup tool .SH "SYNOPSYS" .IX Header "SYNOPSYS" -.Vb 11 +.Vb 12 \& Usage: kleingebaeck [\-dvVhmoc] [,...] \& Options: -\& \-\-user \-u Backup ads from user with uid . -\& \-\-debug \-d Enable debug output. -\& \-\-verbose \-v Enable verbose output. -\& \-\-outdir \-o Set output dir (default: current directory) -\& \-\-limit \-l Limit the ads to download to , default: load all. -\& \-\-config \-c Use config file (default: ~/.kleingebaeck). -\& \-\-manual \-m Show manual. -\& \-\-help \-h Show usage. -\& \-\-version \-V Show program version. +\& \-u \-\-user Backup ads from user with uid . +\& \-d \-\-debug Enable debug output. +\& \-v \-\-verbose Enable verbose output. +\& \-o \-\-outdir Set output dir (default: current directory) +\& \-l \-\-limit Limit the ads to download to , default: load all. +\& \-c \-\-config Use config file (default: ~/.kleingebaeck). +\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. 
+\& \-m \-\-manual Show manual. +\& \-h \-\-help Show usage. +\& \-V \-\-version Show program version. .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" diff --git a/kleingebaeck.go b/kleingebaeck.go index af850dd..0fc8b54 100644 --- a/kleingebaeck.go +++ b/kleingebaeck.go @@ -7,15 +7,16 @@ NAME SYNOPSYS Usage: kleingebaeck [-dvVhmoc] [,...] Options: - --user -u Backup ads from user with uid . - --debug -d Enable debug output. - --verbose -v Enable verbose output. - --outdir -o Set output dir (default: current directory) - --limit -l Limit the ads to download to , default: load all. - --config -c Use config file (default: ~/.kleingebaeck). - --manual -m Show manual. - --help -h Show usage. - --version -V Show program version. + -u --user Backup ads from user with uid . + -d --debug Enable debug output. + -v --verbose Enable verbose output. + -o --outdir Set output dir (default: current directory) + -l --limit Limit the ads to download to , default: load all. + -c --config Use config file (default: ~/.kleingebaeck). + --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. + -m --manual Show manual. + -h --help Show usage. + -V --version Show program version. DESCRIPTION This tool can be used to backup ads on the german ad page diff --git a/kleingebaeck.pod b/kleingebaeck.pod index b07e2fe..2d964ae 100644 --- a/kleingebaeck.pod +++ b/kleingebaeck.pod @@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool Usage: kleingebaeck [-dvVhmoc] [,...] Options: - --user -u Backup ads from user with uid . - --debug -d Enable debug output. - --verbose -v Enable verbose output. - --outdir -o Set output dir (default: current directory) - --limit -l Limit the ads to download to , default: load all. - --config -c Use config file (default: ~/.kleingebaeck). - --manual -m Show manual. - --help -h Show usage. - --version -V Show program version. + -u --user Backup ads from user with uid . + -d --debug Enable debug output. + -v --verbose Enable verbose output. 
+ -o --outdir Set output dir (default: current directory) + -l --limit Limit the ads to download to , default: load all. + -c --config Use config file (default: ~/.kleingebaeck). + --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. + -m --manual Show manual. + -h --help Show usage. + -V --version Show program version. =head1 DESCRIPTION diff --git a/main.go b/main.go index b93f3c3..594694b 100644 --- a/main.go +++ b/main.go @@ -1,5 +1,5 @@ /* -Copyright © 2023 Thomas von Dein +Copyright © 2023-2024 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/main_test.go b/main_test.go index f9dd829..f1c8dd3 100644 --- a/main_test.go +++ b/main_test.go @@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html> const ( EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` + INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1` INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1` INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar` INVALIDURI string = `https://foo.bar/weird/things` @@ -228,6 +229,12 @@ var invalidtests = []Tests{ expect: "error loading config file", exitcode: 1, }, + { + name: "503", + args: base + " " + INVALID503URI, + expect: "could not get page via HTTP", + exitcode: 1, + }, } type AdConfig struct { @@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource { content: GetTemplate(nil, empty, "HTTP 404: /eine-anzeige/ does not exist!"), status: 404, }, + { + // valid ad page but 503 + uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri), + content: GetTemplate(nil, empty, "HTTP 503: service unavailable"), + status: 503, + }, } return ads diff --git a/scrape.go b/scrape.go index 867a6fe..22d86a2 100644 --- a/scrape.go +++ b/scrape.go @@ -44,9 +44,6 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) { return nil, err } - slog.Debug("response", 
"code", res.StatusCode, "status", - res.Status, "size", res.ContentLength) - if res.StatusCode != 200 { return nil, errors.New("could not get page via HTTP") } @@ -165,11 +162,10 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error { imguri := imguri file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) g.Go(func() error { - err := Getimage(imguri, file, client) + err := Getimage(c, imguri, file, client) if err != nil { return err } - slog.Info("wrote ad image", "image", file) return nil }) @@ -186,10 +182,13 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error { } // fetch an image -func Getimage(uri, fileName string, client *http.Client) error { +func Getimage(c *Config, uri, fileName string, client *http.Client) error { slog.Debug("fetching ad image", "uri", uri) req, err := http.NewRequest("GET", uri, nil) if err != nil { + if c.IgnoreErrors { + slog.Info("Failed to download image, error ignored", "error", err.Error()) + } return err } @@ -210,5 +209,6 @@ func Getimage(uri, fileName string, client *http.Client) error { return err } + slog.Info("wrote ad image", "image", fileName) return nil }