Add HTTP retries and the possibility to ignore image download errors (#33)

added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
This commit is contained in:
T.v.Dein
2024-01-16 13:18:15 +01:00
committed by Thomas von Dein
parent cca3211023
commit 5fa46ff106
9 changed files with 151 additions and 73 deletions

View File

@@ -99,14 +99,16 @@ To install after building either copy the binary or execute `sudo make install`.
``` ```
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more <ad-listing-url>'s are specified, only backup those, If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user. otherwise backup all ads of the given user.

View File

@@ -33,7 +33,7 @@ import (
) )
const ( const (
VERSION string = "0.1.1" VERSION string = "0.1.2"
Baseuri string = "https://www.kleinanzeigen.de" Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html" Listuri string = "/s-bestandsliste.html"
Defaultdir string = "." Defaultdir string = "."
@@ -51,15 +51,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those, If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.` otherwise backup all ads of the given user.`
@@ -76,6 +77,7 @@ type Config struct {
Adnametemplate string `koanf:"adnametemplate"` Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"` Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"` Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
Adlinks []string Adlinks []string
StatsCountAds int StatsCountAds int
StatsCountImages int StatsCountImages int

96
http.go
View File

@@ -15,22 +15,18 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// FIXME: we could also incorporate
// https://github.com/kdkumawat/golang/blob/main/http-retry/http/retry-client.go
package main package main
import ( import (
"fmt" "bytes"
"io"
"log/slog" "log/slog"
"math"
"math/rand" "math/rand"
"net/http" "net/http"
"net/http/httputil" "time"
"os"
) )
type loggingTransport struct{}
var letters = []rune("ABCDEF0123456789") var letters = []rune("ABCDEF0123456789")
func getid() string { func getid() string {
@@ -41,25 +37,87 @@ func getid() string {
return string(b) return string(b)
} }
func (s *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) { const RetryCount = 3
resp, err := http.DefaultTransport.RoundTrip(req)
type loggingTransport struct{}
// backoff returns the escalating wait time to sleep before a retry:
// 2^retries seconds, i.e. 1s, 2s, 4s, ... for retries = 0, 1, 2, ...
//
// Negative values of retries yield a zero duration. The exponent is capped so
// that the result always fits into a time.Duration instead of overflowing.
func backoff(retries int) time.Duration {
	// 2^32 seconds is the largest power of two (with some headroom) that
	// still fits into the int64 nanosecond counter behind time.Duration.
	const maxExponent = 32

	if retries > maxExponent {
		retries = maxExponent
	}

	// For negative exponents math.Pow yields a value below 1, which the
	// conversion truncates to 0.
	return time.Duration(math.Pow(2, float64(retries))) * time.Second
}
// shouldRetry reports whether a request deserves another attempt: either the
// round trip itself returned an error, or the server answered with one of the
// HTTP status codes 502, 503 or 504.
func shouldRetry(err error, resp *http.Response) bool {
	// a transport level failure is always retried
	if err != nil {
		return true
	}

	switch resp.StatusCode {
	case http.StatusBadGateway,
		http.StatusServiceUnavailable,
		http.StatusGatewayTimeout:
		return true
	default:
		return false
	}
}
// drainBody reads the remaining body of resp and closes it, so that the
// underlying connection can be reused for the next attempt. A nil response or
// a nil body is silently accepted.
//
// Draining is best effort: a body that cannot be read (for instance because
// the failing server dropped the connection) is no reason to crash the whole
// program, so the error is only logged at debug level. The body is closed in
// any case.
func drainBody(resp *http.Response) {
	if resp == nil || resp.Body == nil {
		return
	}

	// close unconditionally, even if draining fails
	defer resp.Body.Close()

	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		slog.Debug("failed to drain response body", "error", err)
	}
}
// our logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just required for debugging // just required for debugging
id := getid() id := getid()
// clone the request body, put into request on retry
var bodyBytes []byte
if req.Body != nil {
bodyBytes, _ = io.ReadAll(req.Body)
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host) slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode, "contentlength", resp.ContentLength)
if len(os.Getenv("DEBUGHTTP")) > 0 { // first try
fmt.Println("DEBUGHTTP Request ===>") resp, err := http.DefaultTransport.RoundTrip(req)
bytes, _ := httputil.DumpRequestOut(req, true) if err == nil {
fmt.Printf("%s\n", bytes) slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength)
}
fmt.Println("<=== DEBUGHTTP Response") // enter retry check and loop, if first req were successfull, leave loop immediately
for header, value := range resp.Header { retries := 0
fmt.Printf("%s: %s\n", header, value) for shouldRetry(err, resp) && retries < RetryCount {
time.Sleep(backoff(retries))
// consume any response to reuse the connection.
drainBody(resp)
// clone the request body again
if req.Body != nil {
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
} }
fmt.Printf("Status: %s %s\nContent-Length: %d\n\n\n", resp.Proto, resp.Status, resp.ContentLength)
// actual retry
resp, err = http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength, "retry", retries)
}
retries++
} }
return resp, err return resp, err

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands" .TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -142,18 +142,19 @@
kleingebaeck \- kleinanzeigen.de backup tool kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSIS" .SH "SYNOPSIS"
.IX Header "SYNOPSIS" .IX Header "SYNOPSIS"
.Vb 11 .Vb 12
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...] \& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options: \& Options:
\& \-\-user \-u <uid> Backup ads from user with uid <uid>. \& \-u \-\-user <uid> Backup ads from user with uid <uid>.
\& \-\-debug \-d Enable debug output. \& \-d \-\-debug Enable debug output.
\& \-\-verbose \-v Enable verbose output. \& \-v \-\-verbose Enable verbose output.
\& \-\-outdir \-o <dir> Set output dir (default: current directory) \& \-o \-\-outdir <dir> Set output dir (default: current directory)
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all. \& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck). \& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-manual \-m Show manual. \& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-\-help \-h Show usage. \& \-m \-\-manual Show manual.
\& \-\-version \-V Show program version. \& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve .Ve
.SH "DESCRIPTION" .SH "DESCRIPTION"
.IX Header "DESCRIPTION" .IX Header "DESCRIPTION"

View File

@@ -7,15 +7,16 @@ NAME
SYNOPSIS SYNOPSIS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
DESCRIPTION DESCRIPTION
This tool can be used to backup ads on the german ad page This tool can be used to backup ads on the german ad page

View File

@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION =head1 DESCRIPTION

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by

View File

@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
const ( const (
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1` INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar` INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
INVALIDURI string = `https://foo.bar/weird/things` INVALIDURI string = `https://foo.bar/weird/things`
@@ -228,6 +229,12 @@ var invalidtests = []Tests{
expect: "error loading config file", expect: "error loading config file",
exitcode: 1, exitcode: 1,
}, },
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
} }
type AdConfig struct { type AdConfig struct {
@@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource {
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"), content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
status: 404, status: 404,
}, },
{
// valid ad page but 503
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
status: 503,
},
} }
return ads return ads

View File

@@ -44,9 +44,6 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
return nil, err return nil, err
} }
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
if res.StatusCode != 200 { if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP") return nil, errors.New("could not get page via HTTP")
} }
@@ -165,11 +162,10 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
imguri := imguri imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error { g.Go(func() error {
err := Getimage(imguri, file, client) err := Getimage(c, imguri, file, client)
if err != nil { if err != nil {
return err return err
} }
slog.Info("wrote ad image", "image", file)
return nil return nil
}) })
@@ -186,10 +182,13 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
} }
// fetch an image // fetch an image
func Getimage(uri, fileName string, client *http.Client) error { func Getimage(c *Config, uri, fileName string, client *http.Client) error {
slog.Debug("fetching ad image", "uri", uri) slog.Debug("fetching ad image", "uri", uri)
req, err := http.NewRequest("GET", uri, nil) req, err := http.NewRequest("GET", uri, nil)
if err != nil { if err != nil {
if c.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
}
return err return err
} }
@@ -210,5 +209,6 @@ func Getimage(uri, fileName string, client *http.Client) error {
return err return err
} }
slog.Info("wrote ad image", "image", fileName)
return nil return nil
} }