mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-16 20:11:01 +01:00
Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
This commit is contained in:
18
README.md
18
README.md
@@ -99,14 +99,16 @@ To install after building either copy the binary or execute `sudo make install`.
|
||||
```
|
||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||
Options:
|
||||
--user -u <uid> Backup ads from user with uid <uid>.
|
||||
--debug -d Enable debug output.
|
||||
--verbose -v Enable verbose output.
|
||||
--outdir -o <dir> Set output dir (default: current directory)
|
||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--manual -m Show manual.
|
||||
--help -h Show usage.
|
||||
-u --user <uid> Backup ads from user with uid <uid>.
|
||||
-d --debug Enable debug output.
|
||||
-v --verbose Enable verbose output.
|
||||
-o --outdir <dir> Set output dir (default: current directory)
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
If one or more <ad-listing-url>'s are specified, only backup those,
|
||||
otherwise backup all ads of the given user.
|
||||
|
||||
22
config.go
22
config.go
@@ -33,7 +33,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
VERSION string = "0.1.1"
|
||||
VERSION string = "0.1.2"
|
||||
Baseuri string = "https://www.kleinanzeigen.de"
|
||||
Listuri string = "/s-bestandsliste.html"
|
||||
Defaultdir string = "."
|
||||
@@ -51,15 +51,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
|
||||
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
|
||||
|
||||
Options:
|
||||
--user -u <uid> Backup ads from user with uid <uid>.
|
||||
--debug -d Enable debug output.
|
||||
--verbose -v Enable verbose output.
|
||||
--outdir -o <dir> Set output dir (default: current directory)
|
||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--manual -m Show manual.
|
||||
--help -h Show usage.
|
||||
--version -V Show program version.
|
||||
-u --user <uid> Backup ads from user with uid <uid>.
|
||||
-d --debug Enable debug output.
|
||||
-v --verbose Enable verbose output.
|
||||
-o --outdir <dir> Set output dir (default: current directory)
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
If one or more ad listing url's are specified, only backup those,
|
||||
otherwise backup all ads of the given user.`
|
||||
@@ -76,6 +77,7 @@ type Config struct {
|
||||
Adnametemplate string `koanf:"adnametemplate"`
|
||||
Loglevel string `koanf:"loglevel"`
|
||||
Limit int `koanf:"limit"`
|
||||
IgnoreErrors bool `koanf:"ignoreerrors"`
|
||||
Adlinks []string
|
||||
StatsCountAds int
|
||||
StatsCountImages int
|
||||
|
||||
96
http.go
96
http.go
@@ -15,22 +15,18 @@ You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// FIXME: we could also incorporate
|
||||
// https://github.com/kdkumawat/golang/blob/main/http-retry/http/retry-client.go
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"bytes"
|
||||
"io"
|
||||
"log/slog"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
type loggingTransport struct{}
|
||||
|
||||
var letters = []rune("ABCDEF0123456789")
|
||||
|
||||
func getid() string {
|
||||
@@ -41,25 +37,87 @@ func getid() string {
|
||||
return string(b)
|
||||
}
|
||||
|
||||
func (s *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
resp, err := http.DefaultTransport.RoundTrip(req)
|
||||
// RetryCount is the upper bound on how many times a request is retried
// after the first attempt has failed.
const RetryCount = 3
|
||||
|
||||
// loggingTransport is a stateless HTTP transport: its RoundTrip method
// logs each request/response pair and retries failed requests.
type loggingTransport struct{}
|
||||
|
||||
// backoff returns the escalating delay to wait before the given
// (zero-based) retry: 2^retries seconds, i.e. 1s, 2s, 4s, ...
//
// Negative counts yield a zero delay (the fractional second is truncated).
// Counts so large that the result would not fit into a time.Duration
// saturate at the maximum representable duration instead of overflowing
// into a zero or negative sleep.
func backoff(retries int) time.Duration {
	// largest whole number of seconds a time.Duration can hold
	const maxSeconds = math.MaxInt64 / int64(time.Second)

	seconds := math.Pow(2, float64(retries))
	if seconds > float64(maxSeconds) {
		return time.Duration(math.MaxInt64)
	}

	return time.Duration(seconds) * time.Second
}
|
||||
|
||||
// shouldRetry reports whether a request is worth another attempt.
//
// A retry is warranted when the transport itself failed (err != nil) or
// when the server answered with one of the gateway-style status codes
// 502 (Bad Gateway), 503 (Service Unavailable) or 504 (Gateway Timeout).
// Every other outcome is considered final.
func shouldRetry(err error, resp *http.Response) bool {
	if err != nil {
		return true
	}

	// Defensive: without an error there should always be a response, but
	// dereferencing a nil one here would crash the whole program.
	if resp == nil {
		return false
	}

	switch resp.StatusCode {
	case http.StatusBadGateway,
		http.StatusServiceUnavailable,
		http.StatusGatewayTimeout:
		return true
	default:
		return false
	}
}
|
||||
|
||||
// drainBody consumes whatever is left of resp.Body and closes it, so the
// response of a failed attempt does not keep its connection occupied
// before the next attempt is made.
//
// A nil response or nil body is a no-op. A read error while draining is
// only logged: the body is discarded anyway, and a broken body must not
// bring the whole program down. The body is closed in every case.
func drainBody(resp *http.Response) {
	if resp == nil || resp.Body == nil {
		return
	}

	// close even if draining fails, otherwise the body would leak
	defer resp.Body.Close()

	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		slog.Debug("failed to drain response body", "error", err.Error())
	}
}
|
||||
|
||||
// our logging transport with retries
|
||||
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
// just required for debugging
|
||||
id := getid()
|
||||
|
||||
// clone the request body, put into request on retry
|
||||
var bodyBytes []byte
|
||||
if req.Body != nil {
|
||||
bodyBytes, _ = io.ReadAll(req.Body)
|
||||
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||
}
|
||||
|
||||
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
|
||||
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode, "contentlength", resp.ContentLength)
|
||||
|
||||
if len(os.Getenv("DEBUGHTTP")) > 0 {
|
||||
fmt.Println("DEBUGHTTP Request ===>")
|
||||
bytes, _ := httputil.DumpRequestOut(req, true)
|
||||
fmt.Printf("%s\n", bytes)
|
||||
// first try
|
||||
resp, err := http.DefaultTransport.RoundTrip(req)
|
||||
if err == nil {
|
||||
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
|
||||
"contentlength", resp.ContentLength)
|
||||
}
|
||||
|
||||
fmt.Println("<=== DEBUGHTTP Response")
|
||||
for header, value := range resp.Header {
|
||||
fmt.Printf("%s: %s\n", header, value)
|
||||
// enter retry check and loop; if the first request was successful, leave the loop immediately
|
||||
retries := 0
|
||||
for shouldRetry(err, resp) && retries < RetryCount {
|
||||
time.Sleep(backoff(retries))
|
||||
|
||||
// consume any response to reuse the connection.
|
||||
drainBody(resp)
|
||||
|
||||
// clone the request body again
|
||||
if req.Body != nil {
|
||||
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||
}
|
||||
fmt.Printf("Status: %s %s\nContent-Length: %d\n\n\n", resp.Proto, resp.Status, resp.ContentLength)
|
||||
|
||||
// actual retry
|
||||
resp, err = http.DefaultTransport.RoundTrip(req)
|
||||
|
||||
if err == nil {
|
||||
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
|
||||
"contentlength", resp.ContentLength, "retry", retries)
|
||||
}
|
||||
|
||||
retries++
|
||||
}
|
||||
|
||||
return resp, err
|
||||
|
||||
@@ -133,7 +133,7 @@
|
||||
.\" ========================================================================
|
||||
.\"
|
||||
.IX Title "KLEINGEBAECK 1"
|
||||
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands"
|
||||
.TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands"
|
||||
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
||||
.\" way too many mistakes in technical documents.
|
||||
.if n .ad l
|
||||
@@ -142,18 +142,19 @@
|
||||
kleingebaeck \- kleinanzeigen.de backup tool
|
||||
.SH "SYNOPSYS"
|
||||
.IX Header "SYNOPSYS"
|
||||
.Vb 11
|
||||
.Vb 12
|
||||
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
|
||||
\& Options:
|
||||
\& \-\-user \-u <uid> Backup ads from user with uid <uid>.
|
||||
\& \-\-debug \-d Enable debug output.
|
||||
\& \-\-verbose \-v Enable verbose output.
|
||||
\& \-\-outdir \-o <dir> Set output dir (default: current directory)
|
||||
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all.
|
||||
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
\& \-\-manual \-m Show manual.
|
||||
\& \-\-help \-h Show usage.
|
||||
\& \-\-version \-V Show program version.
|
||||
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
|
||||
\& \-d \-\-debug Enable debug output.
|
||||
\& \-v \-\-verbose Enable verbose output.
|
||||
\& \-o \-\-outdir <dir> Set output dir (default: current directory)
|
||||
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
|
||||
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
\& \-m \-\-manual Show manual.
|
||||
\& \-h \-\-help Show usage.
|
||||
\& \-V \-\-version Show program version.
|
||||
.Ve
|
||||
.SH "DESCRIPTION"
|
||||
.IX Header "DESCRIPTION"
|
||||
|
||||
@@ -7,15 +7,16 @@ NAME
|
||||
SYNOPSYS
|
||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||
Options:
|
||||
--user -u <uid> Backup ads from user with uid <uid>.
|
||||
--debug -d Enable debug output.
|
||||
--verbose -v Enable verbose output.
|
||||
--outdir -o <dir> Set output dir (default: current directory)
|
||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--manual -m Show manual.
|
||||
--help -h Show usage.
|
||||
--version -V Show program version.
|
||||
-u --user <uid> Backup ads from user with uid <uid>.
|
||||
-d --debug Enable debug output.
|
||||
-v --verbose Enable verbose output.
|
||||
-o --outdir <dir> Set output dir (default: current directory)
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
DESCRIPTION
|
||||
This tool can be used to backup ads on the german ad page
|
||||
|
||||
@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
|
||||
|
||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||
Options:
|
||||
--user -u <uid> Backup ads from user with uid <uid>.
|
||||
--debug -d Enable debug output.
|
||||
--verbose -v Enable verbose output.
|
||||
--outdir -o <dir> Set output dir (default: current directory)
|
||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--manual -m Show manual.
|
||||
--help -h Show usage.
|
||||
--version -V Show program version.
|
||||
-u --user <uid> Backup ads from user with uid <uid>.
|
||||
-d --debug Enable debug output.
|
||||
-v --verbose Enable verbose output.
|
||||
-o --outdir <dir> Set output dir (default: current directory)
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
|
||||
2
main.go
2
main.go
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright © 2023 Thomas von Dein
|
||||
Copyright © 2023-2024 Thomas von Dein
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
||||
13
main_test.go
13
main_test.go
@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
|
||||
|
||||
const (
|
||||
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
|
||||
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
|
||||
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
|
||||
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
|
||||
INVALIDURI string = `https://foo.bar/weird/things`
|
||||
@@ -228,6 +229,12 @@ var invalidtests = []Tests{
|
||||
expect: "error loading config file",
|
||||
exitcode: 1,
|
||||
},
|
||||
{
|
||||
name: "503",
|
||||
args: base + " " + INVALID503URI,
|
||||
expect: "could not get page via HTTP",
|
||||
exitcode: 1,
|
||||
},
|
||||
}
|
||||
|
||||
type AdConfig struct {
|
||||
@@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource {
|
||||
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
|
||||
status: 404,
|
||||
},
|
||||
{
|
||||
// valid ad page but 503
|
||||
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
|
||||
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
|
||||
status: 503,
|
||||
},
|
||||
}
|
||||
|
||||
return ads
|
||||
|
||||
12
scrape.go
12
scrape.go
@@ -44,9 +44,6 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Debug("response", "code", res.StatusCode, "status",
|
||||
res.Status, "size", res.ContentLength)
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, errors.New("could not get page via HTTP")
|
||||
}
|
||||
@@ -165,11 +162,10 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
|
||||
imguri := imguri
|
||||
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
|
||||
g.Go(func() error {
|
||||
err := Getimage(imguri, file, client)
|
||||
err := Getimage(c, imguri, file, client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
slog.Info("wrote ad image", "image", file)
|
||||
|
||||
return nil
|
||||
})
|
||||
@@ -186,10 +182,13 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
|
||||
}
|
||||
|
||||
// fetch an image
|
||||
func Getimage(uri, fileName string, client *http.Client) error {
|
||||
func Getimage(c *Config, uri, fileName string, client *http.Client) error {
|
||||
slog.Debug("fetching ad image", "uri", uri)
|
||||
req, err := http.NewRequest("GET", uri, nil)
|
||||
if err != nil {
|
||||
if c.IgnoreErrors {
|
||||
slog.Info("Failed to download image, error ignored", "error", err.Error())
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -210,5 +209,6 @@ func Getimage(uri, fileName string, client *http.Client) error {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Info("wrote ad image", "image", fileName)
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user