Compare commits

..

3 Commits

Author SHA1 Message Date
T.v.Dein
78e5de61d2 Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
2024-01-16 13:20:16 +01:00
T.v.Dein
f4a9a9895c Enhancement/http (#32)
* added HTTP debug logging using `-d` or `DEBUGHTTP=1` (headers only)
2024-01-16 13:20:16 +01:00
T.v.Dein
ac5b0608d8 fix #30: revert default adnamedir to just use the slug as before (#31) 2024-01-16 13:20:16 +01:00
9 changed files with 223 additions and 68 deletions

View File

@@ -99,14 +99,16 @@ To install after building either copy the binary or execute `sudo make install`.
``` ```
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more <ad-listing-url>'s are specified, only backup those, If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user. otherwise backup all ads of the given user.

View File

@@ -33,7 +33,7 @@ import (
) )
const ( const (
VERSION string = "0.1.1" VERSION string = "0.1.2"
Baseuri string = "https://www.kleinanzeigen.de" Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html" Listuri string = "/s-bestandsliste.html"
Defaultdir string = "." Defaultdir string = "."
@@ -43,7 +43,7 @@ const (
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n" "Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}-{{.Id}}" DefaultAdNameTemplate string = "{{.Slug}}"
) )
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
@@ -51,15 +51,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those, If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.` otherwise backup all ads of the given user.`
@@ -76,6 +77,7 @@ type Config struct {
Adnametemplate string `koanf:"adnametemplate"` Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"` Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"` Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
Adlinks []string Adlinks []string
StatsCountAds int StatsCountAds int
StatsCountImages int StatsCountImages int

124
http.go Normal file
View File

@@ -0,0 +1,124 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"io"
"log/slog"
"math"
"math/rand"
"net/http"
"time"
)
// letters is the alphabet request ids are drawn from (upper-case hex digits).
var letters = []rune("ABCDEF0123456789")

// getid returns a random id of 8 characters taken from letters. The id
// is not guaranteed to be unique.
func getid() string {
	const idlen = 8

	id := make([]rune, 0, idlen)
	for len(id) < idlen {
		id = append(id, letters[rand.Intn(len(letters))])
	}

	return string(id)
}
// RetryCount is the maximum number of retries RoundTrip attempts after
// the initial request.
const RetryCount = 3
// loggingTransport implements http.RoundTripper (see its RoundTrip
// method), adding debug logging and retries. It carries no state.
type loggingTransport struct{}
// backoff returns the escalating pause to wait before retry number
// retries (counted from 0): 2^retries seconds, i.e. 1s, 2s, 4s, ...
func backoff(retries int) time.Duration {
	seconds := math.Pow(2, float64(retries))

	return time.Duration(seconds) * time.Second
}
// shouldRetry reports whether a request deserves another attempt: that
// is the case when the transport returned an error or when the server
// answered with HTTP 502, 503 or 504. resp is only inspected when err
// is nil.
func shouldRetry(err error, resp *http.Response) bool {
	if err != nil {
		return true
	}

	switch resp.StatusCode {
	case http.StatusBadGateway,
		http.StatusServiceUnavailable,
		http.StatusGatewayTimeout:
		return true
	default:
		return false
	}
}
// drainBody reads the remaining response body and closes it, so that
// the underlying connection can be reused for the next attempt. It is
// a no-op for a nil response or a nil body.
//
// Draining is best effort: a read error (e.g. the peer reset the
// connection mid-body) is only logged at debug level. The body is
// closed in any case, which is all that is needed to release the
// connection.
func drainBody(resp *http.Response) {
	if resp == nil || resp.Body == nil {
		return
	}

	// close even if draining fails, otherwise the connection leaks
	defer resp.Body.Close()

	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		slog.Debug("could not drain response body", "error", err)
	}
}
// RoundTrip implements http.RoundTripper. It sends req through
// http.DefaultTransport, logs request and response metadata at debug
// level and retries the request up to RetryCount times when
// shouldRetry says so, pausing backoff(n) before retry n.
//
// The request body, if any, is buffered in memory so that it can be
// replayed on each retry. An error while reading it is returned
// without sending anything. The pause between retries is aborted as
// soon as the request's context is done, in which case the context
// error is returned.
//
// The response and error of the last attempt are returned, so the
// caller may still see a retryable status code once all retries are
// used up.
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	// random id, only used to correlate the debug lines of one request
	id := getid()

	// buffer the request body, it is put back into the request on retry
	var bodyBytes []byte
	if req.Body != nil {
		var err error
		bodyBytes, err = io.ReadAll(req.Body)

		// a RoundTripper has to close the body it was handed, the
		// close error is of no use for an already consumed body
		req.Body.Close()

		if err != nil {
			return nil, err
		}

		req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
	}

	slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)

	// first try
	resp, err := http.DefaultTransport.RoundTrip(req)
	if err == nil {
		slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
			"contentlength", resp.ContentLength)
	}

	// retry loop, not entered at all if the first try was successful
	for retries := 0; shouldRetry(err, resp) && retries < RetryCount; retries++ {
		// consume any response so the connection can be reused
		drainBody(resp)

		// escalating pause, given up early if the caller cancelled the
		// request, further attempts would be pointless then
		timer := time.NewTimer(backoff(retries))
		select {
		case <-timer.C:
		case <-req.Context().Done():
			timer.Stop()
			return nil, req.Context().Err()
		}

		// the previous attempt consumed the body, provide a fresh copy
		if req.Body != nil {
			req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
		}

		// actual retry
		resp, err = http.DefaultTransport.RoundTrip(req)
		if err == nil {
			slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
				"contentlength", resp.ContentLength, "retry", retries)
		}
	}

	return resp, err
}

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands" .TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -142,18 +142,19 @@
kleingebaeck \- kleinanzeigen.de backup tool kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS" .SH "SYNOPSYS"
.IX Header "SYNOPSYS" .IX Header "SYNOPSYS"
.Vb 11 .Vb 12
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...] \& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options: \& Options:
\& \-\-user \-u <uid> Backup ads from user with uid <uid>. \& \-u \-\-user <uid> Backup ads from user with uid <uid>.
\& \-\-debug \-d Enable debug output. \& \-d \-\-debug Enable debug output.
\& \-\-verbose \-v Enable verbose output. \& \-v \-\-verbose Enable verbose output.
\& \-\-outdir \-o <dir> Set output dir (default: current directory) \& \-o \-\-outdir <dir> Set output dir (default: current directory)
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all. \& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck). \& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-manual \-m Show manual. \& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-\-help \-h Show usage. \& \-m \-\-manual Show manual.
\& \-\-version \-V Show program version. \& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve .Ve
.SH "DESCRIPTION" .SH "DESCRIPTION"
.IX Header "DESCRIPTION" .IX Header "DESCRIPTION"

View File

@@ -7,15 +7,16 @@ NAME
SYNOPSYS SYNOPSYS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
DESCRIPTION DESCRIPTION
This tool can be used to backup ads on the german ad page This tool can be used to backup ads on the german ad page

View File

@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION =head1 DESCRIPTION

10
main.go
View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -22,6 +22,7 @@ import (
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
"net/http"
"os" "os"
"runtime/debug" "runtime/debug"
@@ -111,17 +112,20 @@ func Main(w io.Writer) int {
return Die(err) return Die(err)
} }
// used for all HTTP requests
client := &http.Client{Transport: &loggingTransport{}}
if len(conf.Adlinks) >= 1 { if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s] // directly backup ad listing[s]
for _, uri := range conf.Adlinks { for _, uri := range conf.Adlinks {
err := Scrape(conf, uri) err := ScrapeAd(conf, uri, client)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
} }
} else if conf.User > 0 { } else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline) // backup all ads of the given user (via config or cmdline)
err := Start(conf) err := ScrapeUser(conf, client)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }

View File

@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
const ( const (
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1` INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar` INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
INVALIDURI string = `https://foo.bar/weird/things` INVALIDURI string = `https://foo.bar/weird/things`
@@ -228,6 +229,12 @@ var invalidtests = []Tests{
expect: "error loading config file", expect: "error loading config file",
exitcode: 1, exitcode: 1,
}, },
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
} }
type AdConfig struct { type AdConfig struct {
@@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource {
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"), content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
status: 404, status: 404,
}, },
{
// valid ad page but 503
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
status: 503,
},
} }
return ads return ads
@@ -438,16 +451,18 @@ func SetIntercept(ads []Adsource) {
// we just use 2 images, put this here // we just use 2 images, put this here
for _, image := range []string{"t/1.jpg", "t/2.jpg"} { for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image))) httpmock.RegisterResponder("GET", image,
httpmock.NewBytesResponder(200, GetImage(image)))
} }
} }
func VerifyAd(ad AdConfig) error { func VerifyAd(ad AdConfig) error {
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
ad.Category + ad.Condition + ad.Created
// prepare ad dir name using DefaultAdNameTemplate // prepare ad dir name using DefaultAdNameTemplate
c := Config{Adnametemplate: DefaultAdNameTemplate} c := Config{Adnametemplate: "{{ .Slug }}"}
adstruct := Ad{Slug: ad.Slug, Id: ad.Id} adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
addir, err := AdDirName(&c, &adstruct) addir, err := AdDirName(&c, &adstruct)
if err != nil { if err != nil {

View File

@@ -44,9 +44,6 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
return nil, err return nil, err
} }
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
if res.StatusCode != 200 { if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP") return nil, errors.New("could not get page via HTTP")
} }
@@ -56,8 +53,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func Start(conf *Config) error { func ScrapeUser(conf *Config, client *http.Client) error {
client := &http.Client{}
adlinks := []string{} adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User) baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
@@ -96,7 +92,7 @@ func Start(conf *Config) error {
} }
for i, adlink := range adlinks { for i, adlink := range adlinks {
err := Scrape(conf, Baseuri+adlink) err := ScrapeAd(conf, Baseuri+adlink, client)
if err != nil { if err != nil {
return err return err
} }
@@ -110,8 +106,7 @@ func Start(conf *Config) error {
} }
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(c *Config, uri string) error { func ScrapeAd(c *Config, uri string, client *http.Client) error {
client := &http.Client{}
ad := &Ad{} ad := &Ad{}
// extract slug and id from uri // extract slug and id from uri
@@ -155,10 +150,10 @@ func Scrape(c *Config, uri string) error {
c.IncrAds() c.IncrAds()
return ScrapeImages(c, ad, addir) return ScrapeImages(c, ad, addir, client)
} }
func ScrapeImages(c *Config, ad *Ad, addir string) error { func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
// fetch images // fetch images
img := 1 img := 1
g := new(errgroup.Group) g := new(errgroup.Group)
@@ -167,11 +162,10 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
imguri := imguri imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error { g.Go(func() error {
err := Getimage(imguri, file) err := Getimage(c, imguri, file, client)
if err != nil { if err != nil {
return err return err
} }
slog.Info("wrote ad image", "image", file)
return nil return nil
}) })
@@ -188,9 +182,19 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
} }
// fetch an image // fetch an image
func Getimage(uri, fileName string) error { func Getimage(c *Config, uri, fileName string, client *http.Client) error {
slog.Debug("fetching ad image", "uri", uri) slog.Debug("fetching ad image", "uri", uri)
response, err := http.Get(uri) req, err := http.NewRequest("GET", uri, nil)
if err != nil {
if c.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
}
return err
}
req.Header.Set("User-Agent", Useragent)
response, err := client.Do(req)
if err != nil { if err != nil {
return err return err
} }
@@ -205,5 +209,6 @@ func Getimage(uri, fileName string) error {
return err return err
} }
slog.Info("wrote ad image", "image", fileName)
return nil return nil
} }