mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-17 20:41:01 +01:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
78e5de61d2 | ||
|
|
f4a9a9895c | ||
|
|
ac5b0608d8 |
18
README.md
18
README.md
@@ -99,14 +99,16 @@ To install after building either copy the binary or execute `sudo make install`.
|
|||||||
```
|
```
|
||||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||||
Options:
|
Options:
|
||||||
--user -u <uid> Backup ads from user with uid <uid>.
|
-u --user <uid> Backup ads from user with uid <uid>.
|
||||||
--debug -d Enable debug output.
|
-d --debug Enable debug output.
|
||||||
--verbose -v Enable verbose output.
|
-v --verbose Enable verbose output.
|
||||||
--outdir -o <dir> Set output dir (default: current directory)
|
-o --outdir <dir> Set output dir (default: current directory)
|
||||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||||
--manual -m Show manual.
|
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||||
--help -h Show usage.
|
-m --manual Show manual.
|
||||||
|
-h --help Show usage.
|
||||||
|
-V --version Show program version.
|
||||||
|
|
||||||
If one or more <ad-listing-url>'s are specified, only backup those,
|
If one or more <ad-listing-url>'s are specified, only backup those,
|
||||||
otherwise backup all ads of the given user.
|
otherwise backup all ads of the given user.
|
||||||
|
|||||||
24
config.go
24
config.go
@@ -33,7 +33,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
VERSION string = "0.1.1"
|
VERSION string = "0.1.2"
|
||||||
Baseuri string = "https://www.kleinanzeigen.de"
|
Baseuri string = "https://www.kleinanzeigen.de"
|
||||||
Listuri string = "/s-bestandsliste.html"
|
Listuri string = "/s-bestandsliste.html"
|
||||||
Defaultdir string = "."
|
Defaultdir string = "."
|
||||||
@@ -43,7 +43,7 @@ const (
|
|||||||
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
|
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
|
||||||
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
|
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
DefaultAdNameTemplate string = "{{.Slug}}-{{.Id}}"
|
DefaultAdNameTemplate string = "{{.Slug}}"
|
||||||
)
|
)
|
||||||
|
|
||||||
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
|
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
|
||||||
@@ -51,15 +51,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
|
|||||||
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
|
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--user -u <uid> Backup ads from user with uid <uid>.
|
-u --user <uid> Backup ads from user with uid <uid>.
|
||||||
--debug -d Enable debug output.
|
-d --debug Enable debug output.
|
||||||
--verbose -v Enable verbose output.
|
-v --verbose Enable verbose output.
|
||||||
--outdir -o <dir> Set output dir (default: current directory)
|
-o --outdir <dir> Set output dir (default: current directory)
|
||||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||||
--manual -m Show manual.
|
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||||
--help -h Show usage.
|
-m --manual Show manual.
|
||||||
--version -V Show program version.
|
-h --help Show usage.
|
||||||
|
-V --version Show program version.
|
||||||
|
|
||||||
If one or more ad listing url's are specified, only backup those,
|
If one or more ad listing url's are specified, only backup those,
|
||||||
otherwise backup all ads of the given user.`
|
otherwise backup all ads of the given user.`
|
||||||
@@ -76,6 +77,7 @@ type Config struct {
|
|||||||
Adnametemplate string `koanf:"adnametemplate"`
|
Adnametemplate string `koanf:"adnametemplate"`
|
||||||
Loglevel string `koanf:"loglevel"`
|
Loglevel string `koanf:"loglevel"`
|
||||||
Limit int `koanf:"limit"`
|
Limit int `koanf:"limit"`
|
||||||
|
IgnoreErrors bool `koanf:"ignoreerrors"`
|
||||||
Adlinks []string
|
Adlinks []string
|
||||||
StatsCountAds int
|
StatsCountAds int
|
||||||
StatsCountImages int
|
StatsCountImages int
|
||||||
|
|||||||
124
http.go
Normal file
124
http.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
/*
|
||||||
|
Copyright © 2023-2024 Thomas von Dein
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"math"
|
||||||
|
"math/rand"
|
||||||
|
"net/http"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// letters is the alphabet request ids are drawn from (upper case hex digits).
var letters = []rune("ABCDEF0123456789")

// getid returns a random id consisting of 8 characters taken from
// letters. The id is used to tell apart the requests and responses in
// the debug log.
func getid() string {
	const idlen = 8

	id := make([]rune, 0, idlen)
	for len(id) < idlen {
		id = append(id, letters[rand.Intn(len(letters))])
	}

	return string(id)
}
|
||||||
|
|
||||||
|
// RetryCount is the maximum number of retries done by the logging
// transport after the first attempt of a request failed.
const RetryCount = 3
|
||||||
|
|
||||||
|
// loggingTransport is a http.RoundTripper which logs every request and
// response on debug level and retries failed requests.
type loggingTransport struct{}
|
||||||
|
|
||||||
|
// backoff returns the time to wait before the next retry. The delay
// doubles with every retry: 2^retries seconds, that is 1s, 2s, 4s, ...
// for retries 0, 1, 2, ...
func backoff(retries int) time.Duration {
	seconds := math.Pow(2, float64(retries))

	return time.Duration(seconds) * time.Second
}
|
||||||
|
|
||||||
|
// shouldRetry reports whether a request should be sent again. This is
// the case if the transport returned an error or if the server
// answered with one of the HTTP status codes 502 (bad gateway), 503
// (service unavailable) or 504 (gateway timeout). resp is only looked
// at if err is nil.
func shouldRetry(err error, resp *http.Response) bool {
	if err != nil {
		return true
	}

	switch resp.StatusCode {
	case http.StatusBadGateway,
		http.StatusServiceUnavailable,
		http.StatusGatewayTimeout:
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
|
// drainBody reads the remaining body of resp and closes it. The body
// of a response has to be read to the end and closed, otherwise the
// underlying connection cannot be reused for the next request.
//
// A nil response or a response without body is ignored. Failing to
// read the body (e.g. because the peer dropped the connection) is not
// fatal: the response is being thrown away anyway, so the error is
// only logged on debug level and the body is closed regardless.
func drainBody(resp *http.Response) {
	if resp == nil || resp.Body == nil {
		return
	}

	// close in any case, even if draining fails
	defer resp.Body.Close()

	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		slog.Debug("failed to drain response body", "error", err)
	}
}
|
||||||
|
|
||||||
|
// our logging transport with retries
|
||||||
|
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||||
|
// just requred for debugging
|
||||||
|
id := getid()
|
||||||
|
|
||||||
|
// clone the request body, put into request on retry
|
||||||
|
var bodyBytes []byte
|
||||||
|
if req.Body != nil {
|
||||||
|
bodyBytes, _ = io.ReadAll(req.Body)
|
||||||
|
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
|
||||||
|
|
||||||
|
// first try
|
||||||
|
resp, err := http.DefaultTransport.RoundTrip(req)
|
||||||
|
if err == nil {
|
||||||
|
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
|
||||||
|
"contentlength", resp.ContentLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
// enter retry check and loop, if first req were successfull, leave loop immediately
|
||||||
|
retries := 0
|
||||||
|
for shouldRetry(err, resp) && retries < RetryCount {
|
||||||
|
time.Sleep(backoff(retries))
|
||||||
|
|
||||||
|
// consume any response to reuse the connection.
|
||||||
|
drainBody(resp)
|
||||||
|
|
||||||
|
// clone the request body again
|
||||||
|
if req.Body != nil {
|
||||||
|
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// actual retry
|
||||||
|
resp, err = http.DefaultTransport.RoundTrip(req)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
|
||||||
|
"contentlength", resp.ContentLength, "retry", retries)
|
||||||
|
}
|
||||||
|
|
||||||
|
retries++
|
||||||
|
}
|
||||||
|
|
||||||
|
return resp, err
|
||||||
|
}
|
||||||
@@ -133,7 +133,7 @@
|
|||||||
.\" ========================================================================
|
.\" ========================================================================
|
||||||
.\"
|
.\"
|
||||||
.IX Title "KLEINGEBAECK 1"
|
.IX Title "KLEINGEBAECK 1"
|
||||||
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands"
|
.TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands"
|
||||||
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
||||||
.\" way too many mistakes in technical documents.
|
.\" way too many mistakes in technical documents.
|
||||||
.if n .ad l
|
.if n .ad l
|
||||||
@@ -142,18 +142,19 @@
|
|||||||
kleingebaeck \- kleinanzeigen.de backup tool
|
kleingebaeck \- kleinanzeigen.de backup tool
|
||||||
.SH "SYNOPSYS"
|
.SH "SYNOPSYS"
|
||||||
.IX Header "SYNOPSYS"
|
.IX Header "SYNOPSYS"
|
||||||
.Vb 11
|
.Vb 12
|
||||||
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
|
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
|
||||||
\& Options:
|
\& Options:
|
||||||
\& \-\-user \-u <uid> Backup ads from user with uid <uid>.
|
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
|
||||||
\& \-\-debug \-d Enable debug output.
|
\& \-d \-\-debug Enable debug output.
|
||||||
\& \-\-verbose \-v Enable verbose output.
|
\& \-v \-\-verbose Enable verbose output.
|
||||||
\& \-\-outdir \-o <dir> Set output dir (default: current directory)
|
\& \-o \-\-outdir <dir> Set output dir (default: current directory)
|
||||||
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all.
|
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
|
||||||
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck).
|
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||||
\& \-\-manual \-m Show manual.
|
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||||
\& \-\-help \-h Show usage.
|
\& \-m \-\-manual Show manual.
|
||||||
\& \-\-version \-V Show program version.
|
\& \-h \-\-help Show usage.
|
||||||
|
\& \-V \-\-version Show program version.
|
||||||
.Ve
|
.Ve
|
||||||
.SH "DESCRIPTION"
|
.SH "DESCRIPTION"
|
||||||
.IX Header "DESCRIPTION"
|
.IX Header "DESCRIPTION"
|
||||||
|
|||||||
@@ -7,15 +7,16 @@ NAME
|
|||||||
SYNOPSYS
|
SYNOPSYS
|
||||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||||
Options:
|
Options:
|
||||||
--user -u <uid> Backup ads from user with uid <uid>.
|
-u --user <uid> Backup ads from user with uid <uid>.
|
||||||
--debug -d Enable debug output.
|
-d --debug Enable debug output.
|
||||||
--verbose -v Enable verbose output.
|
-v --verbose Enable verbose output.
|
||||||
--outdir -o <dir> Set output dir (default: current directory)
|
-o --outdir <dir> Set output dir (default: current directory)
|
||||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||||
--manual -m Show manual.
|
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||||
--help -h Show usage.
|
-m --manual Show manual.
|
||||||
--version -V Show program version.
|
-h --help Show usage.
|
||||||
|
-V --version Show program version.
|
||||||
|
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
This tool can be used to backup ads on the german ad page
|
This tool can be used to backup ads on the german ad page
|
||||||
|
|||||||
@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
|
|||||||
|
|
||||||
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
|
||||||
Options:
|
Options:
|
||||||
--user -u <uid> Backup ads from user with uid <uid>.
|
-u --user <uid> Backup ads from user with uid <uid>.
|
||||||
--debug -d Enable debug output.
|
-d --debug Enable debug output.
|
||||||
--verbose -v Enable verbose output.
|
-v --verbose Enable verbose output.
|
||||||
--outdir -o <dir> Set output dir (default: current directory)
|
-o --outdir <dir> Set output dir (default: current directory)
|
||||||
--limit -l <num> Limit the ads to download to <num>, default: load all.
|
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||||
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
|
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||||
--manual -m Show manual.
|
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||||
--help -h Show usage.
|
-m --manual Show manual.
|
||||||
--version -V Show program version.
|
-h --help Show usage.
|
||||||
|
-V --version Show program version.
|
||||||
|
|
||||||
=head1 DESCRIPTION
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
|||||||
10
main.go
10
main.go
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright © 2023 Thomas von Dein
|
Copyright © 2023-2024 Thomas von Dein
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@@ -22,6 +22,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
|
|
||||||
@@ -111,17 +112,20 @@ func Main(w io.Writer) int {
|
|||||||
return Die(err)
|
return Die(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// used for all HTTP requests
|
||||||
|
client := &http.Client{Transport: &loggingTransport{}}
|
||||||
|
|
||||||
if len(conf.Adlinks) >= 1 {
|
if len(conf.Adlinks) >= 1 {
|
||||||
// directly backup ad listing[s]
|
// directly backup ad listing[s]
|
||||||
for _, uri := range conf.Adlinks {
|
for _, uri := range conf.Adlinks {
|
||||||
err := Scrape(conf, uri)
|
err := ScrapeAd(conf, uri, client)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Die(err)
|
return Die(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if conf.User > 0 {
|
} else if conf.User > 0 {
|
||||||
// backup all ads of the given user (via config or cmdline)
|
// backup all ads of the given user (via config or cmdline)
|
||||||
err := Start(conf)
|
err := ScrapeUser(conf, client)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Die(err)
|
return Die(err)
|
||||||
}
|
}
|
||||||
|
|||||||
21
main_test.go
21
main_test.go
@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
|
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
|
||||||
|
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
|
||||||
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
|
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
|
||||||
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
|
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
|
||||||
INVALIDURI string = `https://foo.bar/weird/things`
|
INVALIDURI string = `https://foo.bar/weird/things`
|
||||||
@@ -228,6 +229,12 @@ var invalidtests = []Tests{
|
|||||||
expect: "error loading config file",
|
expect: "error loading config file",
|
||||||
exitcode: 1,
|
exitcode: 1,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "503",
|
||||||
|
args: base + " " + INVALID503URI,
|
||||||
|
expect: "could not get page via HTTP",
|
||||||
|
exitcode: 1,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
type AdConfig struct {
|
type AdConfig struct {
|
||||||
@@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource {
|
|||||||
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
|
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
|
||||||
status: 404,
|
status: 404,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
// valid ad page but 503
|
||||||
|
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
|
||||||
|
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
|
||||||
|
status: 503,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
return ads
|
return ads
|
||||||
@@ -438,16 +451,18 @@ func SetIntercept(ads []Adsource) {
|
|||||||
|
|
||||||
// we just use 2 images, put this here
|
// we just use 2 images, put this here
|
||||||
for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
|
for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
|
||||||
httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image)))
|
httpmock.RegisterResponder("GET", image,
|
||||||
|
httpmock.NewBytesResponder(200, GetImage(image)))
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func VerifyAd(ad AdConfig) error {
|
func VerifyAd(ad AdConfig) error {
|
||||||
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created
|
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
|
||||||
|
ad.Category + ad.Condition + ad.Created
|
||||||
|
|
||||||
// prepare ad dir name using DefaultAdNameTemplate
|
// prepare ad dir name using DefaultAdNameTemplate
|
||||||
c := Config{Adnametemplate: DefaultAdNameTemplate}
|
c := Config{Adnametemplate: "{{ .Slug }}"}
|
||||||
adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
|
adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
|
||||||
addir, err := AdDirName(&c, &adstruct)
|
addir, err := AdDirName(&c, &adstruct)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
33
scrape.go
33
scrape.go
@@ -44,9 +44,6 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Debug("response", "code", res.StatusCode, "status",
|
|
||||||
res.Status, "size", res.ContentLength)
|
|
||||||
|
|
||||||
if res.StatusCode != 200 {
|
if res.StatusCode != 200 {
|
||||||
return nil, errors.New("could not get page via HTTP")
|
return nil, errors.New("could not get page via HTTP")
|
||||||
}
|
}
|
||||||
@@ -56,8 +53,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
|||||||
|
|
||||||
// extract links from all ad listing pages (that is: use pagination)
|
// extract links from all ad listing pages (that is: use pagination)
|
||||||
// and scrape every page
|
// and scrape every page
|
||||||
func Start(conf *Config) error {
|
func ScrapeUser(conf *Config, client *http.Client) error {
|
||||||
client := &http.Client{}
|
|
||||||
adlinks := []string{}
|
adlinks := []string{}
|
||||||
|
|
||||||
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
|
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
|
||||||
@@ -96,7 +92,7 @@ func Start(conf *Config) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i, adlink := range adlinks {
|
for i, adlink := range adlinks {
|
||||||
err := Scrape(conf, Baseuri+adlink)
|
err := ScrapeAd(conf, Baseuri+adlink, client)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -110,8 +106,7 @@ func Start(conf *Config) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// scrape an ad. uri is the full uri of the ad, dir is the basedir
|
// scrape an ad. uri is the full uri of the ad, dir is the basedir
|
||||||
func Scrape(c *Config, uri string) error {
|
func ScrapeAd(c *Config, uri string, client *http.Client) error {
|
||||||
client := &http.Client{}
|
|
||||||
ad := &Ad{}
|
ad := &Ad{}
|
||||||
|
|
||||||
// extract slug and id from uri
|
// extract slug and id from uri
|
||||||
@@ -155,10 +150,10 @@ func Scrape(c *Config, uri string) error {
|
|||||||
|
|
||||||
c.IncrAds()
|
c.IncrAds()
|
||||||
|
|
||||||
return ScrapeImages(c, ad, addir)
|
return ScrapeImages(c, ad, addir, client)
|
||||||
}
|
}
|
||||||
|
|
||||||
func ScrapeImages(c *Config, ad *Ad, addir string) error {
|
func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
|
||||||
// fetch images
|
// fetch images
|
||||||
img := 1
|
img := 1
|
||||||
g := new(errgroup.Group)
|
g := new(errgroup.Group)
|
||||||
@@ -167,11 +162,10 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
|
|||||||
imguri := imguri
|
imguri := imguri
|
||||||
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
|
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
|
||||||
g.Go(func() error {
|
g.Go(func() error {
|
||||||
err := Getimage(imguri, file)
|
err := Getimage(c, imguri, file, client)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
slog.Info("wrote ad image", "image", file)
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
@@ -188,9 +182,19 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// fetch an image
|
// fetch an image
|
||||||
func Getimage(uri, fileName string) error {
|
func Getimage(c *Config, uri, fileName string, client *http.Client) error {
|
||||||
slog.Debug("fetching ad image", "uri", uri)
|
slog.Debug("fetching ad image", "uri", uri)
|
||||||
response, err := http.Get(uri)
|
req, err := http.NewRequest("GET", uri, nil)
|
||||||
|
if err != nil {
|
||||||
|
if c.IgnoreErrors {
|
||||||
|
slog.Info("Failed to download image, error ignored", "error", err.Error())
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
req.Header.Set("User-Agent", Useragent)
|
||||||
|
|
||||||
|
response, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -205,5 +209,6 @@ func Getimage(uri, fileName string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Info("wrote ad image", "image", fileName)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user