Compare commits

..

8 Commits

Author SHA1 Message Date
0726b95fd4 added docker instructions 2024-01-17 14:15:46 +01:00
21ca5c626f added environment variable support 2024-01-17 14:03:43 +01:00
7e2161d4d4 no bak 2024-01-17 13:58:21 +01:00
ef4d2dab16 added docker image support 2024-01-17 13:58:09 +01:00
T.v.Dein
3fd75fa53d refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00
T.v.Dein
78e5de61d2 Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
2024-01-16 13:20:16 +01:00
T.v.Dein
f4a9a9895c Enhancement/http (#32)
* added HTTP debug logging using `-d` or `DEBUGHTTP=1` (headers only)
2024-01-16 13:20:16 +01:00
T.v.Dein
ac5b0608d8 fix #30: revert default adnamedir to just use the slug as before (#31) 2024-01-16 13:20:16 +01:00
16 changed files with 471 additions and 122 deletions

28
.github/workflows/pushimage.yaml vendored Normal file
View File

@@ -0,0 +1,28 @@
name: build-push-image
on:
push:
tags:
- 'v*'
jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Log in to the Container registry
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
with:
registry: https://ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
push: true
tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name}}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ test
kleingebaeck
releases
t/out
.bak

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
FROM golang:1.21-alpine as builder
RUN apk update
RUN apk upgrade
RUN apk add --no-cache git make
RUN git --version
WORKDIR /work
COPY go.mod .
COPY . .
RUN go mod download
RUN make
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"
#RUN install -o 1001 -g 1001 -d /data
WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck
ENV KLEINGEBAECK_OUTDIR /backup
ENV LANG C.UTF-8
USER 1001:1001
ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

View File

@@ -94,19 +94,41 @@ installed - `make`.
To install after building either copy the binary or execute `sudo make install`.
### Using the docker image
A pre-built docker image is available, which you can use to test the
app without installing it. You need `docker-compose`. Copy the file
`docker-compose.yaml` to somewhere, cd to that directory and execute:
```shell
mkdir kleinanzeigen-backup
USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose run kleingebaeck -u XXX -v
```
`USER_ID` and `GROUP_ID` needs to be specified so that you are the
owner of the created backups. The backup directory must exist prior to
the execution, otherwise docker will create it as root, then
kleingebaeck will fail.
You may of course also modify the `docker-compose.yaml` to suit your needs.
If you want to build the image yourself, use the supplied Dockerfile.
## Commandline options:
```
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
--user -u <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output.
--verbose -v Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual.
--help -h Show usage.
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user.
@@ -126,6 +148,13 @@ loglevel = verbose
outdir = "test"
```
## Environment Variables
Kleingebaeck can also be configured using environment variables. Just prefix the config variables with `KLEINGEBAECK_` and put them to upper case. Eg:
```shell
% KLEINGEBAECK_OUTDIR=/backup kleingebaeck -v
```
## Usage
To setup the tool, you need to lookup your userid on

View File

@@ -23,9 +23,11 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"github.com/knadh/koanf/parsers/toml"
"github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2"
@@ -33,7 +35,7 @@ import (
)
const (
VERSION string = "0.1.1"
VERSION string = "0.1.2"
Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html"
Defaultdir string = "."
@@ -43,7 +45,7 @@ const (
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}-{{.Id}}"
DefaultAdNameTemplate string = "{{.Slug}}"
)
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
@@ -51,15 +53,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options:
--user -u <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output.
--verbose -v Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual.
--help -h Show usage.
--version -V Show program version.
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.`
@@ -76,6 +79,7 @@ type Config struct {
Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
Adlinks []string
StatsCountAds int
StatsCountImages int
@@ -160,7 +164,15 @@ func InitConfig(w io.Writer) (*Config, error) {
// else: we ignore the file if it doesn't exists
}
// command line overrides config file
// env overrides config file
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string {
return strings.Replace(strings.ToLower(
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1)
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error())
}
// command line overrides env
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error())
}

22
docker-compose.yaml Normal file
View File

@@ -0,0 +1,22 @@
version: "3.9"
services:
init:
image: alpine:latest
user: "root"
group_add:
- '${GROUP_ID}'
volumes:
- ${OUTDIR}:/backup
command: chown -R ${USER_ID}:${USER_ID} /backup
kleingebaeck:
container_name: kleingebaeck
user: "${USER_ID}:${USER_ID}"
volumes:
- ${OUTDIR}:/backup
working_dir: /backup
build: .
image: kleingebaeck:latest
depends_on:
init:
condition: service_completed_successfully

75
fetch.go Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// convenient wrapper to fetch some web content
type Fetcher struct {
Config *Config
Client *http.Client
Useragent string // FIXME: make configurable
}
func NewFetcher(c *Config) *Fetcher {
return &Fetcher{
Client: &http.Client{Transport: &loggingTransport{}}, // implemented in http.go
Useragent: Useragent, // default in config.go
Config: c,
}
}
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", f.Useragent)
res, err := f.Client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// fetch an image
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
slog.Debug("fetching ad image", "uri", uri)
body, err := f.Get(uri)
if err != nil {
if f.Config.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
return nil, nil
}
return nil, err
}
return body, nil
}

1
go.mod
View File

@@ -21,6 +21,7 @@ require (
github.com/andybalholm/cascadia v1.0.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/knadh/koanf/providers/env v0.1.0 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect

2
go.sum
View File

@@ -17,6 +17,8 @@ github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6OD
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=

129
http.go Normal file
View File

@@ -0,0 +1,129 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"io"
"log/slog"
"math"
"math/rand"
"net/http"
"time"
)
// I add an artificial "ID" to each HTTP request and the corresponding
// respose for debugging purposes so that the pair of them can be
// easier associated in debug output
var letters = []rune("ABCDEF0123456789")
func getid() string {
b := make([]rune, 8)
for i := range b {
b[i] = letters[rand.Intn(len(letters))]
}
return string(b)
}
// retry after HTTP 50x errors or err!=nil
const RetryCount = 3
// used to inject debug log and implement retries
type loggingTransport struct{}
// escalating timeout, $retry^2 seconds
func backoff(retries int) time.Duration {
return time.Duration(math.Pow(2, float64(retries))) * time.Second
}
// only retry in case of errors or certain non 200 HTTP codes
func shouldRetry(err error, resp *http.Response) bool {
if err != nil {
return true
}
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout {
return true
}
return false
}
// Body needs to be drained, otherwise we can't reuse the http.Response
func drainBody(resp *http.Response) {
if resp != nil {
if resp.Body != nil {
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
// unable to copy data? uff!
panic(err)
}
resp.Body.Close()
}
}
}
// the actual logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just requred for debugging
id := getid()
// clone the request body, put into request on retry
var bodyBytes []byte
if req.Body != nil {
bodyBytes, _ = io.ReadAll(req.Body)
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
// first try
resp, err := http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength)
}
// enter retry check and loop, if first req were successfull, leave loop immediately
retries := 0
for shouldRetry(err, resp) && retries < RetryCount {
time.Sleep(backoff(retries))
// consume any response to reuse the connection.
drainBody(resp)
// clone the request body again
if req.Body != nil {
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
// actual retry
resp, err = http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength, "retry", retries)
}
retries++
}
return resp, err
}

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands"
.TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -142,18 +142,19 @@
kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS"
.IX Header "SYNOPSYS"
.Vb 11
.Vb 12
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options:
\& \-\-user \-u <uid> Backup ads from user with uid <uid>.
\& \-\-debug \-d Enable debug output.
\& \-\-verbose \-v Enable verbose output.
\& \-\-outdir \-o <dir> Set output dir (default: current directory)
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all.
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-manual \-m Show manual.
\& \-\-help \-h Show usage.
\& \-\-version \-V Show program version.
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
\& \-d \-\-debug Enable debug output.
\& \-v \-\-verbose Enable verbose output.
\& \-o \-\-outdir <dir> Set output dir (default: current directory)
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-m \-\-manual Show manual.
\& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
@@ -218,6 +219,22 @@ directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
.SH "BUGS"
.IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests

View File

@@ -7,15 +7,16 @@ NAME
SYNOPSYS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
--user -u <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output.
--verbose -v Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual.
--help -h Show usage.
--version -V Show program version.
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
DESCRIPTION
This tool can be used to backup ads on the german ad page
@@ -73,6 +74,20 @@ SETUP
You can use the -v option to get verbose output or -d to enable
debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but commandline
flags take precedence over env!
BUGS
In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github:

View File

@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
--user -u <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output.
--verbose -v Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual.
--help -h Show usage.
--version -V Show program version.
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION
@@ -76,6 +77,23 @@ directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable
debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
=head1 BUGS
In order to report a bug, unexpected behavior, feature requests

11
main.go
View File

@@ -1,5 +1,5 @@
/*
Copyright © 2023 Thomas von Dein
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -101,8 +101,6 @@ func Main(w io.Writer) int {
slog.SetDefault(debuglogger)
}
// defaultlogger := log.Default()
// defaultlogger.SetOutput(w)
slog.Debug("config", "conf", conf)
// prepare output dir
@@ -111,17 +109,20 @@ func Main(w io.Writer) int {
return Die(err)
}
// used for all HTTP requests
fetch := NewFetcher(conf)
if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s]
for _, uri := range conf.Adlinks {
err := Scrape(conf, uri)
err := ScrapeAd(fetch, uri)
if err != nil {
return Die(err)
}
}
} else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline)
err := Start(conf)
err := ScrapeUser(fetch)
if err != nil {
return Die(err)
}

View File

@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
const (
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
INVALIDURI string = `https://foo.bar/weird/things`
@@ -228,6 +229,12 @@ var invalidtests = []Tests{
expect: "error loading config file",
exitcode: 1,
},
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
}
type AdConfig struct {
@@ -410,6 +417,12 @@ func InitInvalidSources() []Adsource {
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
status: 404,
},
{
// valid ad page but 503
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
status: 503,
},
}
return ads
@@ -438,16 +451,18 @@ func SetIntercept(ads []Adsource) {
// we just use 2 images, put this here
for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image)))
httpmock.RegisterResponder("GET", image,
httpmock.NewBytesResponder(200, GetImage(image)))
}
}
func VerifyAd(ad AdConfig) error {
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
ad.Category + ad.Condition + ad.Created
// prepare ad dir name using DefaultAdNameTemplate
c := Config{Adnametemplate: DefaultAdNameTemplate}
c := Config{Adnametemplate: "{{ .Slug }}"}
adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
addir, err := AdDirName(&c, &adstruct)
if err != nil {

View File

@@ -20,9 +20,7 @@ package main
import (
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"path/filepath"
"strings"
@@ -30,46 +28,21 @@ import (
"golang.org/x/sync/errgroup"
)
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", Useragent)
res, err := client.Do(req)
if err != nil {
return nil, err
}
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func Start(conf *Config) error {
client := &http.Client{}
func ScrapeUser(fetch *Fetcher) error {
adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
page := 1
uri := baseuri
slog.Info("fetching ad pages", "user", conf.User)
slog.Info("fetching ad pages", "user", fetch.Config.User)
for {
var index Index
slog.Debug("fetching page", "uri", uri)
body, err := Get(uri, client)
body, err := fetch.Get(uri)
if err != nil {
return err
}
@@ -96,12 +69,12 @@ func Start(conf *Config) error {
}
for i, adlink := range adlinks {
err := Scrape(conf, Baseuri+adlink)
err := ScrapeAd(fetch, Baseuri+adlink)
if err != nil {
return err
}
if conf.Limit > 0 && i == conf.Limit-1 {
if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break
}
}
@@ -110,8 +83,7 @@ func Start(conf *Config) error {
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(c *Config, uri string) error {
client := &http.Client{}
func ScrapeAd(fetch *Fetcher, uri string) error {
ad := &Ad{}
// extract slug and id from uri
@@ -124,7 +96,7 @@ func Scrape(c *Config, uri string) error {
// get the ad
slog.Debug("fetching ad page", "uri", uri)
body, err := Get(uri, client)
body, err := fetch.Get(uri)
if err != nil {
return err
}
@@ -148,30 +120,34 @@ func Scrape(c *Config, uri string) error {
slog.Debug("extracted ad listing", "ad", ad)
// write listing
addir, err := WriteAd(c, ad)
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
c.IncrAds()
fetch.Config.IncrAds()
return ScrapeImages(c, ad, addir)
return ScrapeImages(fetch, ad, addir)
}
func ScrapeImages(c *Config, ad *Ad, addir string) error {
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
err := Getimage(imguri, file)
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
err = WriteImage(file, body)
if err != nil {
return err
}
slog.Info("wrote ad image", "image", file)
return nil
})
@@ -182,28 +158,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
return err
}
c.IncrImgs(len(ad.Images))
return nil
}
// fetch an image
func Getimage(uri, fileName string) error {
slog.Debug("fetching ad image", "uri", uri)
response, err := http.Get(uri)
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return errors.New("could not get image via HTTP")
}
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}
fetch.Config.IncrImgs(len(ad.Images))
return nil
}