Compare commits

...

7 Commits

Author SHA1 Message Date
T.v.Dein
634d9a4140 Add expire support (#44) 2024-01-19 14:41:47 +01:00
T.v.Dein
96fb142423 Develop (#42)
* added docker image support
* added environment variable support
* added docker instructions
* added .env hint
* use new yadu log handler
* update modules
* bump version
2024-01-19 14:36:27 +01:00
T.v.Dein
39b064cc20 Feature/docker (#37) (#38)
* added docker image support
* added environment variable support
* added docker instructions
* added .env hint
2024-01-17 14:45:12 +01:00
T.v.Dein
3fd75fa53d refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00
T.v.Dein
78e5de61d2 Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
2024-01-16 13:20:16 +01:00
T.v.Dein
f4a9a9895c Enhancement/http (#32)
* added HTTP debug logging using `-d` or `DEBUGHTTP=1` (headers only)
2024-01-16 13:20:16 +01:00
T.v.Dein
ac5b0608d8 fix #30: revert default adnamedir to just use the slug as before (#31) 2024-01-16 13:20:16 +01:00
17 changed files with 536 additions and 138 deletions

28
.github/workflows/pushimage.yaml vendored Normal file
View File

@@ -0,0 +1,28 @@
name: build-push-image
on:
push:
tags:
- 'v*'
jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Log in to the Container registry
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
with:
registry: https://ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
push: true
tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name}}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ test
kleingebaeck kleingebaeck
releases releases
t/out t/out
.bak

27
Dockerfile Normal file
View File

@@ -0,0 +1,27 @@
FROM golang:1.21-alpine as builder
RUN apk update
RUN apk upgrade
RUN apk add --no-cache git make
RUN git --version
WORKDIR /work
COPY go.mod .
COPY . .
RUN go mod download
RUN make
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"
WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck
ENV KLEINGEBAECK_OUTDIR /backup
ENV LANG C.UTF-8
USER 1001:1001
ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

View File

@@ -94,19 +94,48 @@ installed - `make`.
To install after building either copy the binary or execute `sudo make install`. To install after building either copy the binary or execute `sudo make install`.
### Using the docker image
A pre-built docker image is available, which you can use to test the
app without installing it. You need `docker-compose`. Copy the file
`docker-compose.yaml` to somewhere, cd to that directory and execute:
```shell
mkdir kleinanzeigen-backup
USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose run kleingebaeck -u XXX -v
```
`USER_ID` and `GROUP_ID` needs to be specified so that you are the
owner of the created backups. The backup directory `OUTDIR` must exist
prior to the execution, otherwise docker will create it as root, then
kleingebaeck will fail. You may also use a `.env` file in the same
directory containing the variables, such as:
```
USER_ID=1000
GROUP_ID=1000
OUTDIR=./kleinanzeigen-backup
```
You may of course also modify the `docker-compose.yaml` to suit your needs.
If you want to build the image yourself, use the supplied Dockerfile.
## Commandline options: ## Commandline options:
``` ```
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more <ad-listing-url>'s are specified, only backup those, If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user. otherwise backup all ads of the given user.
@@ -126,6 +155,13 @@ loglevel = verbose
outdir = "test" outdir = "test"
``` ```
## Environment Variables
Kleingebaeck can also be configured using environment variables. Just prefix the config variables with `KLEINGEBAECK_` and put them to upper case. Eg:
```shell
% KLEINGEBAECK_OUTDIR=/backup kleingebaeck -v
```
## Usage ## Usage
To setup the tool, you need to lookup your userid on To setup the tool, you need to lookup your userid on

13
ad.go
View File

@@ -20,6 +20,7 @@ package main
import ( import (
"log/slog" "log/slog"
"strings" "strings"
"time"
) )
type Index struct { type Index struct {
@@ -37,6 +38,7 @@ type Ad struct {
Created string `goquery:"#viewad-extra-info,text"` Created string `goquery:"#viewad-extra-info,text"`
Text string `goquery:"p#viewad-description-text,html"` Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"` Images []string `goquery:".galleryimage-element img,[src]"`
Expire string
} }
// Used by slog to pretty print an ad // Used by slog to pretty print an ad
@@ -49,6 +51,8 @@ func (ad *Ad) LogValue() slog.Value {
slog.Int("bodysize", len(ad.Text)), slog.Int("bodysize", len(ad.Text)),
slog.String("categorytree", strings.Join(ad.CategoryTree, "+")), slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
slog.String("condition", ad.Condition), slog.String("condition", ad.Condition),
slog.String("created", ad.Created),
slog.String("expire", ad.Expire),
) )
} }
@@ -67,3 +71,12 @@ func (ad *Ad) Incomplete() bool {
return false return false
} }
func (ad *Ad) CalculateExpire() {
if len(ad.Created) > 0 {
ts, err := time.Parse("02.01.2006", ad.Created)
if err == nil {
ad.Expire = ts.AddDate(0, 2, 1).Format("02.01.2006")
}
}
}

View File

@@ -23,9 +23,11 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings"
"github.com/knadh/koanf/parsers/toml" "github.com/knadh/koanf/parsers/toml"
"github.com/knadh/koanf/providers/confmap" "github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file" "github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag" "github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2" "github.com/knadh/koanf/v2"
@@ -33,17 +35,19 @@ import (
) )
const ( const (
VERSION string = "0.1.1" VERSION string = "0.2.0"
Baseuri string = "https://www.kleinanzeigen.de" Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html" Listuri string = "/s-bestandsliste.html"
Defaultdir string = "." Defaultdir string = "."
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" + DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n" "Category: {{.Category}}\nCondition: {{.Condition}}\n" +
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" + DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" +
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n" "Category: {{.Category}}\r\nCondition: {{.Condition}}\r\n" +
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}-{{.Id}}" DefaultAdNameTemplate string = "{{.Slug}}"
) )
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
@@ -51,15 +55,16 @@ const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those, If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.` otherwise backup all ads of the given user.`
@@ -76,6 +81,7 @@ type Config struct {
Adnametemplate string `koanf:"adnametemplate"` Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"` Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"` Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
Adlinks []string Adlinks []string
StatsCountAds int StatsCountAds int
StatsCountImages int StatsCountImages int
@@ -160,7 +166,15 @@ func InitConfig(w io.Writer) (*Config, error) {
// else: we ignore the file if it doesn't exists // else: we ignore the file if it doesn't exists
} }
// command line overrides config file // env overrides config file
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string {
return strings.Replace(strings.ToLower(
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1)
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error())
}
// command line overrides env
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil { if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error()) return nil, errors.New("error loading flags: " + err.Error())
} }

22
docker-compose.yaml Normal file
View File

@@ -0,0 +1,22 @@
version: "3.9"
services:
init:
image: alpine:latest
user: "root"
group_add:
- '${GROUP_ID}'
volumes:
- ${OUTDIR}:/backup
command: chown -R ${USER_ID}:${USER_ID} /backup
kleingebaeck:
container_name: kleingebaeck
user: "${USER_ID}:${USER_ID}"
volumes:
- ${OUTDIR}:/backup
working_dir: /backup
build: .
image: kleingebaeck:latest
depends_on:
init:
condition: service_completed_successfully

75
fetch.go Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// convenient wrapper to fetch some web content
type Fetcher struct {
Config *Config
Client *http.Client
Useragent string // FIXME: make configurable
}
func NewFetcher(c *Config) *Fetcher {
return &Fetcher{
Client: &http.Client{Transport: &loggingTransport{}}, // implemented in http.go
Useragent: Useragent, // default in config.go
Config: c,
}
}
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", f.Useragent)
res, err := f.Client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// fetch an image
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
slog.Debug("fetching ad image", "uri", uri)
body, err := f.Get(uri)
if err != nil {
if f.Config.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
return nil, nil
}
return nil, err
}
return body, nil
}

13
go.mod
View File

@@ -7,25 +7,30 @@ require (
github.com/jarcoal/httpmock v1.3.1 github.com/jarcoal/httpmock v1.3.1
github.com/knadh/koanf/parsers/toml v0.1.0 github.com/knadh/koanf/parsers/toml v0.1.0
github.com/knadh/koanf/providers/confmap v0.1.0 github.com/knadh/koanf/providers/confmap v0.1.0
github.com/knadh/koanf/providers/env v0.1.0
github.com/knadh/koanf/providers/file v0.1.0 github.com/knadh/koanf/providers/file v0.1.0
github.com/knadh/koanf/providers/posflag v0.1.0 github.com/knadh/koanf/providers/posflag v0.1.0
github.com/knadh/koanf/v2 v2.0.1 github.com/knadh/koanf/v2 v2.0.1
github.com/lmittmann/tint v1.0.3 github.com/lmittmann/tint v1.0.4
github.com/mattn/go-isatty v0.0.20 github.com/mattn/go-isatty v0.0.20
github.com/spf13/pflag v1.0.5 github.com/spf13/pflag v1.0.5
github.com/tlinden/yadu v0.1.0
golang.org/x/sync v0.5.0 golang.org/x/sync v0.5.0
) )
require ( require (
github.com/PuerkitoBio/goquery v1.5.0 // indirect github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.0.0 // indirect github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.6.0 // indirect golang.org/x/sys v0.14.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
) )

28
go.sum
View File

@@ -1,12 +1,16 @@
astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw= astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw=
astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno= astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno=
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww= github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww=
@@ -17,14 +21,19 @@ github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6OD
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18= github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU= github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU= github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c= github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA= github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U= github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=
github.com/knadh/koanf/providers/posflag v0.1.0/go.mod h1:SYg03v/t8ISBNrMBRMlojH8OsKowbkXV7giIbBVgbz0= github.com/knadh/koanf/providers/posflag v0.1.0/go.mod h1:SYg03v/t8ISBNrMBRMlojH8OsKowbkXV7giIbBVgbz0=
github.com/knadh/koanf/v2 v2.0.1 h1:1dYGITt1I23x8cfx8ZnldtezdyaZtfAuRtIFOiRzK7g= github.com/knadh/koanf/v2 v2.0.1 h1:1dYGITt1I23x8cfx8ZnldtezdyaZtfAuRtIFOiRzK7g=
github.com/knadh/koanf/v2 v2.0.1/go.mod h1:ZeiIlIDXTE7w1lMT6UVcNiRAS2/rCeLn/GdLNvY1Dus= github.com/knadh/koanf/v2 v2.0.1/go.mod h1:ZeiIlIDXTE7w1lMT6UVcNiRAS2/rCeLn/GdLNvY1Dus=
github.com/lmittmann/tint v1.0.3 h1:W5PHeA2D8bBJVvabNfQD/XW9HPLZK1XoPZH0cq8NouQ= github.com/lmittmann/tint v1.0.4 h1:LeYihpJ9hyGvE0w+K2okPTGUdVLfng1+nDNVR4vWISc=
github.com/lmittmann/tint v1.0.3/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE= github.com/lmittmann/tint v1.0.4/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g= github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g=
@@ -45,18 +54,27 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355 h1:EmgK+IGUz2m42bFKteLY5SYJLn/CyBrz6nkgS22K8Bk=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
github.com/tlinden/yadu v0.1.0 h1:qtCi1jxg392qVRLFyrJ2LYu6/PiKSp1LT02EX+mNLME=
github.com/tlinden/yadu v0.1.0/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

129
http.go Normal file
View File

@@ -0,0 +1,129 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"io"
"log/slog"
"math"
"math/rand"
"net/http"
"time"
)
// I add an artificial "ID" to each HTTP request and the corresponding
// respose for debugging purposes so that the pair of them can be
// easier associated in debug output
var letters = []rune("ABCDEF0123456789")
func getid() string {
b := make([]rune, 8)
for i := range b {
b[i] = letters[rand.Intn(len(letters))]
}
return string(b)
}
// retry after HTTP 50x errors or err!=nil
const RetryCount = 3
// used to inject debug log and implement retries
type loggingTransport struct{}
// escalating timeout, $retry^2 seconds
func backoff(retries int) time.Duration {
return time.Duration(math.Pow(2, float64(retries))) * time.Second
}
// only retry in case of errors or certain non 200 HTTP codes
func shouldRetry(err error, resp *http.Response) bool {
if err != nil {
return true
}
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout {
return true
}
return false
}
// Body needs to be drained, otherwise we can't reuse the http.Response
func drainBody(resp *http.Response) {
if resp != nil {
if resp.Body != nil {
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
// unable to copy data? uff!
panic(err)
}
resp.Body.Close()
}
}
}
// the actual logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just requred for debugging
id := getid()
// clone the request body, put into request on retry
var bodyBytes []byte
if req.Body != nil {
bodyBytes, _ = io.ReadAll(req.Body)
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
// first try
resp, err := http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength)
}
// enter retry check and loop, if first req were successfull, leave loop immediately
retries := 0
for shouldRetry(err, resp) && retries < RetryCount {
time.Sleep(backoff(retries))
// consume any response to reuse the connection.
drainBody(resp)
// clone the request body again
if req.Body != nil {
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
// actual retry
resp, err = http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength, "retry", retries)
}
retries++
}
return resp, err
}

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands" .TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -142,18 +142,19 @@
kleingebaeck \- kleinanzeigen.de backup tool kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS" .SH "SYNOPSYS"
.IX Header "SYNOPSYS" .IX Header "SYNOPSYS"
.Vb 11 .Vb 12
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...] \& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options: \& Options:
\& \-\-user \-u <uid> Backup ads from user with uid <uid>. \& \-u \-\-user <uid> Backup ads from user with uid <uid>.
\& \-\-debug \-d Enable debug output. \& \-d \-\-debug Enable debug output.
\& \-\-verbose \-v Enable verbose output. \& \-v \-\-verbose Enable verbose output.
\& \-\-outdir \-o <dir> Set output dir (default: current directory) \& \-o \-\-outdir <dir> Set output dir (default: current directory)
\& \-\-limit \-l <num> Limit the ads to download to <num>, default: load all. \& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck). \& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-manual \-m Show manual. \& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-\-help \-h Show usage. \& \-m \-\-manual Show manual.
\& \-\-version \-V Show program version. \& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve .Ve
.SH "DESCRIPTION" .SH "DESCRIPTION"
.IX Header "DESCRIPTION" .IX Header "DESCRIPTION"
@@ -218,6 +219,22 @@ directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP .PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging. debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
.SH "BUGS" .SH "BUGS"
.IX Header "BUGS" .IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests

View File

@@ -7,15 +7,16 @@ NAME
SYNOPSYS SYNOPSYS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
DESCRIPTION DESCRIPTION
This tool can be used to backup ads on the german ad page This tool can be used to backup ads on the german ad page
@@ -73,6 +74,20 @@ SETUP
You can use the -v option to get verbose output or -d to enable You can use the -v option to get verbose output or -d to enable
debugging. debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but commandline
flags take precedence over env!
BUGS BUGS
In order to report a bug, unexpected behavior, feature requests or to In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github: submit a patch, please open an issue on github:

View File

@@ -6,15 +6,16 @@ kleingebaeck - kleinanzeigen.de backup tool
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
--user -u <uid> Backup ads from user with uid <uid>. -u --user <uid> Backup ads from user with uid <uid>.
--debug -d Enable debug output. -d --debug Enable debug output.
--verbose -v Enable verbose output. -v --verbose Enable verbose output.
--outdir -o <dir> Set output dir (default: current directory) -o --outdir <dir> Set output dir (default: current directory)
--limit -l <num> Limit the ads to download to <num>, default: load all. -l --limit <num> Limit the ads to download to <num>, default: load all.
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). -c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
--help -h Show usage. -m --manual Show manual.
--version -V Show program version. -h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION =head1 DESCRIPTION
@@ -76,6 +77,23 @@ directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable You can use the B<-v> option to get verbose output or B<-d> to enable
debugging. debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
=head1 BUGS =head1 BUGS
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests

18
main.go
View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -26,6 +26,7 @@ import (
"runtime/debug" "runtime/debug"
"github.com/lmittmann/tint" "github.com/lmittmann/tint"
"github.com/tlinden/yadu"
) )
const LevelNotice = slog.Level(2) const LevelNotice = slog.Level(2)
@@ -84,14 +85,14 @@ func Main(w io.Writer) int {
if conf.Debug { if conf.Debug {
// we're using a more verbose logger in debug mode // we're using a more verbose logger in debug mode
buildInfo, _ := debug.ReadBuildInfo() buildInfo, _ := debug.ReadBuildInfo()
opts := &tint.Options{ opts := &yadu.Options{
Level: logLevel, Level: logLevel,
AddSource: true, AddSource: true,
NoColor: IsNoTty(), //NoColor: IsNoTty(),
} }
logLevel.Set(slog.LevelDebug) logLevel.Set(slog.LevelDebug)
handler := tint.NewHandler(w, opts) handler := yadu.NewHandler(w, opts)
debuglogger := slog.New(handler).With( debuglogger := slog.New(handler).With(
slog.Group("program_info", slog.Group("program_info",
slog.Int("pid", os.Getpid()), slog.Int("pid", os.Getpid()),
@@ -101,8 +102,6 @@ func Main(w io.Writer) int {
slog.SetDefault(debuglogger) slog.SetDefault(debuglogger)
} }
// defaultlogger := log.Default()
// defaultlogger.SetOutput(w)
slog.Debug("config", "conf", conf) slog.Debug("config", "conf", conf)
// prepare output dir // prepare output dir
@@ -111,17 +110,20 @@ func Main(w io.Writer) int {
return Die(err) return Die(err)
} }
// used for all HTTP requests
fetch := NewFetcher(conf)
if len(conf.Adlinks) >= 1 { if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s] // directly backup ad listing[s]
for _, uri := range conf.Adlinks { for _, uri := range conf.Adlinks {
err := Scrape(conf, uri) err := ScrapeAd(fetch, uri)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
} }
} else if conf.User > 0 { } else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline) // backup all ads of the given user (via config or cmdline)
err := Start(conf) err := ScrapeUser(fetch)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }

View File

@@ -114,6 +114,7 @@ const EMPTYPAGE string = `DOCTYPE html>
const ( const (
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1` INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar` INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
INVALIDURI string = `https://foo.bar/weird/things` INVALIDURI string = `https://foo.bar/weird/things`
@@ -144,7 +145,13 @@ var tests = []Tests{
{ {
name: "debug", name: "debug",
args: base + " -d", args: base + " -d",
expect: "program_info", expect: "error: invalid or no user id or no ad link specified",
exitcode: 1,
},
{
name: "debug-check-programinfo",
args: base + " -d",
expect: "pid:",
exitcode: 1, exitcode: 1,
}, },
{ {
@@ -168,7 +175,7 @@ var tests = []Tests{
{ {
name: "download-single-ad-debug", name: "download-single-ad-debug",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -d", args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -d",
expect: "extracted ad listing program_info.pid=", expect: "DEBUG: extracted ad listing",
exitcode: 0, exitcode: 0,
}, },
{ {
@@ -228,6 +235,12 @@ var invalidtests = []Tests{
expect: "error loading config file", expect: "error loading config file",
exitcode: 1, exitcode: 1,
}, },
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
} }
type AdConfig struct { type AdConfig struct {
@@ -410,6 +423,12 @@ func InitInvalidSources() []Adsource {
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"), content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
status: 404, status: 404,
}, },
{
// valid ad page but 503
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
status: 503,
},
} }
return ads return ads
@@ -438,16 +457,18 @@ func SetIntercept(ads []Adsource) {
// we just use 2 images, put this here // we just use 2 images, put this here
for _, image := range []string{"t/1.jpg", "t/2.jpg"} { for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image))) httpmock.RegisterResponder("GET", image,
httpmock.NewBytesResponder(200, GetImage(image)))
} }
} }
func VerifyAd(ad AdConfig) error { func VerifyAd(ad AdConfig) error {
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
ad.Category + ad.Condition + ad.Created
// prepare ad dir name using DefaultAdNameTemplate // prepare ad dir name using DefaultAdNameTemplate
c := Config{Adnametemplate: DefaultAdNameTemplate} c := Config{Adnametemplate: "{{ .Slug }}"}
adstruct := Ad{Slug: ad.Slug, Id: ad.Id} adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
addir, err := AdDirName(&c, &adstruct) addir, err := AdDirName(&c, &adstruct)
if err != nil { if err != nil {

View File

@@ -20,9 +20,7 @@ package main
import ( import (
"errors" "errors"
"fmt" "fmt"
"io"
"log/slog" "log/slog"
"net/http"
"path/filepath" "path/filepath"
"strings" "strings"
@@ -30,46 +28,21 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
) )
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", Useragent)
res, err := client.Do(req)
if err != nil {
return nil, err
}
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func Start(conf *Config) error { func ScrapeUser(fetch *Fetcher) error {
client := &http.Client{}
adlinks := []string{} adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User) baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
page := 1 page := 1
uri := baseuri uri := baseuri
slog.Info("fetching ad pages", "user", conf.User) slog.Info("fetching ad pages", "user", fetch.Config.User)
for { for {
var index Index var index Index
slog.Debug("fetching page", "uri", uri) slog.Debug("fetching page", "uri", uri)
body, err := Get(uri, client) body, err := fetch.Get(uri)
if err != nil { if err != nil {
return err return err
} }
@@ -96,12 +69,12 @@ func Start(conf *Config) error {
} }
for i, adlink := range adlinks { for i, adlink := range adlinks {
err := Scrape(conf, Baseuri+adlink) err := ScrapeAd(fetch, Baseuri+adlink)
if err != nil { if err != nil {
return err return err
} }
if conf.Limit > 0 && i == conf.Limit-1 { if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break break
} }
} }
@@ -110,8 +83,7 @@ func Start(conf *Config) error {
} }
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(c *Config, uri string) error { func ScrapeAd(fetch *Fetcher, uri string) error {
client := &http.Client{}
ad := &Ad{} ad := &Ad{}
// extract slug and id from uri // extract slug and id from uri
@@ -124,7 +96,7 @@ func Scrape(c *Config, uri string) error {
// get the ad // get the ad
slog.Debug("fetching ad page", "uri", uri) slog.Debug("fetching ad page", "uri", uri)
body, err := Get(uri, client) body, err := fetch.Get(uri)
if err != nil { if err != nil {
return err return err
} }
@@ -145,33 +117,39 @@ func Scrape(c *Config, uri string) error {
return errors.New("could not extract ad data from page, got empty struct") return errors.New("could not extract ad data from page, got empty struct")
} }
ad.CalculateExpire()
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
// write listing // write listing
addir, err := WriteAd(c, ad) addir, err := WriteAd(fetch.Config, ad)
if err != nil { if err != nil {
return err return err
} }
c.IncrAds() fetch.Config.IncrAds()
return ScrapeImages(c, ad, addir) return ScrapeImages(fetch, ad, addir)
} }
func ScrapeImages(c *Config, ad *Ad, addir string) error { func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images // fetch images
img := 1 img := 1
g := new(errgroup.Group) g := new(errgroup.Group)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
imguri := imguri imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error { g.Go(func() error {
err := Getimage(imguri, file) body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
err = WriteImage(file, body)
if err != nil { if err != nil {
return err return err
} }
slog.Info("wrote ad image", "image", file)
return nil return nil
}) })
@@ -182,28 +160,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string) error {
return err return err
} }
c.IncrImgs(len(ad.Images)) fetch.Config.IncrImgs(len(ad.Images))
return nil
}
// fetch an image
func Getimage(uri, fileName string) error {
slog.Debug("fetching ad image", "uri", uri)
response, err := http.Get(uri)
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return errors.New("could not get image via HTTP")
}
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}
return nil return nil
} }