Compare commits

...

5 Commits

Author SHA1 Message Date
0726b95fd4 added docker instructions 2024-01-17 14:15:46 +01:00
21ca5c626f added environment variable support 2024-01-17 14:03:43 +01:00
7e2161d4d4 no bak 2024-01-17 13:58:21 +01:00
ef4d2dab16 added docker image support 2024-01-17 13:58:09 +01:00
T.v.Dein
3fd75fa53d refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00
15 changed files with 273 additions and 79 deletions

28
.github/workflows/pushimage.yaml vendored Normal file
View File

@@ -0,0 +1,28 @@
# Builds the Docker image and pushes it to the GitHub container
# registry whenever a tag starting with "v" is pushed.
name: build-push-image

on:
  push:
    tags:
      - 'v*'

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    # Declaring any permission sets every unlisted scope to "none", so the
    # read access needed to check out the repository is granted explicitly.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Log in to the Container registry
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: https://ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
        with:
          push: true
          # the image tag equals the pushed git tag
          tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name }}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ test
kleingebaeck kleingebaeck
releases releases
t/out t/out
.bak

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
# Build stage: compiles the kleingebaeck binary.
FROM golang:1.21-alpine AS builder

# make drives the build; git is presumably used by the Makefile (e.g. for
# version information) -- confirm. --no-cache keeps the apk index out of
# the layer, so a separate "apk update" is not needed.
RUN apk upgrade --no-cache && apk add --no-cache git make

RUN git --version

WORKDIR /work

# Copy only the module files first so that the dependency download layer
# stays cached until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

COPY . .
RUN make

# Runtime stage: contains only the compiled binary.
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"

#RUN install -o 1001 -g 1001 -d /data

WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck

# Default output directory, picked up through the KLEINGEBAECK_ environment prefix.
ENV KLEINGEBAECK_OUTDIR=/backup
ENV LANG=C.UTF-8

# run unprivileged
USER 1001:1001

ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

View File

@@ -94,6 +94,26 @@ installed - `make`.
To install after building either copy the binary or execute `sudo make install`. To install after building either copy the binary or execute `sudo make install`.
### Using the docker image
A pre-built docker image is available, which you can use to test the
app without installing it. You need `docker-compose`. Copy the file
`docker-compose.yaml` to somewhere, cd to that directory and execute:
```shell
mkdir kleinanzeigen-backup
USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose run kleingebaeck -u XXX -v
```
`USER_ID` and `GROUP_ID` need to be specified so that you are the
owner of the created backups. The backup directory must exist prior to
the execution, otherwise docker will create it as root, then
kleingebaeck will fail.
You may of course also modify the `docker-compose.yaml` to suit your needs.
If you want to build the image yourself, use the supplied Dockerfile.
## Commandline options: ## Commandline options:
``` ```
@@ -128,6 +148,13 @@ loglevel = verbose
outdir = "test" outdir = "test"
``` ```
## Environment Variables
Kleingebaeck can also be configured using environment variables. Just prefix the config variables with `KLEINGEBAECK_` and convert them to upper case, e.g.:
```shell
% KLEINGEBAECK_OUTDIR=/backup kleingebaeck -v
```
## Usage ## Usage
To setup the tool, you need to lookup your userid on To setup the tool, you need to lookup your userid on

View File

@@ -23,9 +23,11 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings"
"github.com/knadh/koanf/parsers/toml" "github.com/knadh/koanf/parsers/toml"
"github.com/knadh/koanf/providers/confmap" "github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file" "github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag" "github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2" "github.com/knadh/koanf/v2"
@@ -162,7 +164,15 @@ func InitConfig(w io.Writer) (*Config, error) {
// else: we ignore the file if it doesn't exists // else: we ignore the file if it doesn't exists
} }
// command line overrides config file // env overrides config file
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string {
return strings.Replace(strings.ToLower(
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1)
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error())
}
// command line overrides env
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil { if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error()) return nil, errors.New("error loading flags: " + err.Error())
} }

22
docker-compose.yaml Normal file
View File

@@ -0,0 +1,22 @@
version: "3.9"

services:
  # One-shot helper: runs as root and hands ownership of the mounted
  # backup directory to the invoking user and group.
  init:
    image: alpine:latest
    user: "root"
    group_add:
      - '${GROUP_ID}'
    volumes:
      - ${OUTDIR}:/backup
    # owner is USER_ID, group is GROUP_ID (previously the user id was
    # also used as the group id)
    command: chown -R ${USER_ID}:${GROUP_ID} /backup

  kleingebaeck:
    container_name: kleingebaeck
    # run with the caller's uid and gid so that created backups belong to them
    user: "${USER_ID}:${GROUP_ID}"
    volumes:
      - ${OUTDIR}:/backup
    working_dir: /backup
    build: .
    image: kleingebaeck:latest
    # start only after the ownership fix has finished successfully
    depends_on:
      init:
        condition: service_completed_successfully

75
fetch.go Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// Fetcher is a convenient wrapper to fetch some web content. It bundles
// the HTTP client, the User-Agent header value and the program
// configuration used by its methods.
type Fetcher struct {
	Config    *Config      // program configuration; Getimage consults Config.IgnoreErrors
	Client    *http.Client // client used to execute every request
	Useragent string       // value of the User-Agent request header. FIXME: make configurable
}
// NewFetcher returns a Fetcher bound to the given configuration. Its
// HTTP client uses the loggingTransport (implemented in http.go) and
// its User-Agent is the package default (defined in config.go).
func NewFetcher(c *Config) *Fetcher {
	client := &http.Client{Transport: &loggingTransport{}}

	fetcher := new(Fetcher)
	fetcher.Config = c
	fetcher.Client = client
	fetcher.Useragent = Useragent

	return fetcher
}
// Get fetches uri with an HTTP GET request, sending the Fetcher's
// User-Agent header, and returns the response body.
//
// On success the caller owns the returned body and must close it. An
// error is returned if the request cannot be built or executed, or if
// the server answers with any status other than 200.
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
	req, err := http.NewRequest(http.MethodGet, uri, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", f.Useragent)

	res, err := f.Client.Do(req)
	if err != nil {
		return nil, err
	}

	if res.StatusCode != http.StatusOK {
		// The body is not handed to the caller on this path, so it has
		// to be closed here, otherwise the connection is leaked.
		res.Body.Close()

		return nil, errors.New("could not get page via HTTP")
	}

	return res.Body, nil
}
// Getimage fetches the ad image at uri via Get and returns its body.
//
// If the download fails and Config.IgnoreErrors is set, the failure is
// only logged and (nil, nil) is returned, so callers have to check for
// a nil body. Otherwise the error from Get is returned unchanged.
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
	slog.Debug("fetching ad image", "uri", uri)

	image, err := f.Get(uri)

	switch {
	case err == nil:
		return image, nil
	case f.Config.IgnoreErrors:
		slog.Info("Failed to download image, error ignored", "error", err.Error())
		return nil, nil
	default:
		return nil, err
	}
}

1
go.mod
View File

@@ -21,6 +21,7 @@ require (
github.com/andybalholm/cascadia v1.0.0 // indirect github.com/andybalholm/cascadia v1.0.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/knadh/koanf/providers/env v0.1.0 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect

2
go.sum
View File

@@ -17,6 +17,8 @@ github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6OD
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18= github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU= github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU= github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c= github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA= github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U= github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=

View File

@@ -27,6 +27,9 @@ import (
"time" "time"
) )
// I add an artificial "ID" to each HTTP request and the corresponding
// response for debugging purposes so that the two can be associated
// more easily in the debug output
var letters = []rune("ABCDEF0123456789") var letters = []rune("ABCDEF0123456789")
func getid() string { func getid() string {
@@ -37,8 +40,10 @@ func getid() string {
return string(b) return string(b)
} }
// retry after HTTP 50x errors or err!=nil
const RetryCount = 3 const RetryCount = 3
// used to inject debug log and implement retries
type loggingTransport struct{} type loggingTransport struct{}
// escalating timeout, $retry^2 seconds // escalating timeout, $retry^2 seconds
@@ -75,7 +80,7 @@ func drainBody(resp *http.Response) {
} }
} }
// our logging transport with retries // the actual logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) { func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just required for debugging // just required for debugging
id := getid() id := getid()

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands" .TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -219,6 +219,22 @@ directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP .PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging. debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note that they take precedence over the config file, but
commandline flags take precedence over env!
.SH "BUGS" .SH "BUGS"
.IX Header "BUGS" .IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests

View File

@@ -74,6 +74,20 @@ SETUP
You can use the -v option to get verbose output or -d to enable You can use the -v option to get verbose output or -d to enable
debugging. debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note that they take precedence over the config file, but commandline
flags take precedence over env!
BUGS BUGS
In order to report a bug, unexpected behavior, feature requests or to In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github: submit a patch, please open an issue on github:

View File

@@ -77,6 +77,23 @@ directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable You can use the B<-v> option to get verbose output or B<-d> to enable
debugging. debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note that they take precedence over the config file, but
commandline flags take precedence over env!
=head1 BUGS =head1 BUGS
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests

View File

@@ -22,7 +22,6 @@ import (
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
"net/http"
"os" "os"
"runtime/debug" "runtime/debug"
@@ -102,8 +101,6 @@ func Main(w io.Writer) int {
slog.SetDefault(debuglogger) slog.SetDefault(debuglogger)
} }
// defaultlogger := log.Default()
// defaultlogger.SetOutput(w)
slog.Debug("config", "conf", conf) slog.Debug("config", "conf", conf)
// prepare output dir // prepare output dir
@@ -113,19 +110,19 @@ func Main(w io.Writer) int {
} }
// used for all HTTP requests // used for all HTTP requests
client := &http.Client{Transport: &loggingTransport{}} fetch := NewFetcher(conf)
if len(conf.Adlinks) >= 1 { if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s] // directly backup ad listing[s]
for _, uri := range conf.Adlinks { for _, uri := range conf.Adlinks {
err := ScrapeAd(conf, uri, client) err := ScrapeAd(fetch, uri)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
} }
} else if conf.User > 0 { } else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline) // backup all ads of the given user (via config or cmdline)
err := ScrapeUser(conf, client) err := ScrapeUser(fetch)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }

View File

@@ -20,9 +20,7 @@ package main
import ( import (
"errors" "errors"
"fmt" "fmt"
"io"
"log/slog" "log/slog"
"net/http"
"path/filepath" "path/filepath"
"strings" "strings"
@@ -30,42 +28,21 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
) )
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", Useragent)
res, err := client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func ScrapeUser(conf *Config, client *http.Client) error { func ScrapeUser(fetch *Fetcher) error {
adlinks := []string{} adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User) baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
page := 1 page := 1
uri := baseuri uri := baseuri
slog.Info("fetching ad pages", "user", conf.User) slog.Info("fetching ad pages", "user", fetch.Config.User)
for { for {
var index Index var index Index
slog.Debug("fetching page", "uri", uri) slog.Debug("fetching page", "uri", uri)
body, err := Get(uri, client) body, err := fetch.Get(uri)
if err != nil { if err != nil {
return err return err
} }
@@ -92,12 +69,12 @@ func ScrapeUser(conf *Config, client *http.Client) error {
} }
for i, adlink := range adlinks { for i, adlink := range adlinks {
err := ScrapeAd(conf, Baseuri+adlink, client) err := ScrapeAd(fetch, Baseuri+adlink)
if err != nil { if err != nil {
return err return err
} }
if conf.Limit > 0 && i == conf.Limit-1 { if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break break
} }
} }
@@ -106,7 +83,7 @@ func ScrapeUser(conf *Config, client *http.Client) error {
} }
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func ScrapeAd(c *Config, uri string, client *http.Client) error { func ScrapeAd(fetch *Fetcher, uri string) error {
ad := &Ad{} ad := &Ad{}
// extract slug and id from uri // extract slug and id from uri
@@ -119,7 +96,7 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {
// get the ad // get the ad
slog.Debug("fetching ad page", "uri", uri) slog.Debug("fetching ad page", "uri", uri)
body, err := Get(uri, client) body, err := fetch.Get(uri)
if err != nil { if err != nil {
return err return err
} }
@@ -143,26 +120,31 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
// write listing // write listing
addir, err := WriteAd(c, ad) addir, err := WriteAd(fetch.Config, ad)
if err != nil { if err != nil {
return err return err
} }
c.IncrAds() fetch.Config.IncrAds()
return ScrapeImages(c, ad, addir, client) return ScrapeImages(fetch, ad, addir)
} }
func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error { func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images // fetch images
img := 1 img := 1
g := new(errgroup.Group) g := new(errgroup.Group)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
imguri := imguri imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img)) file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error { g.Go(func() error {
err := Getimage(c, imguri, file, client) body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
err = WriteImage(file, body)
if err != nil { if err != nil {
return err return err
} }
@@ -176,39 +158,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
return err return err
} }
c.IncrImgs(len(ad.Images)) fetch.Config.IncrImgs(len(ad.Images))
return nil return nil
} }
// fetch an image
func Getimage(c *Config, uri, fileName string, client *http.Client) error {
slog.Debug("fetching ad image", "uri", uri)
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
if c.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
}
return err
}
req.Header.Set("User-Agent", Useragent)
response, err := client.Do(req)
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return errors.New("could not get image via HTTP")
}
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}
slog.Info("wrote ad image", "image", fileName)
return nil
}