Compare commits

..

5 Commits

Author SHA1 Message Date
0726b95fd4 added docker instructions 2024-01-17 14:15:46 +01:00
21ca5c626f added environment variable support 2024-01-17 14:03:43 +01:00
7e2161d4d4 no bak 2024-01-17 13:58:21 +01:00
ef4d2dab16 added docker image support 2024-01-17 13:58:09 +01:00
T.v.Dein
3fd75fa53d refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00
15 changed files with 273 additions and 79 deletions

28
.github/workflows/pushimage.yaml vendored Normal file
View File

@@ -0,0 +1,28 @@
# Build the Docker image and push it to the GitHub Container Registry
# whenever a version tag (v*) is pushed.
name: build-push-image
on:
  push:
    tags:
      - 'v*'
jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      # Declaring any permission sets all unlisted ones to "none", so the
      # read access needed to fetch the repository is granted explicitly.
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Log in to the Container registry
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: https://ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push Docker image
        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
        with:
          push: true
          # The image tag equals the pushed git tag name.
          tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name }}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ test
kleingebaeck
releases
t/out
.bak

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
# Build stage: compile the kleingebaeck binary with the Go toolchain.
FROM golang:1.21-alpine AS builder

# git and make are needed by the Makefile-driven build.
RUN apk update && apk upgrade && apk add --no-cache git make
RUN git --version

WORKDIR /work

# Copy only the module files first so that the dependency download layer
# stays cached until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

COPY . .
RUN make

# Runtime stage: small image that contains only the compiled binary.
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"

#RUN install -o 1001 -g 1001 -d /data
WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck

# Default output directory, picked up via the KLEINGEBAECK_ env prefix.
ENV KLEINGEBAECK_OUTDIR=/backup
ENV LANG=C.UTF-8

# Run unprivileged.
USER 1001:1001

# Without arguments the container prints the usage message.
ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

View File

@@ -94,6 +94,26 @@ installed - `make`.
To install after building either copy the binary or execute `sudo make install`.
### Using the docker image
A pre-built docker image is available, which you can use to test the
app without installing it. You need `docker-compose`. Copy the file
`docker-compose.yaml` to somewhere, cd to that directory and execute:
```shell
mkdir kleinanzeigen-backup
USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose run kleingebaeck -u XXX -v
```
`USER_ID` and `GROUP_ID` need to be specified so that you are the
owner of the created backups. The backup directory must exist prior to
the execution; otherwise docker will create it as root and
kleingebaeck will fail.
You may of course also modify the `docker-compose.yaml` to suit your needs.
If you want to build the image yourself, use the supplied Dockerfile.
## Commandline options:
```
@@ -128,6 +148,13 @@ loglevel = verbose
outdir = "test"
```
## Environment Variables
Kleingebaeck can also be configured using environment variables. Just prefix the config variable names with `KLEINGEBAECK_` and convert them to upper case, e.g.:
```shell
% KLEINGEBAECK_OUTDIR=/backup kleingebaeck -v
```
## Usage
To set up the tool, you need to look up your user ID on

View File

@@ -23,9 +23,11 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"github.com/knadh/koanf/parsers/toml"
"github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2"
@@ -162,7 +164,15 @@ func InitConfig(w io.Writer) (*Config, error) {
// else: we ignore the file if it doesn't exist
}
// command line overrides config file
// env overrides config file
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string {
return strings.Replace(strings.ToLower(
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1)
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error())
}
// command line overrides env
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error())
}

22
docker-compose.yaml Normal file
View File

@@ -0,0 +1,22 @@
version: "3.9"
services:
  # One-shot helper: hands ownership of the backup directory to the
  # invoking user before the scraper starts.
  init:
    image: alpine:latest
    user: "root"
    group_add:
      - '${GROUP_ID}'
    volumes:
      - ${OUTDIR}:/backup
    # Use the caller's group id for the group part, not the user id.
    command: chown -R ${USER_ID}:${GROUP_ID} /backup
  kleingebaeck:
    container_name: kleingebaeck
    # Run as the invoking user and group so that created backups belong to them.
    user: "${USER_ID}:${GROUP_ID}"
    volumes:
      - ${OUTDIR}:/backup
    working_dir: /backup
    build: .
    image: kleingebaeck:latest
    # Start only after the ownership fix has finished successfully.
    depends_on:
      init:
        condition: service_completed_successfully

75
fetch.go Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// Fetcher is a convenience wrapper for fetching web content. It bundles
// the program configuration, the HTTP client and the User-Agent string
// that is sent with every request.
type Fetcher struct {
// Config is the program configuration; Getimage reads its
// IgnoreErrors setting.
Config *Config
// Client executes the HTTP requests.
Client *http.Client
// Useragent is the value sent in the User-Agent request header.
Useragent string // FIXME: make configurable
}
// NewFetcher returns a Fetcher that uses the configuration c, the
// default User-Agent string and an HTTP client whose transport is a
// loggingTransport.
func NewFetcher(c *Config) *Fetcher {
	// loggingTransport is implemented in http.go
	client := &http.Client{Transport: &loggingTransport{}}

	fetcher := new(Fetcher)
	fetcher.Config = c
	fetcher.Client = client
	fetcher.Useragent = Useragent // default in config.go

	return fetcher
}
// Get issues an HTTP GET request for uri, sending the fetcher's
// User-Agent header, and returns the response body.
//
// On success the caller owns the returned body and must close it. An
// error is returned if the request cannot be built, if the client fails
// or if the server answers with any status other than 200. In the
// latter case the body is closed here, because the caller never
// receives it.
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
	req, err := http.NewRequest(http.MethodGet, uri, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", f.Useragent)

	res, err := f.Client.Do(req)
	if err != nil {
		return nil, err
	}

	if res.StatusCode != http.StatusOK {
		// The body is not handed out on this path, so release it here to
		// avoid leaking the connection. A close error is irrelevant
		// because the request is reported as failed anyway.
		_ = res.Body.Close()

		return nil, errors.New("could not get page via HTTP")
	}

	return res.Body, nil
}
// Getimage fetches the image at uri via Get and returns its body.
//
// If the download fails and the configuration has IgnoreErrors set, the
// failure is logged and both return values are nil. Otherwise the error
// from Get is returned unchanged.
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
	slog.Debug("fetching ad image", "uri", uri)

	image, err := f.Get(uri)
	if err == nil {
		return image, nil
	}

	if !f.Config.IgnoreErrors {
		return nil, err
	}

	slog.Info("Failed to download image, error ignored", "error", err.Error())

	return nil, nil
}

1
go.mod
View File

@@ -21,6 +21,7 @@ require (
github.com/andybalholm/cascadia v1.0.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/knadh/koanf/providers/env v0.1.0 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect

2
go.sum
View File

@@ -17,6 +17,8 @@ github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6OD
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=

View File

@@ -27,6 +27,9 @@ import (
"time"
)
// I add an artificial "ID" to each HTTP request and the corresponding
// response for debugging purposes so that the pair of them can be
// associated more easily in debug output
var letters = []rune("ABCDEF0123456789")
func getid() string {
@@ -37,8 +40,10 @@ func getid() string {
return string(b)
}
// retry after HTTP 50x errors or err!=nil
const RetryCount = 3
// used to inject debug log and implement retries
type loggingTransport struct{}
// escalating timeout, $retry^2 seconds
@@ -75,7 +80,7 @@ func drainBody(resp *http.Response) {
}
}
// our logging transport with retries
// the actual logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just required for debugging
id := getid()

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-16" "1" "User Commands"
.TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -219,6 +219,22 @@ directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note that they take precedence over the config file, but
commandline flags take precedence over environment variables!
.SH "BUGS"
.IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests

View File

@@ -74,6 +74,20 @@ SETUP
You can use the -v option to get verbose output or -d to enable
debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note that they take precedence over the config file, but
commandline flags take precedence over environment variables!
BUGS
In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github:

View File

@@ -77,6 +77,23 @@ directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable
debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note that they take precedence over the config file, but
commandline flags take precedence over environment variables!
=head1 BUGS
In order to report a bug, unexpected behavior, feature requests

View File

@@ -22,7 +22,6 @@ import (
"fmt"
"io"
"log/slog"
"net/http"
"os"
"runtime/debug"
@@ -102,8 +101,6 @@ func Main(w io.Writer) int {
slog.SetDefault(debuglogger)
}
// defaultlogger := log.Default()
// defaultlogger.SetOutput(w)
slog.Debug("config", "conf", conf)
// prepare output dir
@@ -113,19 +110,19 @@ func Main(w io.Writer) int {
}
// used for all HTTP requests
client := &http.Client{Transport: &loggingTransport{}}
fetch := NewFetcher(conf)
if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s]
for _, uri := range conf.Adlinks {
err := ScrapeAd(conf, uri, client)
err := ScrapeAd(fetch, uri)
if err != nil {
return Die(err)
}
}
} else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline)
err := ScrapeUser(conf, client)
err := ScrapeUser(fetch)
if err != nil {
return Die(err)
}

View File

@@ -20,9 +20,7 @@ package main
import (
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"path/filepath"
"strings"
@@ -30,42 +28,21 @@ import (
"golang.org/x/sync/errgroup"
)
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", Useragent)
res, err := client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func ScrapeUser(conf *Config, client *http.Client) error {
func ScrapeUser(fetch *Fetcher) error {
adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
page := 1
uri := baseuri
slog.Info("fetching ad pages", "user", conf.User)
slog.Info("fetching ad pages", "user", fetch.Config.User)
for {
var index Index
slog.Debug("fetching page", "uri", uri)
body, err := Get(uri, client)
body, err := fetch.Get(uri)
if err != nil {
return err
}
@@ -92,12 +69,12 @@ func ScrapeUser(conf *Config, client *http.Client) error {
}
for i, adlink := range adlinks {
err := ScrapeAd(conf, Baseuri+adlink, client)
err := ScrapeAd(fetch, Baseuri+adlink)
if err != nil {
return err
}
if conf.Limit > 0 && i == conf.Limit-1 {
if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break
}
}
@@ -106,7 +83,7 @@ func ScrapeUser(conf *Config, client *http.Client) error {
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func ScrapeAd(c *Config, uri string, client *http.Client) error {
func ScrapeAd(fetch *Fetcher, uri string) error {
ad := &Ad{}
// extract slug and id from uri
@@ -119,7 +96,7 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {
// get the ad
slog.Debug("fetching ad page", "uri", uri)
body, err := Get(uri, client)
body, err := fetch.Get(uri)
if err != nil {
return err
}
@@ -143,26 +120,31 @@ func ScrapeAd(c *Config, uri string, client *http.Client) error {
slog.Debug("extracted ad listing", "ad", ad)
// write listing
addir, err := WriteAd(c, ad)
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
c.IncrAds()
fetch.Config.IncrAds()
return ScrapeImages(c, ad, addir, client)
return ScrapeImages(fetch, ad, addir)
}
func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
err := Getimage(c, imguri, file, client)
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
err = WriteImage(file, body)
if err != nil {
return err
}
@@ -176,39 +158,7 @@ func ScrapeImages(c *Config, ad *Ad, addir string, client *http.Client) error {
return err
}
c.IncrImgs(len(ad.Images))
fetch.Config.IncrImgs(len(ad.Images))
return nil
}
// fetch an image
func Getimage(c *Config, uri, fileName string, client *http.Client) error {
slog.Debug("fetching ad image", "uri", uri)
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
if c.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
}
return err
}
req.Header.Set("User-Agent", Useragent)
response, err := client.Do(req)
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return errors.New("could not get image via HTTP")
}
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}
slog.Info("wrote ad image", "image", fileName)
return nil
}