Compare commits

..

3 Commits

Author SHA1 Message Date
bd9d8fdb2c fix version finding 2023-12-17 17:53:01 +01:00
T.v.Dein
1ee886c504 Merge pull request #2 from TLINDEN/dev
re-orgainzied code a little, using go templates instead format string
2023-12-17 17:49:27 +01:00
T.v.Dein
d7b13e8a9a Merge pull request #1 from TLINDEN/dev
added custom template support, added more ad data, use concurrency
2023-12-16 20:35:18 +01:00
46 changed files with 345 additions and 2104 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 139 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 199 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 263 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 232 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 90 KiB

View File

@@ -1,47 +0,0 @@
# CI workflow: builds and tests on every push and pull request across a
# Linux / Windows / macOS matrix, and runs golangci-lint on Linux.
name: build-and-test
on: [push, pull_request]
jobs:
  build:
    strategy:
      matrix:
        # Quoted on purpose: an unquoted Go version is a YAML float, so a
        # later bump to e.g. 1.20 would silently become 1.2.
        version: ['1.21']
        os: [ubuntu-latest, windows-latest, macos-latest]
    name: Build
    runs-on: ${{ matrix.os }}
    steps:
      - name: Set up Go
        uses: actions/setup-go@v3
        with:
          go-version: ${{ matrix.version }}
        id: go

      - name: checkout
        uses: actions/checkout@v3

      - name: build
        run: go build

      - name: test
        run: make test

      # The coverage report is only updated from the Linux leg of the matrix
      # and only for pushes; a failure here must not fail the build.
      - name: Update coverage report
        uses: ncruces/go-coverage-report@main
        with:
          report: true
          chart: true
          amend: true
        if: |
          matrix.os == 'ubuntu-latest' &&
          github.event_name == 'push'
        continue-on-error: true

  golangci:
    name: lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/setup-go@v3
        with:
          # Quoted for the same reason as the matrix version above.
          go-version: '1.21'
      - uses: actions/checkout@v3
      - name: golangci-lint
        uses: golangci/golangci-lint-action@v3

View File

@@ -1,28 +0,0 @@
# Release workflow: on every pushed tag matching v*, build the Docker image
# and push it to the GitHub container registry, tagged with the git tag name.
name: build-push-image
on:
  push:
    tags:
      - 'v*'
jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      # Listing any permission sets every unlisted one to "none", so the
      # read access needed by actions/checkout has to be granted explicitly.
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Log in to the Container registry
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: https://ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
        with:
          push: true
          tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name }}

5
.gitignore vendored
View File

@@ -1,8 +1,3 @@
test test
kleingebaeck kleingebaeck
releases releases
t/out
.bak
t/httproot/out
t/httproot/kleinanzeigen
t/httproot/favicon.ico

View File

@@ -1,27 +0,0 @@
# --- build stage: compile the kleingebaeck binary -------------------------
FROM golang:1.21-alpine AS builder

# Upgrade the base packages and install the build tools in a single layer;
# --no-cache fetches a fresh package index and leaves none behind.
RUN apk upgrade --no-cache && apk add --no-cache git make
RUN git --version

WORKDIR /work

# Copy only the module files first, so the dependency download below stays
# cached until go.mod or go.sum actually change.
COPY go.mod go.sum ./
RUN go mod download

# Now bring in the sources and build.
COPY . .
RUN make

# --- runtime stage: minimal image containing just the binary ---------------
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"

WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck

# Default output directory, picked up via the KLEINGEBAECK_ environment prefix.
ENV KLEINGEBAECK_OUTDIR=/backup
ENV LANG=C.UTF-8

# Run unprivileged.
USER 1001:1001

ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

View File

@@ -50,10 +50,9 @@ install: buildlocal
install -o $(UID) -g $(GID) -m 444 $(tool).1 $(PREFIX)/man/man1/ install -o $(UID) -g $(GID) -m 444 $(tool).1 $(PREFIX)/man/man1/
clean: clean:
rm -rf $(tool) coverage.out testdata t/out rm -rf $(tool) coverage.out testdata
test: clean test: clean
mkdir -p t/out
go test ./... $(ARGS) go test ./... $(ARGS)
testfuzzy: clean testfuzzy: clean
@@ -87,6 +86,3 @@ show-versions: buildlocal
@echo @echo
@echo "### go version used for building:" @echo "### go version used for building:"
@grep -m 1 go go.mod @grep -m 1 go go.mod
lint:
golangci-lint run -p bugs -p unused

166
README.md
View File

@@ -2,9 +2,8 @@
![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png) ![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png)
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck) [![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
[![Actions](https://github.com/tlinden/kleingebaeck/actions/workflows/ci.yaml/badge.svg)](https://github.com/tlinden/kleingebaeck/actions)
[![Go Coverage](https://github.com/tlinden/kleingebaeck/wiki/coverage.svg)](https://raw.githack.com/wiki/tlinden/kleingebaeck/coverage.html)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck) ![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest) [![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
@@ -16,126 +15,25 @@ directory, each ad into its own subdirectory. The backup will contain
a textfile `Adlisting.txt` which contains the ad contents as the a textfile `Adlisting.txt` which contains the ad contents as the
title, body, price etc. All images will be downloaded as well. title, body, price etc. All images will be downloaded as well.
## Screenshots
This is the index of my kleinanzeigen.de Account:
![Index](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-index.png)
Here I download my ads on the commandline:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-download.png)
And this is the backup directory after download:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-backup.png)
Here's a directory for one ad:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-ad.png)
**The same thing under windows:**
Downloading ads:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/cmd-windows.jpg)
Backup directory after download:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/liste-windows.jpg)
And one ad listing directory:
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/adlisting-windows.jpg)
## Installation
The tool doesn't need authentication and doesn't have any The tool doesn't need authentication and doesn't have any
dependencies. Just download the binary for your platform from the dependencies. Just download the binary for your platform from the
releases page and you're good to go. releases page and you're good to go.
### Installation using a pre-compiled binary The releases also include a handy tarball which you can use to install
the tool system-wide including the manual page. Just extract it and
Go to the [latest release type: `make install`.
page](https://github.com/TLINDEN/kleingebaeck/releases/latest) and
look for your OS and platform. There are two options to install the binary:
1. Directly download the binary for your platform,
e.g. `kleingebaeck-linux-amd64-0.0.5`, rename it to `kleingebaeck`
(or whatever you like more!) and put it into your bin dir
(e.g. `$HOME/bin` or as root to `/usr/local/bin`).
Be sure to verify the checksum of the binary file. For this, also download the matching `kleingebaeck-linux-amd64-0.0.5.sha256` file and run:
```shell
cat kleingebaeck-linux-amd64-0.0.5.sha256 && sha256sum kleingebaeck-linux-amd64-0.0.5
```
You should see the same SHA256 hash.
2. You may also download a binary tarball for your platform,
e.g. `kleingebaeck-linux-amd64-0.0.5.tar.gz`, unpack and install
it. GNU Make is required for this:
```shell
tar xvfz kleingebaeck-linux-amd64-0.0.5.tar.gz
cd kleingebaeck-linux-amd64-0.0.5
sudo make install
```
### Installation from source
You will need the Golang toolchain in order to build from source. GNU
Make will also help but is not strictly necessary.
If you want to compile the tool yourself, use `git clone` to clone the
repository. Then execute `go mod tidy` to install all
dependencies. Then just enter `go build` or - if you have GNU Make
installed - `make`.
To install after building either copy the binary or execute `sudo make install`.
### Using the docker image
A pre-built docker image is available, which you can use to test the
app without installing it. You need `docker-compose`. Copy the file
`docker-compose.yaml` to somewhere, cd to that directory and execute:
```shell
mkdir kleinanzeigen-backup
USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose run kleingebaeck -u XXX -v
```
`USER_ID` and `GROUP_ID` need to be specified so that you are the
owner of the created backups. The backup directory `OUTDIR` must exist
prior to the execution, otherwise docker will create it as root, then
kleingebaeck will fail. You may also use a `.env` file in the same
directory containing the variables, such as:
```
USER_ID=1000
GROUP_ID=1000
OUTDIR=./kleinanzeigen-backup
```
You may of course also modify the `docker-compose.yaml` to suit your needs.
If you want to build the image yourself, use the supplied Dockerfile.
## Commandline options: ## Commandline options:
``` ```
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
-u --user <uid> Backup ads from user with uid <uid>. --user,-u <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output. --debug, -d Enable debug output.
-v --verbose Enable verbose output. --verbose,-v Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory) --output-dir,-o <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all. --manual,-m Show manual.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck). --config,-c <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more <ad-listing-url>'s are specified, only backup those, If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user. otherwise backup all ads of the given user.
@@ -144,22 +42,16 @@ otherwise backup all ads of the given user.
## Configfile ## Configfile
You can create a config file to save typing. By default You can create a config file to save typing. By default
`~/.kleingebaeck` is being used but you can specify one with `~/.kleingebaeck.hcl` is being used but you can specify one with
`-c` as well. `-c` as well.
Format is simple: Format is simple:
``` ```
user = 1010101 user = 1010101
loglevel = verbose verbose = true
outdir = "test" outdir = "test"
``` template = ""
## Environment Variables
Kleingebaeck can also be configured using environment variables. Just prefix the config variables with `KLEINGEBAECK_` and put them to upper case. Eg:
```shell
% KLEINGEBAECK_OUTDIR=/backup kleingebaeck -v
``` ```
## Usage ## Usage
@@ -197,10 +89,6 @@ variable. The supplied sample config contains the default template.
All images will be stored in the same directory. All images will be stored in the same directory.
## Documentation
You can read the documentation [online](https://github.com/TLINDEN/kleingebaeck/blob/main/kleingebaeck.pod) or locally once you have installed kleingebaeck with: `kleingebaeck --manual`.
## Kleingebäck? ## Kleingebäck?
The name is derived from "kleinanzeigen backup": "klein" (german for The name is derived from "kleinanzeigen backup": "klein" (german for
@@ -220,34 +108,6 @@ https://github.com/TLINDEN/kleingebaeck/issues.
Please repeat the failing command with debugging enabled `-d` and Please repeat the failing command with debugging enabled `-d` and
include the output in the issue. include the output in the issue.
## Related projects
I could not find any projects specifically designed to backup
kleinanzeigen.de ads, however there's a bot project which is also able
to download ads:
[kleinanzeigen-bot](https://github.com/Second-Hand-Friends/kleinanzeigen-bot/). However,
be aware that kleinanzeigen.de is actively fighting bots! Look at this
[issue](https://github.com/Second-Hand-Friends/kleinanzeigen-bot/issues/219). The
problem with these kind of bots is, that they login into your account
using your credentials. If the company is able to detect bot activity
they can associate it easily with your account and **lock you
out**. So be careful.
**kleingebäck** doesn't need to login, it just accesses public
available web pages. Kleinanzeigen.de could hardly do anything against
it, first because it is legal. There's no difference between a browser
and a commandline client. Both run on the clientside and it is not
kleinanzeigen.de's decision which software one uses to access their
pages. And second: because you can use it to download any ads, not
just yours. So it is not really clear if the activity is associated in
any way with the ad owner. In addition to that comes the fact that
kleingebäck is just a backup tool. It is not intended to be used on a
daily basis. You cannot use it to view regular ads or maintain your
own ads. You'll need to use the mobile app or the browser page with a
login. So, in my point of view, the risk is very minimal.
There is another tool available named [kleinanzeigen-enhanced](https://kleinanzeigen-enhanced.de/). It is a complete ad management system targeting primarily commercial users. You have to pay a monthly fee; perhaps there's also a free version available, but I haven't checked. The tool is implemented as a Chrome browser extension, which explains why it was possible to implement it without an API. It seems to be a nice solution for power users by the looks of it. And it includes backups.
## Copyright und License ## Copyright und License
Licensed under the GNU GENERAL PUBLIC LICENSE version 3. Licensed under the GNU GENERAL PUBLIC LICENSE version 3.

82
ad.go
View File

@@ -1,82 +0,0 @@
/*
Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"log/slog"
"strings"
"time"
)
// Index collects link targets: the goquery struct tag selects the anchors
// below ".text-module-begin" and reads their href attribute.
// NOTE(review): presumably decoded with astuart.co/goq (listed in go.mod);
// the decoding call is not part of this file - confirm at the call site.
type Index struct {
Links []string `goquery:".text-module-begin a,[href]"`
}
// Ad holds the data of a single ad. Fields carrying a goquery struct tag are
// bound to the CSS selector named in that tag; the optional suffix after the
// comma (",text", ",html", ",[src]") names what is taken from the matched
// node. Slug, Id, Category and Expire carry no tag and are therefore not
// bound to a selector here.
// NOTE(review): Slug, Id and Category are presumably filled in by code
// outside this file - confirm. Expire is set by CalculateExpire.
type Ad struct {
Title string `goquery:"h1"`
Slug string
Id string
Condition string `goquery:".addetailslist--detail--value,text"`
Category string
CategoryTree []string `goquery:".breadcrump-link,text"`
Price string `goquery:"h2#viewad-price"`
// Created is parsed by CalculateExpire using the layout "02.01.2006".
Created string `goquery:"#viewad-extra-info,text"`
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Expire string
}
// LogValue renders an ad as a slog group so that logging an *Ad prints a
// compact summary instead of the whole struct: the image list and the body
// text are reduced to their sizes, the category tree is joined with "+".
func (ad *Ad) LogValue() slog.Value {
	categories := strings.Join(ad.CategoryTree, "+")

	attrs := []slog.Attr{
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
		slog.String("categorytree", categories),
		slog.String("condition", ad.Condition),
		slog.String("created", ad.Created),
		slog.String("expire", ad.Expire),
	}

	return slog.GroupValue(attrs...)
}
// Incomplete reports whether the ad lacks one of the fields treated as
// mandatory: Category, Created and Text. Images and the price are not
// checked (a price is considered optional, since ads for gifts have none).
//
// Mind the polarity: true means "the ad is incomplete", false means
// "the ad is complete".
func (ad *Ad) Incomplete() bool {
	return ad.Category == "" || ad.Created == "" || ad.Text == ""
}
// CalculateExpire derives Expire from Created: the creation date plus two
// months and one day, written in the same "02.01.2006" (day.month.year)
// layout. Expire is left untouched when Created is empty or cannot be parsed.
func (ad *Ad) CalculateExpire() {
	const layout = "02.01.2006"

	if ad.Created == "" {
		return
	}

	created, err := time.Parse(layout, ad.Created)
	if err != nil {
		// not a date in the expected layout, nothing to calculate
		return
	}

	ad.Expire = created.AddDate(0, 2, 1).Format(layout)
}

187
config.go
View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023-2024 Thomas von Dein Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -17,187 +17,48 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"errors"
"fmt"
"io"
"os" "os"
"path/filepath"
"runtime"
"strings"
"github.com/knadh/koanf/parsers/toml" "github.com/hashicorp/hcl/v2/hclsimple"
"github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2"
flag "github.com/spf13/pflag"
) )
const ( const (
VERSION string = "0.2.0" VERSION string = "0.0.4"
Baseuri string = "https://www.kleinanzeigen.de" Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html" Listuri string = "/s-bestandsliste.html"
Defaultdir string = "." Defaultdir string = "."
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" + DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\n" + "Category: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n"
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" +
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\n" +
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}"
) )
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options:
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.`
type Config struct { type Config struct {
Verbose bool `koanf:"verbose"` // loglevel=info Verbose bool `hcl:"verbose"`
Debug bool `koanf:"debug"` // loglevel=debug User int `hcl:"user"`
Showversion bool `koanf:"version"` // -v Outdir string `hcl:"outdir"`
Showhelp bool `koanf:"help"` // -h Template string `hcl:"template"`
Showmanual bool `koanf:"manual"` // -m
User int `koanf:"user"`
Outdir string `koanf:"outdir"`
Template string `koanf:"template"`
Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
ForceDownload bool `koanf:"force"`
Adlinks []string
StatsCountAds int
StatsCountImages int
} }
func (c *Config) IncrAds() { func ParseConfigfile(file string) (*Config, error) {
c.StatsCountAds++ c := Config{}
} if path, err := os.Stat(file); !os.IsNotExist(err) {
func (c *Config) IncrImgs(num int) {
c.StatsCountImages += num
}
// load commandline flags and config file
func InitConfig(w io.Writer) (*Config, error) {
var k = koanf.New(".")
// determine template based on os
template := DefaultTemplate
if runtime.GOOS == "windows" {
template = DefaultTemplateWin
}
// Load default values using the confmap provider.
if err := k.Load(confmap.Provider(map[string]interface{}{
"template": template,
"outdir": ".",
"loglevel": "notice",
"userid": 0,
"adnametemplate": DefaultAdNameTemplate,
}, "."), nil); err != nil {
return nil, err
}
// setup custom usage
f := flag.NewFlagSet("config", flag.ContinueOnError)
f.Usage = func() {
fmt.Fprintln(w, Usage)
os.Exit(0)
}
// parse commandline flags
f.StringP("config", "c", "", "config file")
f.StringP("outdir", "o", "", "directory where to store ads")
f.IntP("user", "u", 0, "user id")
f.IntP("limit", "l", 0, "limit ads to be downloaded (default 0, unlimited)")
f.BoolP("verbose", "v", false, "be verbose")
f.BoolP("debug", "d", false, "enable debug log")
f.BoolP("version", "V", false, "show program version")
f.BoolP("help", "h", false, "show usage")
f.BoolP("manual", "m", false, "show manual")
f.BoolP("force", "f", false, "force")
if err := f.Parse(os.Args[1:]); err != nil {
return nil, err
}
// generate a list of config files to try to load, including the
// one provided via -c, if any
var configfiles []string
configfile, _ := f.GetString("config")
home, _ := os.UserHomeDir()
if configfile != "" {
configfiles = []string{configfile}
} else {
configfiles = []string{
"/etc/kleingebaeck.conf", "/usr/local/etc/kleingebaeck.conf", // unix variants
filepath.Join(home, ".config", "kleingebaeck", "config"),
filepath.Join(home, ".kleingebaeck"),
"kleingebaeck.conf",
}
}
// Load the config file[s]
for _, cfgfile := range configfiles {
if path, err := os.Stat(cfgfile); !os.IsNotExist(err) {
if !path.IsDir() { if !path.IsDir() {
if err := k.Load(file.Provider(cfgfile), toml.Parser()); err != nil { configstring, err := os.ReadFile(file)
return nil, errors.New("error loading config file: " + err.Error()) if err != nil {
} return nil, err
}
}
// else: we ignore the file if it doesn't exists
} }
// env overrides config file err = hclsimple.Decode(
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string { path.Name(), configstring,
return strings.Replace(strings.ToLower( nil, &c,
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1) )
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error()) if err != nil {
return nil, err
}
}
} }
// command line overrides env return &c, nil
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error())
}
// fetch values
conf := &Config{}
if err := k.Unmarshal("", &conf); err != nil {
return nil, errors.New("error unmarshalling: " + err.Error())
}
// adjust loglevel
switch conf.Loglevel {
case "verbose":
conf.Verbose = true
case "debug":
conf.Debug = true
}
// are there any args left on commandline? if so threat them as adlinks
conf.Adlinks = f.Args()
return conf, nil
} }

View File

@@ -1,22 +0,0 @@
version: "3.9"
services:
  # One-shot helper: hands the backup directory over to the invoking user
  # before the actual tool starts.
  init:
    image: alpine:latest
    user: "root"
    group_add:
      - '${GROUP_ID}'
    volumes:
      - ${OUTDIR}:/backup
    # Owner is USER_ID, group is GROUP_ID (not USER_ID twice), so the backup
    # ends up owned by the caller's real uid:gid.
    command: chown -R ${USER_ID}:${GROUP_ID} /backup

  kleingebaeck:
    container_name: kleingebaeck
    # Run with the caller's uid and gid so created files belong to them.
    user: "${USER_ID}:${GROUP_ID}"
    volumes:
      - ${OUTDIR}:/backup
    working_dir: /backup
    build: .
    image: kleingebaeck:latest
    # Start only after the ownership fix above has finished successfully.
    depends_on:
      init:
        condition: service_completed_successfully

View File

@@ -1,6 +1,6 @@
# #
# kleingebaeck sample configuration file. # kleingebaeck sample configuration file.
# put this to ~/.kleingebaeck. # put this to ~/.kleingebaeck.hcl.
# #
# Comments start with the '#' character. # Comments start with the '#' character.
@@ -8,23 +8,12 @@
user = 00000000 user = 00000000
# enable verbose output (same as -v), may be true or false. # enable verbose output (same as -v), may be true or false.
# other values: notice or debug verbose = true
loglevel = "verbose"
# directory where to store downloaded ads. kleingebaeck will try to # directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string. # create it. must be a quoted string.
outdir = "test" outdir = "test"
# template for stored adlistings. To enable it, remove the comment # template. leave empty to use the default one, which is:
# chars up until the last #""" # "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n"
#template=""" template = ""
#Title: {{.Title}}
#Price: {{.Price}}
#Id: {{.Id}}
#Category: {{.Category}}
#Condition: {{.Condition}}
#Created: {{.Created}}
#{{.Text}}
# """

View File

@@ -1,75 +0,0 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// Fetcher is a convenience wrapper for fetching web content. It bundles the
// program configuration, the HTTP client used for all requests and the
// User-Agent header value sent with each of them.
type Fetcher struct {
Config *Config
Client *http.Client
Useragent string // FIXME: make configurable
}
// NewFetcher returns a Fetcher bound to the given configuration. It uses the
// package-wide default user agent and an HTTP client whose transport is the
// loggingTransport.
func NewFetcher(c *Config) *Fetcher {
	fetcher := new(Fetcher)

	fetcher.Config = c
	fetcher.Useragent = Useragent // default in config.go
	fetcher.Client = &http.Client{
		Transport: &loggingTransport{}, // implemented in http.go
	}

	return fetcher
}
// Get requests uri with the fetcher's User-Agent and returns the response
// body. Closing the returned body is the caller's responsibility.
//
// An error is returned when the request cannot be built or sent, or when
// the server answers with any status other than 200 OK. In the latter case
// the response body is closed here, because the caller never gets to see it
// and the underlying connection would otherwise leak.
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
	req, err := http.NewRequest(http.MethodGet, uri, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", f.Useragent)

	res, err := f.Client.Do(req)
	if err != nil {
		return nil, err
	}

	if res.StatusCode != http.StatusOK {
		// We are about to drop the response, so release its body.
		res.Body.Close()

		return nil, errors.New("could not get page via HTTP")
	}

	return res.Body, nil
}
// Getimage fetches an ad image and returns its body.
//
// If the download fails and IgnoreErrors is set in the config, the failure
// is logged and (nil, nil) is returned, so callers have to be prepared for a
// nil body without an error. Otherwise the error is passed on.
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
	slog.Debug("fetching ad image", "uri", uri)

	body, err := f.Get(uri)
	if err == nil {
		return body, nil
	}

	if !f.Config.IgnoreErrors {
		return nil, err
	}

	slog.Info("Failed to download image, error ignored", "error", err.Error())

	return nil, nil
}

45
go.mod
View File

@@ -3,36 +3,19 @@ module kleingebaeck
go 1.21 go 1.21
require ( require (
astuart.co/goq v1.0.0 astuart.co/goq v1.0.0 // indirect
github.com/jarcoal/httpmock v1.3.1 github.com/PuerkitoBio/goquery v1.5.0 // indirect
github.com/knadh/koanf/parsers/toml v0.1.0 github.com/agext/levenshtein v1.2.1 // indirect
github.com/knadh/koanf/providers/confmap v0.1.0 github.com/andybalholm/cascadia v1.0.0 // indirect
github.com/knadh/koanf/providers/env v0.1.0 github.com/apparentlymart/go-textseg/v13 v13.0.0 // indirect
github.com/knadh/koanf/providers/file v0.1.0 github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect
github.com/knadh/koanf/providers/posflag v0.1.0 github.com/google/go-cmp v0.3.1 // indirect
github.com/knadh/koanf/v2 v2.0.1 github.com/hashicorp/hcl/v2 v2.19.1 // indirect
github.com/lmittmann/tint v1.0.4 github.com/lmittmann/tint v1.0.3 // indirect
github.com/mattn/go-isatty v0.0.20 github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 // indirect
github.com/spf13/pflag v1.0.5 github.com/spf13/pflag v1.0.5 // indirect
github.com/tlinden/yadu v0.1.0 github.com/zclconf/go-cty v1.13.0 // indirect
golang.org/x/sync v0.5.0 golang.org/x/net v0.0.0-20190606173856-1492cefac77f // indirect
) golang.org/x/text v0.11.0 // indirect
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/corona10/goimagehash v1.1.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.14.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
) )

89
go.sum
View File

@@ -1,84 +1,37 @@
astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw= astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw=
astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno= astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno=
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/agext/levenshtein v1.2.1 h1:QmvMAjj2aEICytGiWzmxoE0x2KZvE0fvmqMOfy2tjT8=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/agext/levenshtein v1.2.1/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/apparentlymart/go-textseg/v13 v13.0.0 h1:Y+KvPE1NYz0xl601PVImeQfFyEy6iT90AvPUL1NNfNw=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/apparentlymart/go-textseg/v13 v13.0.0/go.mod h1:ZK2fH7c4NqDTLtiYLvIkEghdlcqw7yxLeM89kiTRPUo=
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI= github.com/apparentlymart/go-textseg/v15 v15.0.0 h1:uYvfpb3DyLSCGWnctWKGj857c6ew1u1fNQOlOtuGxQY=
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI= github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/google/go-cmp v0.3.1 h1:Xye71clBPdm5HgqGwUkwhbynsUJZhDbS20FvLhQ2izg=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/hashicorp/hcl/v2 v2.19.1 h1://i05Jqznmb2EXqa39Nsvyan2o5XyMowW5fnCKW5RPI=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/hashicorp/hcl/v2 v2.19.1/go.mod h1:ThLC89FV4p9MPW804KVbe/cEXoQ8NZEh+JtMeeGErHE=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= github.com/lmittmann/tint v1.0.3 h1:W5PHeA2D8bBJVvabNfQD/XW9HPLZK1XoPZH0cq8NouQ=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/lmittmann/tint v1.0.3/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww= github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 h1:DpOJ2HYzCv8LZP15IdmG+YdwD2luVPHITV96TkirNBM=
github.com/jarcoal/httpmock v1.3.1/go.mod h1:3yb8rc4BI7TCBhFY8ng0gjuLKJNquuDNiPaZjnENuYg= github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
github.com/knadh/koanf/maps v0.1.1 h1:G5TjmUh2D7G2YWf5SQQqSiHRJEjaicvU0KpypqB3NIs=
github.com/knadh/koanf/maps v0.1.1/go.mod h1:npD/QZY3V6ghQDdcQzl1W4ICNVTkohC8E73eI2xW4yI=
github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6ODgOub/6LCI=
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=
github.com/knadh/koanf/providers/posflag v0.1.0/go.mod h1:SYg03v/t8ISBNrMBRMlojH8OsKowbkXV7giIbBVgbz0=
github.com/knadh/koanf/v2 v2.0.1 h1:1dYGITt1I23x8cfx8ZnldtezdyaZtfAuRtIFOiRzK7g=
github.com/knadh/koanf/v2 v2.0.1/go.mod h1:ZeiIlIDXTE7w1lMT6UVcNiRAS2/rCeLn/GdLNvY1Dus=
github.com/lmittmann/tint v1.0.4 h1:LeYihpJ9hyGvE0w+K2okPTGUdVLfng1+nDNVR4vWISc=
github.com/lmittmann/tint v1.0.4/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g=
github.com/maxatome/go-testdeep v1.12.0/go.mod h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM=
github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw=
github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/zclconf/go-cty v1.13.0 h1:It5dfKTTZHe9aeppbNOda3mN7Ag7sg6QkBNm6TkyFa0=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/zclconf/go-cty v1.13.0/go.mod h1:YKQzy/7pZ7iq2jNFzy5go57xdxdWoLLpaEp4u238AE0=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355 h1:EmgK+IGUz2m42bFKteLY5SYJLn/CyBrz6nkgS22K8Bk=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
github.com/tlinden/yadu v0.1.0 h1:qtCi1jxg392qVRLFyrJ2LYu6/PiKSp1LT02EX+mNLME=
github.com/tlinden/yadu v0.1.0/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190606173856-1492cefac77f h1:IWHgpgFqnL5AhBUBZSgBdjl2vkQUEzcY+JNKWfcgAU0=
golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

129
http.go
View File

@@ -1,129 +0,0 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"io"
"log/slog"
"math"
"math/rand"
"net/http"
"time"
)
// Every HTTP request gets an artificial "ID" which is logged together
// with the request and with the corresponding response, so that both
// can be matched up easily in debug output.

// letters is the alphabet the debug IDs are drawn from.
var letters = []rune("ABCDEF0123456789")

// getid returns a random debug ID of 8 characters taken from letters.
func getid() string {
	const idLength = 8

	id := make([]rune, idLength)
	for pos := 0; pos < idLength; pos++ {
		id[pos] = letters[rand.Intn(len(letters))]
	}

	return string(id)
}
// RetryCount is the maximum number of retries RoundTrip performs
// after the first attempt. A retry happens when the transport
// returned an error or the server answered with HTTP 502, 503 or 504
// (see shouldRetry).
const RetryCount = 3
// loggingTransport is a stateless type whose RoundTrip method wraps
// http.DefaultTransport in order to inject debug logging and to
// implement retries.
type loggingTransport struct{}
// backoff returns the escalating delay to wait before the next
// attempt: 2^retries seconds, i.e. 1s, 2s, 4s, ... for 0, 1, 2, ...
// retries already made.
func backoff(retries int) time.Duration {
	factor := math.Pow(2, float64(retries))

	return time.Second * time.Duration(factor)
}
// only retry in case of errors or certain non 200 HTTP codes
func shouldRetry(err error, resp *http.Response) bool {
if err != nil {
return true
}
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout {
return true
}
return false
}
// Body needs to be drained, otherwise we can't reuse the http.Response
func drainBody(resp *http.Response) {
if resp != nil {
if resp.Body != nil {
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
// unable to copy data? uff!
panic(err)
}
resp.Body.Close()
}
}
}
// RoundTrip is the actual logging transport with retries. It sends
// the request through http.DefaultTransport, logs request and
// response at debug level and repeats the request up to RetryCount
// times, sleeping backoff(retries) before each retry, for as long as
// shouldRetry asks for it.
//
// A request body is read into memory completely up front, so that it
// can be replayed on every attempt. If reading it fails, the error is
// returned and nothing is sent.
//
// Returns the response and error of the last attempt made.
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	// just required for debugging
	id := getid()

	// clone the request body, put into request on retry
	var bodyBytes []byte
	if req.Body != nil {
		payload, err := io.ReadAll(req.Body)

		// the original body has been consumed (or is broken), the
		// transport is responsible for closing it
		req.Body.Close()

		if err != nil {
			// with an incomplete body we would send (and replay) a
			// truncated request
			return nil, err
		}

		bodyBytes = payload
		req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
	}

	slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)

	// first try
	resp, err := http.DefaultTransport.RoundTrip(req)
	if err == nil {
		slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
			"contentlength", resp.ContentLength)
	}

	// enter retry check and loop, if the first request was successful,
	// leave the loop immediately
	retries := 0
	for shouldRetry(err, resp) && retries < RetryCount {
		time.Sleep(backoff(retries))

		// consume any response to reuse the connection.
		drainBody(resp)

		// the previous attempt consumed the body, provide a fresh copy
		if req.Body != nil {
			req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
		}

		// actual retry
		resp, err = http.DefaultTransport.RoundTrip(req)
		if err == nil {
			slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
				"contentlength", resp.ContentLength, "retry", retries)
		}

		retries++
	}

	return resp, err
}

142
image.go
View File

@@ -1,142 +0,0 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"image/jpeg"
"log/slog"
"os"
"path/filepath"
"github.com/corona10/goimagehash"
)
// MaxDistance is the exclusive upper bound for the hash distance of
// two images: Similar considers them to be the same picture only if
// their distance is below this value.
const MaxDistance = 3
// Image bundles the data of one image together with its hash.
type Image struct {
// file name as handed to NewImage
Filename string
// set by CalcHash, nil until then
Hash *goimagehash.ImageHash
// raw image data, decoded as JPEG by CalcHash
Data *bytes.Buffer
// uri as handed to NewImage (ReadImages passes an empty string)
Uri string
}
// LogValue controls how an Image is rendered by slog: only file name,
// uri and hash are logged as a group, the raw Data is left out.
func (img *Image) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("filename", img.Filename),
		slog.String("uri", img.Uri),
		slog.String("hash", img.Hash.ToString()),
	}

	return slog.GroupValue(attrs...)
}
// Cache holds the hashes of all known images of an ad (the hashes
// only, not the image data). ReadImages builds it from the JPEG files
// of an ad directory, SimilarExists compares an image against it.
type Cache []*goimagehash.ImageHash
// NewImage returns an Image for the given data, file name and uri.
// The Hash is not computed here, call CalcHash for that.
func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
	return &Image{
		Data:     buf,
		Filename: filename,
		Uri:      uri,
	}
}
// CalcHash decodes Data as JPEG, calculates the difference hash of
// the picture and stores it in Hash.
//
// Data is decoded through a separate reader over the same bytes.
// Decoding directly from the buffer would drain it as a side effect
// and leave Data empty for whoever uses it afterwards (e.g. to write
// the image to disk).
//
// Returns the error of the JPEG decoder or of the hash calculation,
// in which case Hash is left untouched.
func (img *Image) CalcHash() error {
	decoded, err := jpeg.Decode(bytes.NewReader(img.Data.Bytes()))
	if err != nil {
		return err
	}

	hash, err := goimagehash.DifferenceHash(decoded)
	if err != nil {
		return err
	}

	img.Hash = hash

	return nil
}
// Similar reports whether the image is close enough to the given hash
// to be considered the same picture, i.e. whether the distance between
// both hashes is below MaxDistance. If the distance cannot be
// computed, the images are treated as different.
func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
	distance, err := img.Hash.Distance(hash)
	if err != nil {
		slog.Debug("failed to compute diff hash distance", "error", err)
		return false
	}

	if distance >= MaxDistance {
		return false
	}

	slog.Debug("distance computation", "image-A", img.Hash.ToString(),
		"image-B", hash.ToString(), "distance", distance)

	return true
}
// SimilarExists reports whether at least one hash in the cache is
// similar to the current image. The search stops at the first match.
func (img *Image) SimilarExists(cache Cache) bool {
	for idx := 0; idx < len(cache); idx++ {
		if img.Similar(cache[idx]) {
			return true
		}
	}

	return false
}
// ReadImages reads all JPEG files (.jpg, .jpeg, .JPG, .JPEG) located
// directly in the ad directory addir, computes their diff hashes and
// returns them as a Cache. Subdirectories and other files are ignored.
//
// If dont is set (forced download, -f given), the directory is still
// listed but an empty Cache is returned.
//
// Any error while listing the directory, reading an image or hashing
// it aborts the whole run and is returned together with a nil Cache.
func ReadImages(addir string, dont bool) (Cache, error) {
	entries, err := os.ReadDir(addir)
	if err != nil {
		return nil, err
	}

	cache := Cache{}

	if dont {
		// forced download, -f given
		return cache, nil
	}

	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}

		switch filepath.Ext(entry.Name()) {
		case ".jpg", ".jpeg", ".JPG", ".JPEG":
			// a JPEG file, process it below
		default:
			continue
		}

		path := filepath.Join(addir, entry.Name())

		data, err := ReadImage(path)
		if err != nil {
			return nil, err
		}

		img := NewImage(data, path, "")
		if err := img.CalcHash(); err != nil {
			return nil, err
		}

		slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())

		cache = append(cache, img.Hash)
	}

	return cache, nil
}

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands" .TH KLEINGEBAECK 1 "2023-12-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -142,20 +142,16 @@
kleingebaeck \- kleinanzeigen.de backup tool kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS" .SH "SYNOPSYS"
.IX Header "SYNOPSYS" .IX Header "SYNOPSYS"
.Vb 10 .Vb 9
\& This is kleingebaeck, the kleinanzeigen.de backup tool.
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...] \& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options: \& Options:
\& \-u \-\-user <uid> Backup ads from user with uid <uid>. \& \-\-user,\-u <uid> Backup ads from user with uid <uid>.
\& \-d \-\-debug Enable debug output. \& \-\-debug, \-d Enable debug output.
\& \-v \-\-verbose Enable verbose output. \& \-\-verbose,\-v Enable verbose output.
\& \-o \-\-outdir <dir> Set output dir (default: current directory) \& \-\-output\-dir,\-o <dir> Set output dir (default: current directory)
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all. \& \-\-manual,\-m Show manual.
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck). \& \-\-config,\-c <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-f \-\-force Download images even if they already exist.
\& \-m \-\-manual Show manual.
\& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve .Ve
.SH "DESCRIPTION" .SH "DESCRIPTION"
.IX Header "DESCRIPTION" .IX Header "DESCRIPTION"
@@ -168,41 +164,26 @@ title, body, price etc. All images will be downloaded as well.
.SH "CONFIGURATION" .SH "CONFIGURATION"
.IX Header "CONFIGURATION" .IX Header "CONFIGURATION"
You can create a config file to save typing. By default You can create a config file to save typing. By default
\&\f(CW\*(C`~/.kleingebaeck\*(C'\fR is being used but you can specify one with \f(CW\*(C`\-c\*(C'\fR as \&\f(CW\*(C`~/.kleingebaeck.hcl\*(C'\fR is being used but you can specify one with
well. We use \s-1TOML\s0 as our configuration language. See \&\f(CW\*(C`\-c\*(C'\fR as well.
<https://toml.io/en/>.
.PP .PP
Format is pretty simple: Format is simple:
.PP .PP
.Vb 10 .Vb 4
\& user = 1010101 \& user = 1010101
\& loglevel = verbose \& verbose = true
\& outdir = "test" \& outdir = "test"
\& template = """ \& template = ""
\& Title: {{.Title}}
\& Price: {{.Price}}
\& Id: {{.Id}}
\& Category: {{.Category}}
\& Condition: {{.Condition}}
\& Created: {{.Created}}
\&
\& {{.Text}}
\& """
.Ve .Ve
.PP .PP
Be carefull if you want to change the template. The variable is a Be carefull if you want to change the template. The default one looks like this:
multiline string surrounded by three double quotes. You can leave out
certain fields and use any formatting you like. Refer to
<https://pkg.go.dev/text/template> for details on how to write a
template.
.PP
If you're on windows and want to customize the output directory, put
it into single quotes to avoid the backslashes interpreted as escape
chars like this:
.PP .PP
.Vb 1 .Vb 1
\& outdir = \*(AqC:\eData\eAds\*(Aq \& Title: {{.Title}}\enPrice: {{.Price}}\enId: {{.Id}}\enCategory: {{.Category}}\enCondition: {{.Condition}}\enCreated: {{.Created}}\en\en{{.Text}}\en
.Ve .Ve
.PP
You can left out certain fields and use any formatting you like. Refer
to <https://pkg.go.dev/text/template> for details how to write a template.
.SH "SETUP" .SH "SETUP"
.IX Header "SETUP" .IX Header "SETUP"
To setup the tool, you need to lookup your userid on To setup the tool, you need to lookup your userid on
@@ -220,22 +201,6 @@ directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP .PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging. debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
.SH "BUGS" .SH "BUGS"
.IX Header "BUGS" .IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests
@@ -254,20 +219,7 @@ Also there's currently no parallelization implemented. This will
change in the future. change in the future.
.SH "LICENSE" .SH "LICENSE"
.IX Header "LICENSE" .IX Header "LICENSE"
Copyright 2023\-2024 Thomas von Dein Licensed under the \s-1GNU GENERAL PUBLIC LICENSE\s0 version 3.
.PP
This program is free software: you can redistribute it and/or modify
it under the terms of the \s-1GNU\s0 General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
.PP
This program is distributed in the hope that it will be useful,
but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE.\s0 See the
\&\s-1GNU\s0 General Public License for more details.
.PP
You should have received a copy of the \s-1GNU\s0 General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.SH "Author" .SH "Author"
.IX Header "Author" .IX Header "Author"
T.v.Dein <tom \s-1AT\s0 vondein \s-1DOT\s0 org> T.v.Dein <tom \s-1AT\s0 vondein \s-1DOT\s0 org>

View File

@@ -5,19 +5,15 @@ NAME
kleingebaeck - kleinanzeigen.de backup tool kleingebaeck - kleinanzeigen.de backup tool
SYNOPSYS SYNOPSYS
This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
-u --user <uid> Backup ads from user with uid <uid>. --user,-u <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output. --debug, -d Enable debug output.
-v --verbose Enable verbose output. --verbose,-v Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory) --output-dir,-o <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all. --manual,-m Show manual.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck). --config,-c <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
DESCRIPTION DESCRIPTION
This tool can be used to backup ads on the german ad page This tool can be used to backup ads on the german ad page
@@ -30,36 +26,24 @@ DESCRIPTION
CONFIGURATION CONFIGURATION
You can create a config file to save typing. By default You can create a config file to save typing. By default
"~/.kleingebaeck" is being used but you can specify one with "-c" as "~/.kleingebaeck.hcl" is being used but you can specify one with "-c" as
well. We use TOML as our configuration language. See well.
<https://toml.io/en/>.
Format is pretty simple: Format is simple:
user = 1010101 user = 1010101
loglevel = verbose verbose = true
outdir = "test" outdir = "test"
template = """ template = ""
Title: {{.Title}}
Price: {{.Price}}
Id: {{.Id}}
Category: {{.Category}}
Condition: {{.Condition}}
Created: {{.Created}}
{{.Text}} Be carefull if you want to change the template. The default one looks
"""
Be careful if you want to change the template. The variable is a
multiline string surrounded by three double quotes. You can leave out
certain fields and use any formatting you like. Refer to
<https://pkg.go.dev/text/template> for details on how to write a template.
If you're on windows and want to customize the output directory, put it
into single quotes to avoid the backslashes interpreted as escape chars
like this: like this:
outdir = 'C:\Data\Ads' Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n
You can left out certain fields and use any formatting you like. Refer
to <https://pkg.go.dev/text/template> for details how to write a
template.
SETUP SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de. To setup the tool, you need to lookup your userid on kleinanzeigen.de.
@@ -75,20 +59,6 @@ SETUP
You can use the -v option to get verbose output or -d to enable You can use the -v option to get verbose output or -d to enable
debugging. debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but commandline
flags take precedence over env!
BUGS BUGS
In order to report a bug, unexpected behavior, feature requests or to In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github: submit a patch, please open an issue on github:
@@ -106,20 +76,7 @@ LIMITATIONS
in the future. in the future.
LICENSE LICENSE
Copyright 2023-2024 Thomas von Dein Licensed under the GNU GENERAL PUBLIC LICENSE version 3.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
Author Author
T.v.Dein <tom AT vondein DOT org> T.v.Dein <tom AT vondein DOT org>

View File

@@ -4,19 +4,15 @@ kleingebaeck - kleinanzeigen.de backup tool
=head1 SYNOPSYS =head1 SYNOPSYS
This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options: Options:
-u --user <uid> Backup ads from user with uid <uid>. --user,-u <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output. --debug, -d Enable debug output.
-v --verbose Enable verbose output. --verbose,-v Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory) --output-dir,-o <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all. --manual,-m Show manual.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck). --config,-c <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION =head1 DESCRIPTION
@@ -30,37 +26,22 @@ title, body, price etc. All images will be downloaded as well.
=head1 CONFIGURATION =head1 CONFIGURATION
You can create a config file to save typing. By default You can create a config file to save typing. By default
C<~/.kleingebaeck> is being used but you can specify one with C<-c> as C<~/.kleingebaeck.hcl> is being used but you can specify one with
well. We use TOML as our configuration language. See C<-c> as well.
L<https://toml.io/en/>.
Format is pretty simple: Format is simple:
user = 1010101 user = 1010101
loglevel = verbose verbose = true
outdir = "test" outdir = "test"
template = """ template = ""
Title: {{.Title}}
Price: {{.Price}}
Id: {{.Id}}
Category: {{.Category}}
Condition: {{.Condition}}
Created: {{.Created}}
{{.Text}} Be carefull if you want to change the template. The default one looks like this:
"""
Be carefull if you want to change the template. The variable is a Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n
multiline string surrounded by three double quotes. You can leave out
certain fields and use any formatting you like. Refer to
L<https://pkg.go.dev/text/template> for details on how to write a
template.
If you're on windows and want to customize the output directory, put You can left out certain fields and use any formatting you like. Refer
it into single quotes to avoid the backslashes interpreted as escape to L<https://pkg.go.dev/text/template> for details how to write a template.
chars like this:
outdir = 'C:\Data\Ads'
=head1 SETUP =head1 SETUP
@@ -78,23 +59,6 @@ directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable You can use the B<-v> option to get verbose output or B<-d> to enable
debugging. debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
=head1 BUGS =head1 BUGS
In order to report a bug, unexpected behavior, feature requests In order to report a bug, unexpected behavior, feature requests
@@ -115,20 +79,7 @@ change in the future.
=head1 LICENSE =head1 LICENSE
Copyright 2023-2024 Thomas von Dein Licensed under the GNU GENERAL PUBLIC LICENSE version 3.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=head1 Author =head1 Author

124
main.go
View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023-2024 Thomas von Dein Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -20,22 +20,34 @@ package main
import ( import (
"errors" "errors"
"fmt" "fmt"
"io"
"log/slog" "log/slog"
"os" "os"
"runtime/debug" "runtime/debug"
"github.com/lmittmann/tint" "github.com/lmittmann/tint"
"github.com/tlinden/yadu" flag "github.com/spf13/pflag"
) )
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
--user,-u <uid> Backup ads from user with uid <uid>.
--debug, -d Enable debug output.
--verbose,-v Enable verbose output.
--output-dir,-o <dir> Set output dir (default: current directory)
--manual,-m Show manual.
--config,-c <file> Use config file <file> (default: ~/.kleingebaeck).
If one or more <ad-listing-url>'s are specified, only backup those,
otherwise backup all ads of the given user.`
const LevelNotice = slog.Level(2) const LevelNotice = slog.Level(2)
func main() { func main() {
os.Exit(Main(os.Stdout)) os.Exit(Main())
} }
func Main(w io.Writer) int { func Main() int {
logLevel := &slog.LevelVar{} logLevel := &slog.LevelVar{}
opts := &tint.Options{ opts := &tint.Options{
Level: logLevel, Level: logLevel,
@@ -47,30 +59,44 @@ func Main(w io.Writer) int {
} }
return a return a
}, },
NoColor: IsNoTty(),
} }
logLevel.Set(LevelNotice) logLevel.Set(LevelNotice)
handler := tint.NewHandler(w, opts) var handler slog.Handler = tint.NewHandler(os.Stdout, opts)
logger := slog.New(handler) logger := slog.New(handler)
slog.SetDefault(logger) slog.SetDefault(logger)
conf, err := InitConfig(w) showversion := false
if err != nil { showhelp := false
return Die(err) showmanual := false
} enabledebug := false
enableverbose := false
uid := 0
configfile := os.Getenv("HOME") + "/.kleingebaeck.hcl"
dir := ""
if conf.Showversion { flag.BoolVarP(&enabledebug, "debug", "d", false, "debug mode")
fmt.Fprintf(w, "This is kleingebaeck version %s\n", VERSION) flag.BoolVarP(&enableverbose, "verbose", "v", false, "be verbose")
flag.BoolVarP(&showversion, "version", "V", false, "show version")
flag.BoolVarP(&showhelp, "help", "h", false, "show usage")
flag.BoolVarP(&showmanual, "manual", "m", false, "show manual")
flag.IntVarP(&uid, "user", "u", uid, "user id")
flag.StringVarP(&dir, "output-dir", "o", dir, "where to store ads")
flag.StringVarP(&configfile, "config", "c", configfile, "config file")
flag.Parse()
if showversion {
fmt.Printf("This is kleingebaeck version %s\n", VERSION)
return 0 return 0
} }
if conf.Showhelp { if showhelp {
fmt.Fprintln(w, Usage) fmt.Println(Usage)
return 0 return 0
} }
if conf.Showmanual { if showmanual {
err := man() err := man()
if err != nil { if err != nil {
return Die(err) return Die(err)
@@ -78,21 +104,25 @@ func Main(w io.Writer) int {
return 0 return 0
} }
if conf.Verbose { conf, err := ParseConfigfile(configfile)
if err != nil {
return Die(err)
}
if enableverbose || conf.Verbose {
logLevel.Set(slog.LevelInfo) logLevel.Set(slog.LevelInfo)
} }
if conf.Debug { if enabledebug {
// we're using a more verbose logger in debug mode // we're using a more verbose logger in debug mode
buildInfo, _ := debug.ReadBuildInfo() buildInfo, _ := debug.ReadBuildInfo()
opts := &yadu.Options{ opts := &tint.Options{
Level: logLevel, Level: logLevel,
AddSource: true, AddSource: true,
//NoColor: IsNoTty(),
} }
logLevel.Set(slog.LevelDebug) logLevel.Set(slog.LevelDebug)
handler := yadu.NewHandler(w, opts) var handler slog.Handler = tint.NewHandler(os.Stdout, opts)
debuglogger := slog.New(handler).With( debuglogger := slog.New(handler).With(
slog.Group("program_info", slog.Group("program_info",
slog.Int("pid", os.Getpid()), slog.Int("pid", os.Getpid()),
@@ -104,42 +134,50 @@ func Main(w io.Writer) int {
slog.Debug("config", "conf", conf) slog.Debug("config", "conf", conf)
if len(dir) == 0 {
if len(conf.Outdir) > 0 {
dir = conf.Outdir
} else {
dir = Defaultdir
}
}
// prepare output dir // prepare output dir
err = Mkdir(conf.Outdir) err = Mkdir(dir)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
// used for all HTTP requests // which template to use
fetch := NewFetcher(conf) template := DefaultTemplate
if len(conf.Template) > 0 {
template = conf.Template
}
if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s] // directly backup ad listing[s]
for _, uri := range conf.Adlinks { if len(flag.Args()) >= 1 {
err := ScrapeAd(fetch, uri) for _, uri := range flag.Args() {
err := Scrape(uri, dir, template)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
} }
} else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline)
err := ScrapeUser(fetch)
if err != nil {
return Die(err)
}
} else {
return Die(errors.New("invalid or no user id or no ad link specified"))
}
if conf.StatsCountAds > 0 { return 0
adstr := "ads" }
if conf.StatsCountAds == 1 {
adstr = "ad" // backup all ads of the given user (via config or cmdline)
if uid == 0 && conf.User > 0 {
uid = conf.User
}
if uid > 0 {
err := Start(fmt.Sprintf("%d", uid), dir, template)
if err != nil {
return Die(err)
} }
fmt.Fprintf(w, "Successfully downloaded %d %s with %d images to %s.\n",
conf.StatsCountAds, adstr, conf.StatsCountImages, conf.Outdir)
} else { } else {
fmt.Fprintf(w, "No ads found.") return Die(errors.New("invalid or no user id specified"))
} }
return 0 return 0

View File

@@ -1,555 +0,0 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"errors"
"fmt"
"os"
"strings"
"testing"
tpl "text/template"
"github.com/jarcoal/httpmock"
)
// LISTTPL is the mock HTML for the ad list page, aka:
// https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
// Note that this HTML code is stripped down to the bare minimum, so
// that it only contains the markup required to satisfy goquery.
// It is a text/template which ranges over the data it is executed
// with and renders one "/s-anzeige/<Slug>/<Id>" link per entry.
const LISTTPL string = `<!DOCTYPE html>
<html lang="de" >
<head>
<title>Ads</title>
</head>
<body>
{{ range . }}
<h2 class="text-module-begin">
<a class="ellipsis"
href="/s-anzeige/{{ .Slug }}/{{ .Id }}">{{ .Title }}</a>
</h2>
{{ end }}
</body>
</html>
`
// ADTPL is the mock HTML for an actual ad listing, aka:
// https://www.kleinanzeigen.de/s-anzeige/ad-text-slug/1010101010
// Note that this HTML code is stripped down to the bare minimum, so
// that it only contains the markup required to satisfy goquery.
// It is a text/template which gets executed with a single AdConfig
// (fields used: Category, Images, Title, Price, Created, Condition
// and Text).
//
// The document starts with a proper "<!DOCTYPE html>" declaration
// (like LISTTPL); without the leading "<!" the declaration would be
// parsed as stray text content of the page.
const ADTPL string = `<!DOCTYPE html>
<html lang="de">
<head>
<title>Ad Listing</title>
</head>
<body>
<div class="l-container-row">
<div id="vap-brdcrmb" class="breadcrump">
<a class="breadcrump-link" itemprop="url" href="/" title="Kleinanzeigen ">
<span itemprop="title">Kleinanzeigen </span>
</a>
<a class="breadcrump-link" itemprop="url" href="/egal">
<span itemprop="title">{{ .Category }}</span></a>
</div>
</div>
{{ range $image := .Images }}
<div class="galleryimage-element" data-ix="3">
<img src="{{ $image }}"/>
</div>
{{ end }}
<h1 id="viewad-title" class="boxedarticle--title" itemprop="name" data-soldlabel="Verkauft">
{{ .Title }}</h1>
<div class="boxedarticle--flex--container">
<h2 class="boxedarticle--price" id="viewad-price">
{{ .Price }}</h2>
</div>
<div id="viewad-extra-info" class="boxedarticle--details--full">
<div><i class="icon icon-small icon-calendar-gray-simple"></i><span>{{ .Created }}</span></div>
</div>
<div class="splitlinebox l-container-row" id="viewad-details">
<ul class="addetailslist">
<li class="addetailslist--detail">
Zustand<span class="addetailslist--detail--value" >
{{ .Condition }}</span>
</li>
</ul>
</div>
<div class="l-container last-paragraph-no-margin-bottom">
<p id="viewad-description-text" class="text-force-linebreak " itemprop="description">
{{ .Text }}
</p>
</div>
</body>
</html>
`
// EMPTYPAGE is a syntactically valid HTML page without any ad markup.
// It is served for an ad uri in order to test how the scraper copes
// with a page it cannot extract any ad data from.
//
// The document starts with a proper "<!DOCTYPE html>" declaration;
// without the leading "<!" the declaration would end up as text
// content of the otherwise empty page.
const EMPTYPAGE string = `<!DOCTYPE html>
<html lang="de">
<head></head>
<body></body>
</html>
`
// URIs used as command line arguments by the invalidtests cases. The
// matching mock responders are registered by InitInvalidSources().
// NOTE(review): the responders are built from Baseuri while these are
// spelled out; presumably Baseuri is https://www.kleinanzeigen.de - confirm.
const (
// ad uri which is answered with EMPTYPAGE
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
// ad uri which is answered with HTTP status 503
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
// uri with a path not starting with /s-anzeige/, answered with an empty page
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
// uri which is answered with HTTP status 404
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
// uri on a foreign host
INVALIDURI string = `https://foo.bar/weird/things`
)
// base is the command line prefix shared by most test cases: the
// program name followed by -c with a config file below t/.
// NOTE(review): presumably this keeps the tests from picking up a
// config file in the user's home directory - confirm.
var base = "kleingebaeck -c t/config-empty.conf"
// Tests describes one command line test case as executed by
// TestMain() and TestMainInvalids().
type Tests struct {
// name of the test case, only used in failure messages
name string
// complete command line; split on single spaces and assigned to os.Args
args string
// substring which must be contained in the output written by Main()
expect string
// expected return value of Main()
exitcode int
}
// tests are the command line test cases executed by TestMain() against
// the mock responders set up from InitValidSources(). The download
// cases write into t/out, which is checked afterwards using VerifyAd().
var tests = []Tests{
{
name: "version",
args: base + " -V",
expect: "This is",
exitcode: 0,
},
{
name: "help",
args: base + " -h",
expect: "Usage:",
exitcode: 0,
},
{
name: "debug",
args: base + " -d",
expect: "error: invalid or no user id or no ad link specified",
exitcode: 1,
},
{
name: "debug-check-programinfo",
args: base + " -d",
expect: "pid:",
exitcode: 1,
},
{
name: "no-args-no-user",
args: base,
expect: "invalid or no user id",
exitcode: 1,
},
{
name: "download-single-ad",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1",
expect: "Successfully downloaded 1 ad with 2 images to t/out",
exitcode: 0,
},
{
name: "download-single-ad-verbose",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -v",
expect: "wrote ad listing",
exitcode: 0,
},
{
name: "download-single-ad-debug",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -d",
expect: "DEBUG: extracted ad listing",
exitcode: 0,
},
{
name: "download-all-ads",
args: base + " -o t/out -u 1",
expect: "Successfully downloaded 6 ads with 12 images to t/out",
exitcode: 0,
},
{
// does not use base: user and outdir come from the config file
name: "download-all-ads-using-config",
args: "kleingebaeck -c t/fullconfig.conf",
expect: "Successfully downloaded 6 ads with 12 images to t/out",
exitcode: 0,
},
}
// invalidtests are the command line test cases executed by
// TestMainInvalids() against the mock responders set up from
// InitInvalidSources(). Every case is expected to fail with exit
// code 1 and a specific error message.
var invalidtests = []Tests{
{
name: "empty-ad",
args: base + " " + EMPTYURI,
expect: "could not extract ad data from page, got empty struct",
exitcode: 1,
},
{
name: "invalid-ad",
args: base + " " + INVALIDURI,
expect: "invalid uri",
exitcode: 1,
},
{
name: "invalid-path",
args: base + " " + INVALIDPATHURI,
expect: "could not extract ad data from page, got empty struct",
exitcode: 1,
},
{
name: "404",
args: base + " " + INVALID404URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
{
// output directory below a non-existing parent directory
name: "outdir-no-exists",
args: base + " -o t/foo/bar/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -v",
expect: "Failure",
exitcode: 1,
},
{
name: "wrong-flag",
args: base + " -X",
expect: "unknown shorthand flag: 'X' in -X",
exitcode: 1,
},
{
// does not use base: the config file itself is the test subject
name: "no-config",
args: "kleingebaeck -c t/invalid.conf",
expect: "error loading config file",
exitcode: 1,
},
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
}
// AdConfig holds the data of one mock ad. It is used to render the
// HTML templates LISTTPL (Slug, Id, Title) and ADTPL (all remaining
// fields) and to verify the downloaded result in VerifyAd().
type AdConfig struct {
Title string
// Slug and Id form the ad uri: /s-anzeige/<Slug>/<Id>
Slug string
Id string
Price string
Category string
Condition string
Created string
Text string
Images []string // files in ./t/
}
// adsrc contains the six mock ads of user 1. InitValidSources()
// serves them as two listing pages with three ads each, plus one page
// per ad. Every ad refers to the same two images.
// NOTE(review): the titles "Secnd Ad" and "Forth Ad" look like typos;
// VerifyAd() compares against this very data, so they do no harm.
var adsrc = []AdConfig{
{
Title: "First Ad",
Id: "1", Price: "5€",
Category: "Klimbim",
Text: "Thing to sale",
Slug: "first-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Secnd Ad",
Id: "2", Price: "5€",
Category: "Kram",
Text: "Thing to sale",
Slug: "second-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Third Ad",
Id: "3",
Price: "5€",
Category: "Kuddelmuddel",
Text: "Thing to sale",
Slug: "third-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Forth Ad",
Id: "4",
Price: "5€",
Category: "Krempel",
Text: "Thing to sale",
Slug: "fourth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Fifth Ad",
Id: "5",
Price: "5€",
Category: "Kladderadatsch",
Text: "Thing to sale",
Slug: "fifth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Sixth Ad",
Id: "6",
Price: "5€",
Category: "Klunker",
Text: "Thing to sale",
Slug: "sixth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
}
// An Adsource is used to construct a httpmock responder for a
// particular url. So, the code (scrape.go) scrapes
// https://kleinanzeigen.de, but in reality httpmock captures the
// request and responds with our mock data.
type Adsource struct {
// uri the responder is registered for (GET requests)
uri string
// response body
content string
// HTTP status code of the response; SetIntercept() turns 0 into 200
status int
}
// GetTemplate renders htmltemplate and returns the resulting HTML.
//
// The data handed to the template depends on a: if a.Id is empty, the
// template is executed with the list l (ad listing page), otherwise
// with the single ad a (ad page). Panics if the template cannot be
// parsed or executed.
func GetTemplate(l []AdConfig, a AdConfig, htmltemplate string) string {
	// Must() panics with the parser error, if any
	parsed := tpl.Must(tpl.New("template").Parse(htmltemplate))

	// select the template data: a single ad or the whole list
	var data interface{} = a
	if a.Id == "" {
		data = l
	}

	rendered := bytes.Buffer{}
	if err := parsed.Execute(&rendered, data); err != nil {
		panic(err)
	}

	return rendered.String()
}
// InitValidSources returns the mock sources for the successful test
// cases: three listing pages for user 1 (two pages with three ads
// each followed by an empty one) and one ad page for every entry of
// adsrc.
func InitValidSources() []Adsource {
	// an AdConfig without Id signals GetTemplate() to render a listing
	listing := AdConfig{}

	// content of the listing pages 1-3, the last one is empty
	pages := [][]AdConfig{
		{adsrc[0], adsrc[1], adsrc[2]},
		{adsrc[3], adsrc[4], adsrc[5]},
		{},
	}

	// uris of the listing pages, same order as pages
	pageuris := []string{
		fmt.Sprintf("%s%s?userId=1", Baseuri, Listuri),
		fmt.Sprintf("%s%s?userId=1&pageNum=2", Baseuri, Listuri),
		fmt.Sprintf("%s%s?userId=1&pageNum=3", Baseuri, Listuri),
	}

	sources := []Adsource{}

	// listing pages first
	for idx, uri := range pageuris {
		sources = append(sources, Adsource{
			uri:     uri,
			content: GetTemplate(pages[idx], listing, LISTTPL),
		})
	}

	// followed by the ads themselves
	for _, ad := range adsrc {
		aduri := fmt.Sprintf("%s/s-anzeige/%s/%s", Baseuri, ad.Slug, ad.Id)
		sources = append(sources, Adsource{
			uri:     aduri,
			content: GetTemplate(nil, ad, ADTPL),
		})
	}

	return sources
}
// InitInvalidSources returns the mock sources for the failing test
// cases: pages without ad content, a foreign host and pages answered
// with an HTTP error status.
func InitInvalidSources() []Adsource {
	// none of the pages uses template variables, so each one is
	// rendered with an empty AdConfig and no list
	render := func(html string) string {
		return GetTemplate(nil, AdConfig{}, html)
	}

	return []Adsource{
		// valid ad page but without content
		{
			uri:     fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri),
			content: render(EMPTYPAGE),
		},
		// some random foreign webpage
		{
			uri:     INVALIDURI,
			content: render("<html>foo</html>"),
		},
		// some invalid page path
		{
			uri:     fmt.Sprintf("%s/anzeige/name/1", Baseuri),
			content: render("<html></html>"),
		},
		// a non-ad page, answered with 404
		{
			uri:     fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri),
			content: render("<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
			status:  404,
		},
		// valid ad page path, but answered with 503
		{
			uri:     fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
			content: render("<html>HTTP 503: service unavailable</html>"),
			status:  503,
		},
	}
}
// GetImage reads the test image stored at path and returns its raw
// content. Panics if the file cannot be read.
func GetImage(path string) []byte {
	raw, err := os.ReadFile(path)
	if err == nil {
		return raw
	}

	panic(err)
}
// SetIntercept registers a httpmock GET responder for every given
// source and for the two test images.
func SetIntercept(ads []Adsource) {
	for _, source := range ads {
		// sources without explicit status are answered with 200
		code := source.status
		if code == 0 {
			code = 200
		}

		responder := httpmock.NewStringResponder(code, source.content)
		httpmock.RegisterResponder("GET", source.uri, responder)
	}

	// all ads refer to the same 2 images, so register them once here
	for _, path := range [...]string{"t/1.jpg", "t/2.jpg"} {
		responder := httpmock.NewBytesResponder(200, GetImage(path))
		httpmock.RegisterResponder("GET", path, responder)
	}
}
// VerifyAd checks whether the listing file written for ad below t/out
// contains the expected content. Returns an error if the ad directory
// name cannot be determined, the listing file cannot be read or the
// content differs.
func VerifyAd(ad AdConfig) error {
	// determine the ad directory name, using a fixed name template
	// which only consists of the slug
	addir, err := AdDirName(
		&Config{Adnametemplate: "{{ .Slug }}"},
		&Ad{Slug: ad.Slug, Id: ad.Id},
	)
	if err != nil {
		return err
	}

	content, err := os.ReadFile(fmt.Sprintf("t/out/%s/Adlisting.txt", addir))
	if err != nil {
		return err
	}

	// the expected listing: all fields concatenated, the category is
	// prefixed with the "Kleinanzeigen" root entry of the mock page
	expected := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
		ad.Category + ad.Condition + ad.Created

	if strings.TrimSpace(string(content)) == expected {
		return nil
	}

	return errors.New(fmt.Sprintf("ad content doesn't match.\nExpect: %s\n Got: %s\n",
		expected, content))
}
// TestMain runs Main() once for every entry of tests, with all HTTP
// requests being answered by the httpmock responders set up from
// InitValidSources(). For each case the exit code and the output
// written by Main() are checked. Afterwards the ads downloaded to
// t/out are compared to the mock data using VerifyAd().
func TestMain(t *testing.T) {
	// every test case overwrites os.Args, restore it when done
	oldargs := os.Args
	defer func() { os.Args = oldargs }()

	httpmock.Activate()
	defer httpmock.DeactivateAndReset()

	// prepare httpmock responders
	SetIntercept(InitValidSources())

	// run commandline tests
	for _, tt := range tests {
		var buf bytes.Buffer

		os.Args = strings.Split(tt.args, " ")

		ret := Main(&buf)

		if ret != tt.exitcode {
			t.Errorf("%s with cmd <%s> did not exit with %d but %d",
				tt.name, tt.args, tt.exitcode, ret)
		}

		if !strings.Contains(buf.String(), tt.expect) {
			t.Errorf("%s with cmd <%s> output did not match.\nExpect: %s\n Got: %s\n",
				tt.name, tt.args, tt.expect, buf.String())
		}
	}

	// verify if downloaded ads match
	for _, ad := range adsrc {
		if err := VerifyAd(ad); err != nil {
			// report the error as a value: the message contains file
			// content and must not be interpreted as a format string
			t.Error(err)
		}
	}
}
// TestMainInvalids runs Main() once for every entry of invalidtests,
// with all HTTP requests being answered by the httpmock responders
// set up from InitInvalidSources(). For each case the exit code and
// the output written by Main() are checked.
func TestMainInvalids(t *testing.T) {
	// every test case overwrites os.Args, restore it when done
	saved := os.Args
	defer func() { os.Args = saved }()

	httpmock.Activate()
	defer httpmock.DeactivateAndReset()

	// prepare httpmock responders
	SetIntercept(InitInvalidSources())

	// run commandline tests
	for _, tc := range invalidtests {
		output := new(bytes.Buffer)

		os.Args = strings.Split(tc.args, " ")

		if code := Main(output); code != tc.exitcode {
			t.Errorf("%s with cmd <%s> did not exit with %d but %d",
				tc.name, tc.args, tc.exitcode, code)
		}

		if got := output.String(); !strings.Contains(got, tc.expect) {
			t.Errorf("%s with cmd <%s> output did not match.\nExpect: %s\n Got: %s\n",
				tc.name, tc.args, tc.expect, got)
		}
	}
}

View File

@@ -40,11 +40,6 @@ for D in $DIST; do
os=${D/\/*/} os=${D/\/*/}
arch=${D/*\//} arch=${D/*\//}
binfile="releases/${tool}-${os}-${arch}-${version}" binfile="releases/${tool}-${os}-${arch}-${version}"
if test "$os" = "windows"; then
binfile="${binfile}.exe"
fi
tardir="${tool}-${os}-${arch}-${version}" tardir="${tool}-${os}-${arch}-${version}"
tarfile="releases/${tool}-${os}-${arch}-${version}.tar.gz" tarfile="releases/${tool}-${os}-${arch}-${version}.tar.gz"
set -x set -x

197
scrape.go
View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023-2024 Thomas von Dein Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -18,32 +18,80 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"bytes"
"errors" "errors"
"fmt" "fmt"
"io"
"log/slog" "log/slog"
"path/filepath" "net/http"
"strings" "strings"
"sync"
"astuart.co/goq" "astuart.co/goq"
"golang.org/x/sync/errgroup"
) )
type Index struct {
Links []string `goquery:".text-module-begin a,[href]"`
}
type Ad struct {
Title string `goquery:"h1"`
Slug string
Id string
Condition string
Category string
Price string `goquery:"h2#viewad-price"`
Created string `goquery:"#viewad-extra-info,text"`
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Meta []string `goquery:".addetailslist--detail--value,text"`
}
func (ad *Ad) LogValue() slog.Value {
return slog.GroupValue(
slog.String("title", ad.Title),
slog.String("price", ad.Price),
slog.String("id", ad.Id),
slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)),
)
}
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", Useragent)
res, err := client.Do(req)
if err != nil {
return nil, err
}
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
return res.Body, nil
}
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func ScrapeUser(fetch *Fetcher) error { func Start(uid string, dir string, template string) error {
client := &http.Client{}
adlinks := []string{} adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User) baseuri := Baseuri + Listuri + "?userId=" + uid
page := 1 page := 1
uri := baseuri uri := baseuri
slog.Info("fetching ad pages", "user", fetch.Config.User) slog.Info("fetching ad pages", "user", uid)
for { for {
var index Index var index Index
slog.Debug("fetching page", "uri", uri) slog.Debug("fetching page", "uri", uri)
body, err := fetch.Get(uri) body, err := Get(uri, client)
if err != nil { if err != nil {
return err return err
} }
@@ -69,35 +117,32 @@ func ScrapeUser(fetch *Fetcher) error {
uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page) uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)
} }
for i, adlink := range adlinks { for _, adlink := range adlinks {
err := ScrapeAd(fetch, Baseuri+adlink) err := Scrape(Baseuri+adlink, dir, template)
if err != nil { if err != nil {
return err return err
} }
if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break
}
} }
return nil return nil
} }
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func ScrapeAd(fetch *Fetcher, uri string) error { func Scrape(uri string, dir string, template string) error {
client := &http.Client{}
ad := &Ad{} ad := &Ad{}
// extract slug and id from uri // extract slug and id from uri
uriparts := strings.Split(uri, "/") uriparts := strings.Split(uri, "/")
if len(uriparts) < 6 { if len(uriparts) < 6 {
return errors.New("invalid uri: " + uri) return errors.New("invalid uri")
} }
ad.Slug = uriparts[4] ad.Slug = uriparts[4]
ad.Id = uriparts[5] ad.Id = uriparts[5]
// get the ad // get the ad
slog.Debug("fetching ad page", "uri", uri) slog.Debug("fetching ad page", "uri", uri)
body, err := fetch.Get(uri) body, err := Get(uri, client)
if err != nil { if err != nil {
return err return err
} }
@@ -108,90 +153,70 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
if err != nil { if err != nil {
return err return err
} }
if len(ad.Meta) == 2 {
if len(ad.CategoryTree) > 0 { ad.Category = ad.Meta[0]
ad.Category = strings.Join(ad.CategoryTree, " => ") ad.Condition = ad.Meta[1]
} }
if ad.Incomplete() {
slog.Debug("got ad", "ad", ad)
return errors.New("could not extract ad data from page, got empty struct")
}
ad.CalculateExpire()
// write listing
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
fetch.Config.IncrAds() // write listing
err = WriteAd(dir, ad, template)
return ScrapeImages(fetch, ad, addir) if err != nil {
return err
} }
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error { return ScrapeImages(dir, ad)
}
func ScrapeImages(dir string, ad *Ad) error {
// fetch images // fetch images
img := 1 img := 1
adpath := filepath.Join(fetch.Config.Outdir, addir) var wg sync.WaitGroup
wg.Add(len(ad.Images))
// scan existing images, if any failure := make(chan string)
cache, err := ReadImages(adpath, fetch.Config.ForceDownload)
if err != nil {
return err
}
g := new(errgroup.Group)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
imguri := imguri file := fmt.Sprintf("%s/%d.jpg", dir, img)
file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img)) go func() {
g.Go(func() error { defer wg.Done()
body, err := fetch.Getimage(imguri) err := Getimage(imguri, file)
if err != nil { if err != nil {
return err failure <- err.Error()
return
} }
slog.Info("wrote ad image", "image", file)
buf := new(bytes.Buffer) }()
_, err = buf.ReadFrom(body)
if err != nil {
return err
}
buf2 := buf.Bytes() // needed for image writing
image := NewImage(buf, "", imguri)
err = image.CalcHash()
if err != nil {
return err
}
if !fetch.Config.ForceDownload {
if image.SimilarExists(cache) {
slog.Debug("similar image exists, not written", "uri", image.Uri)
return nil
}
}
err = WriteImage(file, buf2)
if err != nil {
return err
}
slog.Debug("wrote image", "image", image, "size", len(buf2))
return nil
})
img++ img++
} }
if err := g.Wait(); err != nil { close(failure)
return err wg.Wait()
} goterr := <-failure
fetch.Config.IncrImgs(len(ad.Images)) if goterr != "" {
return errors.New(goterr)
}
return nil
}
// fetch an image
func Getimage(uri, fileName string) error {
slog.Debug("fetching ad image", "uri", uri)
response, err := http.Get(uri)
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return errors.New("received non 200 response code")
}
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}
return nil return nil
} }

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023-2024 Thomas von Dein Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -18,114 +18,55 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"bytes" "io"
"fmt"
"log/slog" "log/slog"
"os" "os"
"path/filepath"
"runtime"
"strings" "strings"
tpl "text/template" tpl "text/template"
) )
func AdDirName(c *Config, ad *Ad) (string, error) { func WriteAd(dir string, ad *Ad, template string) error {
tmpl, err := tpl.New("adname").Parse(c.Adnametemplate)
if err != nil {
return "", err
}
buf := bytes.Buffer{}
err = tmpl.Execute(&buf, ad)
if err != nil {
return "", err
}
return buf.String(), nil
}
func WriteAd(c *Config, ad *Ad) (string, error) {
// prepare ad dir name
addir, err := AdDirName(c, ad)
if err != nil {
return "", err
}
// prepare output dir // prepare output dir
dir := filepath.Join(c.Outdir, addir) dir = dir + "/" + ad.Slug
err = Mkdir(dir) err := Mkdir(dir)
if err != nil { if err != nil {
return "", err return err
} }
// write ad file // write ad file
listingfile := filepath.Join(dir, "Adlisting.txt") listingfile := strings.Join([]string{dir, "Adlisting.txt"}, "/")
f, err := os.Create(listingfile) f, err := os.Create(listingfile)
if err != nil { if err != nil {
return "", err return err
} }
defer f.Close()
if runtime.GOOS == "windows" {
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\r\n")
} else {
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n") ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
}
tmpl, err := tpl.New("adlisting").Parse(c.Template) tmpl, err := tpl.New("adlisting").Parse(template)
if err != nil { if err != nil {
return "", err return err
} }
err = tmpl.Execute(f, ad) err = tmpl.Execute(f, ad)
if err != nil { if err != nil {
return "", err return err
} }
slog.Info("wrote ad listing", "listingfile", listingfile) slog.Info("wrote ad listing", "listingfile", listingfile)
return addir, nil return nil
} }
func WriteImage(filename string, buf []byte) error { func WriteImage(filename string, reader io.ReadCloser) error {
file, err := os.Create(filename) file, err := os.Create(filename)
if err != nil { if err != nil {
return err return err
} }
defer file.Close() defer file.Close()
_, err = file.Write(buf) _, err = io.Copy(file, reader)
if err != nil { if err != nil {
return err return err
} }
return nil return nil
} }
func ReadImage(filename string) (*bytes.Buffer, error) {
var buf bytes.Buffer
if !fileExists(filename) {
return nil, fmt.Errorf("image %s does not exist", filename)
}
data, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
_, err = buf.Write(data)
if err != nil {
return nil, err
}
return &buf, nil
}
func fileExists(filename string) bool {
info, err := os.Stat(filename)
if os.IsNotExist(err) {
return false
}
return !info.IsDir()
}

BIN
t/1.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1001 B

BIN
t/2.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1002 B

View File

@@ -1,6 +0,0 @@
# empty config for the Main() unit tests, to force them NOT to use a
# possibly existing ~/.kleingebaeck!
template="""
{{.Title}}{{.Price}}{{.Id}}{{.Category}}{{.Condition}}{{.Created}}
"""

View File

@@ -1,6 +0,0 @@
user = 1
loglevel = "verbose"
outdir = "t/out"
template="""
{{.Title}}{{.Price}}{{.Id}}{{.Category}}{{.Condition}}{{.Created}}
"""

View File

@@ -1,13 +0,0 @@
# Mock http server
Install ehfs from https://github.com/mjpclab/extra-http-file-server/.
Install p2cli from https://github.com/wrouesnel/p2cli.
Run `templates/render.sh` to build the file structure.
Run `server.sh` to start the http server.
To scrape an ad from it, use a URL like this:
http://localhost:8080/s-anzeige/first-ad/111-11-111

Binary file not shown.

Before

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 28 KiB

View File

@@ -1,4 +0,0 @@
#!/bin/sh
# Start the mock http server (ehfs) on localhost:8080.
# The directories ./kleinanzeigen and ./img are relative paths, so the
# script has to be started from the directory containing them.
# NOTE(review): presumably -a maps an url path to a local directory and
# -I names the index file - confirm against the ehfs documentation.
ehfs -a :/s-anzeige:./kleinanzeigen \
-a :/api/v1/prod-ads/images/fc:./img \
-l localhost:8080 -I index.html

View File

@@ -1,50 +0,0 @@
<!DOCTYPE html>
<html lang="de">
<head>
<title>Ad Listing</title>
</head>
<body>
<div class="l-container-row">
<div id="vap-brdcrmb" class="breadcrump">
<a class="breadcrump-link" itemprop="url" href="/" title="Kleinanzeigen ">
<span itemprop="title">Kleinanzeigen </span>
</a>
<a class="breadcrump-link" itemprop="url" href="/egal">
<span itemprop="title">{{ category }}</span></a>
</div>
</div>
{% for image in images %}
<div class="galleryimage-element" data-ix="3">
<img src="http://localhost:8080/api/v1/prod-ads/images/fc/{{ image.id }}?rule=$_59.JPG"/>
</div>
{% endfor %}
<h1 id="viewad-title" class="boxedarticle--title" itemprop="name" data-soldlabel="Verkauft">
{{ title }}</h1>
<div class="boxedarticle--flex--container">
<h2 class="boxedarticle--price" id="viewad-price">
{{ price }}</h2>
</div>
<div id="viewad-extra-info" class="boxedarticle--details--full">
<div><i class="icon icon-small icon-calendar-gray-simple"></i><span>{{ created }}</span></div>
</div>
<div class="splitlinebox l-container-row" id="viewad-details">
<ul class="addetailslist">
<li class="addetailslist--detail">
Zustand<span class="addetailslist--detail--value" >
{{ condition }}</span>
</li>
</ul>
</div>
<div class="l-container last-paragraph-no-margin-bottom">
<p id="viewad-description-text" class="text-force-linebreak " itemprop="description">
{{ text }}
</p>
</div>
</body>
</html>

View File

@@ -1,15 +0,0 @@
<!DOCTYPE html>
<html lang="de" >
<head>
<title>Ads</title>
</head>
<body>
{% for ad in ads %}
<h2 class="text-module-begin">
<a class="ellipsis"
href="/s-anzeige/{{ ad.slug }}/{{ ad.id }}">{{ ad.title }}</a>
</h2>
{% endfor %}
</body>
</html>

View File

@@ -1,13 +0,0 @@
#!/bin/sh -x
# Render the file structure of the mock site from the templates
# index.tpl and ad.tpl, using the variables in vars.yaml:
#
#   ../kleinanzeigen/s-bestandsliste.html    the ad list
#   ../kleinanzeigen/<slug>/<id>/index.html  one page per ad

# All files are referenced relative to the directory of this script,
# so change into it first. This way the script also works when called
# from elsewhere, e.g. as templates/render.sh.
cd "$(dirname "$0")" || exit 1

base="../kleinanzeigen"
mkdir -p "$base"

echo "Generating /s-bestandsliste.html"
p2cli -t index.tpl -i vars.yaml > "$base/s-bestandsliste.html"

# one page for each of the two ads defined in vars.yaml
for idx in 0 1; do
    slug=$(yq ".ads[$idx].slug" < vars.yaml)
    id=$(yq ".ads[$idx].id" < vars.yaml)
    mkdir -p "$base/$slug/$id"
    yq ".ads[$idx]" < vars.yaml | p2cli -t ad.tpl -f yaml > "$base/$slug/$id/index.html"
done

View File

@@ -1,27 +0,0 @@
---
# Variables for the mock site templates. Every entry of `ads`
# describes one ad: index.tpl uses slug, id and title to render the
# ad list, ad.tpl uses the remaining fields to render the ad page.
ads:
  - slug: first-ad
    id: "111-11-111"
    title: First Ad
    price: "19 €"
    condition: "Sehr gut"
    category: "Weitere Elektronik"
    created: "21.12.2023"
    images:
      - id: fcf6d664-5258-42c2-bf58-d1b8e9221574
      - id: fcf6d664-5258-42c2-bf58-as43as5d43as
    text: |
      Zu Verkaufen.
      Zahlung nur Paypal.
  - slug: second-ad
    id: "222-22-222"
    title: Second Ad
    price: "200 €"
    condition: "Sehr gut"
    category: "Elektronik"
    created: "21.12.2023"
    images:
      - id: cdas4sd5-5258-42c2-bf58-d1b8e9221574
      - id: cdas4sd5-5258-42c2-bf58-as43as5d43as
    text: |
      Zu Verkaufen.
      Zahlung nur Überweisung.

View File

@@ -1 +0,0 @@
user = "

13
util.go
View File

@@ -22,9 +22,6 @@ import (
"errors" "errors"
"os" "os"
"os/exec" "os/exec"
"runtime"
"github.com/mattn/go-isatty"
) )
func Mkdir(dir string) error { func Mkdir(dir string) error {
@@ -56,13 +53,3 @@ func man() error {
return nil return nil
} }
// returns TRUE if stdout is NOT a tty or windows
func IsNoTty() bool {
if runtime.GOOS == "windows" || !isatty.IsTerminal(os.Stdout.Fd()) {
return true
}
// it is a tty
return false
}