Compare commits

..

2 Commits

42 changed files with 2888 additions and 156 deletions

View File

@@ -5,4 +5,3 @@ title: "[bug-report]"
labels: bug
assignees: TLINDEN
---

Binary file not shown.

View File

@@ -1,10 +0,0 @@
version: 2
updates:
- package-ecosystem: "gomod"
directory: "/"
schedule:
interval: "monthly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"

47
.github/workflows/ci.yaml vendored Normal file
View File

@@ -0,0 +1,47 @@
name: build-and-test
on: [push, pull_request]
jobs:
build:
strategy:
matrix:
version: [1.21]
os: [ubuntu-latest, windows-latest, macos-latest]
name: Build
runs-on: ${{ matrix.os }}
steps:
- name: Set up Go
uses: actions/setup-go@v3
with:
go-version: ${{ matrix.version }}
id: go
- name: checkout
uses: actions/checkout@v3
- name: build
run: go build
- name: test
run: make test
- name: Update coverage report
uses: ncruces/go-coverage-report@main
with:
report: true
chart: true
amend: true
if: |
matrix.os == 'ubuntu-latest' &&
github.event_name == 'push'
continue-on-error: true
golangci:
name: lint
runs-on: ubuntu-latest
steps:
- uses: actions/setup-go@v3
with:
go-version: 1.21
- uses: actions/checkout@v3
- name: golangci-lint
uses: golangci/golangci-lint-action@v3

34
.github/workflows/pushimage.yaml vendored Normal file
View File

@@ -0,0 +1,34 @@
name: build-push-image
on:
push:
tags:
- 'v*'
jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Log in to the Container registry
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
with:
registry: https://ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
push: true
tags: ghcr.io/tlinden/kleingebaeck:${{ github.ref_name}}
- name: Build and push latest Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
push: true
tags: ghcr.io/tlinden/kleingebaeck:latest

27
Dockerfile Normal file
View File

@@ -0,0 +1,27 @@
FROM golang:1.21-alpine as builder
RUN apk update
RUN apk upgrade
RUN apk add --no-cache git make
RUN git --version
WORKDIR /work
COPY go.mod .
COPY . .
RUN go mod download
RUN make
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"
WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck
ENV KLEINGEBAECK_OUTDIR /backup
ENV LANG C.UTF-8
USER 1001:1001
ENTRYPOINT ["/app/kleingebaeck"]
CMD ["-h"]

92
Makefile Normal file
View File

@@ -0,0 +1,92 @@
# Copyright © 2023 Thomas von Dein
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# no need to modify anything below
tool = kleingebaeck
VERSION = $(shell grep VERSION config.go | head -1 | cut -d '"' -f2)
archs = darwin freebsd linux windows
PREFIX = /usr/local
UID = root
GID = 0
HAVE_POD := $(shell pod2text -h 2>/dev/null)
all: $(tool).1 $(tool).go buildlocal
%.1: %.pod
ifdef HAVE_POD
pod2man -c "User Commands" -r 1 -s 1 $*.pod > $*.1
endif
%.go: %.pod
ifdef HAVE_POD
echo "package main" > $*.go
echo >> $*.go
echo "var manpage = \`" >> $*.go
pod2text $*.pod >> $*.go
echo "\`" >> $*.go
endif
buildlocal:
CGO_LDFLAGS='-static' go build -tags osusergo,netgo -ldflags "-extldflags=-static" -o $(tool)
install: buildlocal
install -d -o $(UID) -g $(GID) $(PREFIX)/bin
install -d -o $(UID) -g $(GID) $(PREFIX)/man/man1
install -o $(UID) -g $(GID) -m 555 $(tool) $(PREFIX)/sbin/
install -o $(UID) -g $(GID) -m 444 $(tool).1 $(PREFIX)/man/man1/
clean:
rm -rf $(tool) coverage.out testdata t/out
test: clean
mkdir -p t/out
go test ./... $(ARGS)
testfuzzy: clean
go test -fuzz ./... $(ARGS)
singletest:
@echo "Call like this: make singletest TEST=TestPrepareColumns ARGS=-v"
go test -run $(TEST) $(ARGS)
cover-report:
go test ./... -cover -coverprofile=coverage.out
go tool cover -html=coverage.out
goupdate:
go get -t -u=patch ./...
buildall:
./mkrel.sh $(tool) $(VERSION)
release: buildall
gh release create v$(VERSION) --generate-notes releases/*
show-versions: buildlocal
@echo "### kleingebaeck version:"
@./kleingebaeck -V
@echo
@echo "### go module versions:"
@go list -m all
@echo
@echo "### go version used for building:"
@grep -m 1 go go.mod
lint:
golangci-lint run -p bugs -p unused

View File

@@ -1,64 +1,52 @@
## Kleingebäck - kleinanzeigen.de Backup
![Kleingebaeck Logo](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleingebaecklogo-small.png)
![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png)
[![Go Report Card](https://goreportcard.com/badge/codeberg.org/scip/kleingebaeck)](https://goreportcard.com/report/codeberg.org/scip/kleingebaeck)
[![status-badge](https://ci.codeberg.org/api/badges/15530/status.svg)](https://ci.codeberg.org/repos/15530)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
[![Actions](https://github.com/tlinden/kleingebaeck/actions/workflows/ci.yaml/badge.svg)](https://github.com/tlinden/kleingebaeck/actions)
[![Go Coverage](https://github.com/tlinden/kleingebaeck/wiki/coverage.svg)](https://raw.githack.com/wiki/tlinden/kleingebaeck/coverage.html)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://codeberg.org/scip/kleingebaeck/releases)
[![English](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/english.png)](https://codeberg.org/scip/kleingebaeck/raw/branch/main/README.md)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
[![English](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/english.png)](https://github.com/tlinden/kleingebaeck/blob/main/README.md)
Mit diesem Tool kann man seine Anzeigen bei https://kleinanzeigen.de sichern.
> [!IMPORTANT]
> Diese Software wird jetzt bei Codeberg weitergepflegt: [Codeberg](https://codeberg.org/scip/kleingebaeck/).
Es kann alle Anzeigen eines Users (oder nur eine Ausgewählte)
inklusive der Bilder herunterladen, die in einem Verzeichnis pro
Anzeige gespeichert werden. In dem Verzeichnis wird eine Datei
`Adlisting.txt` erstellt, in der sich die Inhalte der Anzeige wie
Titel, Preis, Text etc befinden. Bilder werden natürlich auch heruntergeladen.
## ACHTUNG - SICHERHEITS-UPDATE
Fertige vorcompilierte Programme älter als Version `v0.3.12` sind von
Schwachstellen in der Behandlung von HTTP und Zertifikaten
betroffen. Falls Du eine ältere Kleingebäck-Version im Einsatz hast,
bitte update auf Version `v0.3.12` oder höher. Bitte lies auch die [Release Notes für
v0.3.12](https://codeberg.org/scip/kleingebaeck/releases/tag/v0.3.12)
für mehr Details.
## Screenshots
Das ist die Hauptseite meines kleinanzeigen.de Accounts:
![Index](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-index.png)
![Index](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-index.png)
Sichern ich meine Anzeigen:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-download.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-download.png)
Backupverzeichnis nach dem Download:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-backup.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-backup.png)
Verzeichnis einer Anzeige:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-ad.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-ad.png)
**Das gleiche unter Windows:**
Anzeigen Sichern:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/cmd-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/cmd-windows.jpg)
Backupverzeichnis nach dem Download
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/liste-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/liste-windows.jpg)
Und eine Anzeige:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/adlisting-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/adlisting-windows.jpg)
## Installation
@@ -68,7 +56,7 @@ für seine Plattform herunter und kann direkt loslegen.
### Installation des vorcompilierten Programms
Auf der Seite [des letzten Releases](https://codeberg.org/scip/kleingebaeck/releases) findet man das Program für sein Betriebssystem und die Plattform (z.b. Windows + Intel)
Auf der Seite [des letzten Releases](https://github.com/TLINDEN/kleingebaeck/releases/latest) findet man das Program für sein Betriebssystem und die Plattform (z.b. Windows + Intel)
Es gibt 2 Varianten:
@@ -107,7 +95,7 @@ hilfreich, aber nicht unbedingt erforderlich.
Um das Programm zu compilieren, muss man folgende Schritte ausführen:
```shell
git clone https://codeberg.org/scip/kleingebaeck.git
git clone https://github.com/TLINDEN/kleingebaeck.git
cd kleingebaeck
go mod tidy
make # (oder make)
@@ -148,7 +136,7 @@ gemountet.
Die Optionen `-u XXX -v` sind kleingebäck Optionen. Ersetze `XXX`
durch Deine tatsächliche kleinanzeigen.de Userid.
Eine Liste verfügbarer Images findet man [hier](https://codeberg.org/scip/kleingebaeck/pkgs/container/kleingebaeck/versions?filters%5Bversion_type%5D=tagged)
Eine Liste verfügbarer Images findet man [hier](https://github.com/tlinden/kleingebaeck/pkgs/container/kleingebaeck/versions?filters%5Bversion_type%5D=tagged)
## Kommandozeilen Optionen:
@@ -233,53 +221,10 @@ Sowie alle Bilder.
Das Format kann man mit der Variable `template` in der Konfiguration
ändern. Die `example.conf` enthält ein Beispiel für das Standard Template.
## Verhalten des Tools
Es gibt einige Dinge über das Verhalten von kleingebäck, über die Du
Bescheid wissen solltest:
- alle HTML Seiten und Bilder werden immer heruntergeladen
- es wird ein (konfigurierbarer) Useragent verwendet
- HTTP Cookies werden beachtet
- bei Fehlern wird dreimal mit unterschiedlichem Abstand erneut
versucht
- Bilder Downloads laufen parallelisiert mit leicht unterschiedlichen
zeitlichen Abständen ab
- Gleich aussehende Bilder werden nicht überschrieben
Der letzte Punkt muss genauer erläutert werden:
Wenn man bei Kleinanzeigen.de eine Anzeige einstellt und Bilder
postet, werden diese dort in ihrer Grösse reduziert (durch Kompression
und Verkleinerung der Bilder usw.). Diese reduzierten Bilder werden
dann von kleingebäck heruntergeladen. Falls Du Deine original Bilder
behalten hast, kannst Du diese danach in das Backupverzeichnis
kopieren. Bei einem erneuten kleingebäck-Lauf werden diese Bilder dann
nicht überschrieben.
Wir verwenden dafür einen Algorythmus namens [distance
hashing](https://github.com/corona10/goimagehash). Dieser Algorithmus
prüft die Ähnlichkeit von Bildern. Diese können in ihrer Auflösung,
Kompression, Farbtiefe und vielem mehr manipuliert worden sein und
trotzdem als das "gleiche Bild" erkannt werden (wohlgemerkt nicht "das
selbe": die Dateien sind durchaus unterschiedlich!). Bis zu einer
Distance von 5 überschreiben wir keine Bilder, weil wir dann davon
ausgehen, dass das lokal Vorhandene das Original ist.
Bitte beachte aber, dass dies KEIN Cachingmechanismus ist: die Bilder
werden trotzdem immer alle heruntergeladen. Das muss so sein, da wir
uns nicht die Dateinamen anschauen können, da kleinanzeigen.de diese
nämlich zu Zahlen umbenennt. Und die Dateinamen können sich auch
ändern, wenn der User in der Anzeige die Bilder umarrangiert hat.
Du kannst dieses Verhalten mit der Option **--force** ausschalten. Du
kannst ausserdem mit der Option **--ignoreerrors** auch alle Fehler
ignorieren, die beim Bilderdownload auftreten könnten.
## Documentation
Die Dokumentation kann man
[online](https://codeberg.org/scip/kleingebaeck/raw/branch/main/kleingebaeck.pod)
[online](https://github.com/TLINDEN/kleingebaeck/blob/main/kleingebaeck.pod)
oder lokal lesen mit: `kleingebaeck --manual`. Hat man das Tool mit
dem Tarball installiert, funktioniert auch `man kleingebaeck`.
@@ -295,7 +240,7 @@ das doch der beste Weg, die Anfrage zu übersehen und zu vergessen.
Um einen Fehler, ein unerwartetes Verhalten, eine Feature Request oder
einen Patch zu übermitteln, eröffne daher bitte einen Issue unter:
https://codeberg.org/scip/kleingebaeck/issues. Danke!
https://github.com/TLINDEN/kleingebaeck/issues. Danke!
Bitte gebe den fehlgeschlagenen Befehl an, rufe es auch mit Debugging
`-d` auf.

View File

@@ -1,18 +1,16 @@
## Kleingebäck - kleinanzeigen.de Backup
![Kleingebaeck Logo](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleingebaecklogo-small.png)
![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png)
[![Go Report Card](https://goreportcard.com/badge/codeberg.org/scip/kleingebaeck)](https://goreportcard.com/report/codeberg.org/scip/kleingebaeck)
[![status-badge](https://ci.codeberg.org/api/badges/15530/status.svg)](https://ci.codeberg.org/repos/15530)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
[![Actions](https://github.com/tlinden/kleingebaeck/actions/workflows/ci.yaml/badge.svg)](https://github.com/tlinden/kleingebaeck/actions)
[![Go Coverage](https://github.com/tlinden/kleingebaeck/wiki/coverage.svg)](https://raw.githack.com/wiki/tlinden/kleingebaeck/coverage.html)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://codeberg.org/scip/kleingebaeck/releases)
[![German](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/german.png)](https://codeberg.org/scip/kleingebaeck/raw/branch/main/README-de.md)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
[![German](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/german.png)](https://github.com/tlinden/kleingebaeck/blob/main/README-de.md)
[Die deutsche Version des READMEs findet Ihr hier](README-de.md).
> [!IMPORTANT]
> This software is now being maintained on [Codeberg](https://codeberg.org/scip/kleingebaeck/).
This tool can be used to backup ads on the german ad page https://kleinanzeigen.de
It downloads all (or only the specified ones) ads of one user into a
@@ -20,46 +18,37 @@ directory, each ad into its own subdirectory. The backup will contain
a textfile `Adlisting.txt` which contains the ad contents as the
title, body, price etc. All images will be downloaded as well.
## CAUTION - SECURITY UPDATE
Binary releases prior to version `v0.3.11` are affected by
vulnerabilities in HTTP and certificate handling. If you are using
such a binary, please update to `v0.3.12` or higher. Please also refer
to the [Release Notes of
v0.3.12](https://codeberg.org/scip/kleingebaeck/releases/tag/v0.3.12)
for more details.
## Screenshots
This is the index of my kleinanzeigen.de Account:
![Index](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-index.png)
![Index](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-index.png)
Here I download my ads on the commandline:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-download.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-download.png)
And this is the backup directory after download:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-backup.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-backup.png)
Here's a directory for one ad:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/kleinanzeigen-ad.png)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleinanzeigen-ad.png)
**The same thing under windows:**
Downloading ads:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/cmd-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/cmd-windows.jpg)
Backup directory after download:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/liste-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/liste-windows.jpg)
And one ad listing directory:
![Download](https://codeberg.org/scip/kleingebaeck/raw/branch/main/.github/assets/adlisting-windows.jpg)
![Download](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/adlisting-windows.jpg)
## Installation
@@ -70,7 +59,7 @@ releases page and you're good to go.
### Installation using a pre-compiled binary
Go to the [latest release
page](https://codeberg.org/scip/kleingebaeck/releases) and
page](https://github.com/TLINDEN/kleingebaeck/releases/latest) and
look for your OS and platform. There are two options to install the binary:
1. Directly download the binary for your platform,
@@ -140,7 +129,7 @@ docker run. And the local directory `myads` will be mapped to
The options `-u XXX -v` are kleingebaeck options, replace `XXX` with
your actual kleinanzeigen.de user id.
A list of available images is [here](https://codeberg.org/scip/kleingebaeck/pkgs/container/kleingebaeck/versions?filters%5Bversion_type%5D=tagged)
A list of available images is [here](https://github.com/tlinden/kleingebaeck/pkgs/container/kleingebaeck/versions?filters%5Bversion_type%5D=tagged)
## Commandline options:
@@ -203,11 +192,9 @@ somewhat like this:
```default
Title: A book I sell
Price: 99 € VB
Shipping: 6,90 €
Id: 1919191919
Category: Sachbücher
Condition: Sehr Gut
Type: Buch
Created: 10.12.2023
This is the description text.
@@ -220,51 +207,9 @@ variable. The supplied sample config contains the default template.
All images will be stored in the same directory.
## Tool Behavior
There are a bunch of things you might want to know about the behavior
of the kleingebäck tool:
- all HTML pages and IMAGEs are always being downloaded
- we use a (customizable) user agent
- we respect HTTP cookies
- in the case of an error, the tool does 3 retries, the time it waits
between tries is longer for each retry
- image download is parallized using small time differences to look
more natural
- same images are not being overwritten on subsequent download
The latter needs to be elaborated a bit more:
If you publish an ad on kleinanzeigen.de and post images, those images
will be reduced in size by the site (by compressing and down sizing
them). This reduced images will be downloaded by kleingebäck. However,
you may still own the original images and may want to put them into
that backup directory so that you have all things for one ad together.
You can easily do that, because kleingebäck won't overwrite those
original images. It uses something called a distance hash using
[goimagehash](https://github.com/corona10/goimagehash). This
algorithmus checks the similarity of images. If an image has been
resized it is still very similar to the original one. We accept a
maximum of a distance of 5, everything above leads to overwrite.
This works with resizes, cropped and otherwise manipulated images as
long as the image still shows the original contents good enough.
Also note, that this is NOT a caching mechanism: the images will be
downloaded anyway during each run. We also can't look at the file
names because kleinanzeigen.de renames all images to numbers. And
those might even change if the user re-arranges the images.
You can override this behavior using the **--force** option. Another
option, **--ignoreerrors**, can be used to ignore all kinds of image
errors.
## Documentation
You can read the documentation [online](https://codeberg.org/scip/kleingebaeck/raw/branch/main/kleingebaeck.pod) or locally once you have installed kleingebaeck with: `kleingebaeck --manual`.
You can read the documentation [online](https://github.com/TLINDEN/kleingebaeck/blob/main/kleingebaeck.pod) or locally once you have installed kleingebaeck with: `kleingebaeck --manual`.
## Kleingebäck?
@@ -280,7 +225,7 @@ that's the best way for me to forget to do something.
In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github:
https://codeberg.org/scip/kleingebaeck/issues.
https://github.com/TLINDEN/kleingebaeck/issues.
Please repeat the failing command with debugging enabled `-d` and
include the output in the issue.

82
ad.go Normal file
View File

@@ -0,0 +1,82 @@
/*
Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"log/slog"
"strings"
"time"
)
type Index struct {
Links []string `goquery:".text-module-begin a,[href]"`
}
type Ad struct {
Title string `goquery:"h1"`
Slug string
Id string
Condition string `goquery:".addetailslist--detail--value,text"`
Category string
CategoryTree []string `goquery:".breadcrump-link,text"`
Price string `goquery:"h2#viewad-price"`
Created string `goquery:"#viewad-extra-info,text"`
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Expire string
}
// Used by slog to pretty print an ad
func (ad *Ad) LogValue() slog.Value {
return slog.GroupValue(
slog.String("title", ad.Title),
slog.String("price", ad.Price),
slog.String("id", ad.Id),
slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)),
slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
slog.String("condition", ad.Condition),
slog.String("created", ad.Created),
slog.String("expire", ad.Expire),
)
}
// check for completeness. I erected these fields to be mandatory
// (though I really don't know if they really are). I consider images
// and meta optional. So, if either of the checked fields here is
// empty we return an error. All the checked fields are extracted
// using goquery. However, I think price is optional since there are
// ads for gifts as well.
//
// Note: we return true for "ad is incomplete" and false for "ad is complete"!
func (ad *Ad) Incomplete() bool {
if ad.Category == "" || ad.Created == "" || ad.Text == "" {
return true
}
return false
}
func (ad *Ad) CalculateExpire() {
if len(ad.Created) > 0 {
ts, err := time.Parse("02.01.2006", ad.Created)
if err == nil {
ad.Expire = ts.AddDate(0, 2, 1).Format("02.01.2006")
}
}
}

203
config.go Normal file
View File

@@ -0,0 +1,203 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"github.com/knadh/koanf/parsers/toml"
"github.com/knadh/koanf/providers/confmap"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file"
"github.com/knadh/koanf/providers/posflag"
"github.com/knadh/koanf/v2"
flag "github.com/spf13/pflag"
)
const (
VERSION string = "0.3.0"
Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html"
Defaultdir string = "."
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\n" +
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" +
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\n" +
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}"
)
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...]
Options:
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
If one or more ad listing url's are specified, only backup those,
otherwise backup all ads of the given user.`
type Config struct {
Verbose bool `koanf:"verbose"` // loglevel=info
Debug bool `koanf:"debug"` // loglevel=debug
Showversion bool `koanf:"version"` // -v
Showhelp bool `koanf:"help"` // -h
Showmanual bool `koanf:"manual"` // -m
User int `koanf:"user"`
Outdir string `koanf:"outdir"`
Template string `koanf:"template"`
Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
ForceDownload bool `koanf:"force"`
Adlinks []string
StatsCountAds int
StatsCountImages int
}
func (c *Config) IncrAds() {
c.StatsCountAds++
}
func (c *Config) IncrImgs(num int) {
c.StatsCountImages += num
}
// load commandline flags and config file
func InitConfig(w io.Writer) (*Config, error) {
var k = koanf.New(".")
// determine template based on os
template := DefaultTemplate
if runtime.GOOS == "windows" {
template = DefaultTemplateWin
}
// Load default values using the confmap provider.
if err := k.Load(confmap.Provider(map[string]interface{}{
"template": template,
"outdir": ".",
"loglevel": "notice",
"userid": 0,
"adnametemplate": DefaultAdNameTemplate,
}, "."), nil); err != nil {
return nil, err
}
// setup custom usage
f := flag.NewFlagSet("config", flag.ContinueOnError)
f.Usage = func() {
fmt.Fprintln(w, Usage)
os.Exit(0)
}
// parse commandline flags
f.StringP("config", "c", "", "config file")
f.StringP("outdir", "o", "", "directory where to store ads")
f.IntP("user", "u", 0, "user id")
f.IntP("limit", "l", 0, "limit ads to be downloaded (default 0, unlimited)")
f.BoolP("verbose", "v", false, "be verbose")
f.BoolP("debug", "d", false, "enable debug log")
f.BoolP("version", "V", false, "show program version")
f.BoolP("help", "h", false, "show usage")
f.BoolP("manual", "m", false, "show manual")
f.BoolP("force", "f", false, "force")
if err := f.Parse(os.Args[1:]); err != nil {
return nil, err
}
// generate a list of config files to try to load, including the
// one provided via -c, if any
var configfiles []string
configfile, _ := f.GetString("config")
home, _ := os.UserHomeDir()
if configfile != "" {
configfiles = []string{configfile}
} else {
configfiles = []string{
"/etc/kleingebaeck.conf", "/usr/local/etc/kleingebaeck.conf", // unix variants
filepath.Join(home, ".config", "kleingebaeck", "config"),
filepath.Join(home, ".kleingebaeck"),
"kleingebaeck.conf",
}
}
// Load the config file[s]
for _, cfgfile := range configfiles {
if path, err := os.Stat(cfgfile); !os.IsNotExist(err) {
if !path.IsDir() {
if err := k.Load(file.Provider(cfgfile), toml.Parser()); err != nil {
return nil, errors.New("error loading config file: " + err.Error())
}
}
}
// else: we ignore the file if it doesn't exists
}
// env overrides config file
if err := k.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string {
return strings.Replace(strings.ToLower(
strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".", -1)
}), nil); err != nil {
return nil, errors.New("error loading environment: " + err.Error())
}
// command line overrides env
if err := k.Load(posflag.Provider(f, ".", k), nil); err != nil {
return nil, errors.New("error loading flags: " + err.Error())
}
// fetch values
conf := &Config{}
if err := k.Unmarshal("", &conf); err != nil {
return nil, errors.New("error unmarshalling: " + err.Error())
}
// adjust loglevel
switch conf.Loglevel {
case "verbose":
conf.Verbose = true
case "debug":
conf.Debug = true
}
// are there any args left on commandline? if so threat them as adlinks
conf.Adlinks = f.Args()
return conf, nil
}

22
docker-compose.yaml Normal file
View File

@@ -0,0 +1,22 @@
version: "3.9"
services:
init:
image: alpine:latest
user: "root"
group_add:
- '${GROUP_ID}'
volumes:
- ${OUTDIR}:/backup
command: chown -R ${USER_ID}:${USER_ID} /backup
kleingebaeck:
container_name: kleingebaeck
user: "${USER_ID}:${USER_ID}"
volumes:
- ${OUTDIR}:/backup
working_dir: /backup
build: .
image: kleingebaeck:latest
depends_on:
init:
condition: service_completed_successfully

30
example.conf Normal file
View File

@@ -0,0 +1,30 @@
#
# kleingebaeck sample configuration file.
# put this to ~/.kleingebaeck.
#
# Comments start with the '#' character.
# kleinanzeigen.de user-id. must be an unquoted number
user = 00000000
# enable verbose output (same as -v), may be true or false.
# other values: notice or debug
loglevel = "verbose"
# directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string.
outdir = "test"
# template for stored adlistings. To enable it, remove the comment
# chars up until the last #"""
#template="""
#Title: {{.Title}}
#Price: {{.Price}}
#Id: {{.Id}}
#Category: {{.Category}}
#Condition: {{.Condition}}
#Created: {{.Created}}
#{{.Text}}
# """

75
fetch.go Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"io"
"log/slog"
"net/http"
)
// convenient wrapper to fetch some web content
type Fetcher struct {
Config *Config
Client *http.Client
Useragent string // FIXME: make configurable
}
func NewFetcher(c *Config) *Fetcher {
return &Fetcher{
Client: &http.Client{Transport: &loggingTransport{}}, // implemented in http.go
Useragent: Useragent, // default in config.go
Config: c,
}
}
func (f *Fetcher) Get(uri string) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", f.Useragent)
res, err := f.Client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != 200 {
return nil, errors.New("could not get page via HTTP")
}
return res.Body, nil
}
// fetch an image
func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) {
slog.Debug("fetching ad image", "uri", uri)
body, err := f.Get(uri)
if err != nil {
if f.Config.IgnoreErrors {
slog.Info("Failed to download image, error ignored", "error", err.Error())
return nil, nil
}
return nil, err
}
return body, nil
}

38
go.mod Normal file
View File

@@ -0,0 +1,38 @@
module kleingebaeck
go 1.21
require (
astuart.co/goq v1.0.0
github.com/jarcoal/httpmock v1.3.1
github.com/knadh/koanf/parsers/toml v0.1.0
github.com/knadh/koanf/providers/confmap v0.1.0
github.com/knadh/koanf/providers/env v0.1.0
github.com/knadh/koanf/providers/file v0.1.0
github.com/knadh/koanf/providers/posflag v0.1.0
github.com/knadh/koanf/v2 v2.0.1
github.com/lmittmann/tint v1.0.4
github.com/mattn/go-isatty v0.0.20
github.com/spf13/pflag v1.0.5
github.com/tlinden/yadu v0.1.1
golang.org/x/sync v0.5.0
)
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/corona10/goimagehash v1.1.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.14.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

86
go.sum Normal file
View File

@@ -0,0 +1,86 @@
astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw=
astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww=
github.com/jarcoal/httpmock v1.3.1/go.mod h1:3yb8rc4BI7TCBhFY8ng0gjuLKJNquuDNiPaZjnENuYg=
github.com/knadh/koanf/maps v0.1.1 h1:G5TjmUh2D7G2YWf5SQQqSiHRJEjaicvU0KpypqB3NIs=
github.com/knadh/koanf/maps v0.1.1/go.mod h1:npD/QZY3V6ghQDdcQzl1W4ICNVTkohC8E73eI2xW4yI=
github.com/knadh/koanf/parsers/toml v0.1.0 h1:S2hLqS4TgWZYj4/7mI5m1CQQcWurxUz6ODgOub/6LCI=
github.com/knadh/koanf/parsers/toml v0.1.0/go.mod h1:yUprhq6eo3GbyVXFFMdbfZSo928ksS+uo0FFqNMnO18=
github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU=
github.com/knadh/koanf/providers/confmap v0.1.0/go.mod h1:2uLhxQzJnyHKfxG927awZC7+fyHFdQkd697K4MdLnIU=
github.com/knadh/koanf/providers/env v0.1.0 h1:LqKteXqfOWyx5Ab9VfGHmjY9BvRXi+clwyZozgVRiKg=
github.com/knadh/koanf/providers/env v0.1.0/go.mod h1:RE8K9GbACJkeEnkl8L/Qcj8p4ZyPXZIQ191HJi44ZaQ=
github.com/knadh/koanf/providers/file v0.1.0 h1:fs6U7nrV58d3CFAFh8VTde8TM262ObYf3ODrc//Lp+c=
github.com/knadh/koanf/providers/file v0.1.0/go.mod h1:rjJ/nHQl64iYCtAW2QQnF0eSmDEX/YZ/eNFj5yR6BvA=
github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHYujwipe7Ie3qW6U=
github.com/knadh/koanf/providers/posflag v0.1.0/go.mod h1:SYg03v/t8ISBNrMBRMlojH8OsKowbkXV7giIbBVgbz0=
github.com/knadh/koanf/v2 v2.0.1 h1:1dYGITt1I23x8cfx8ZnldtezdyaZtfAuRtIFOiRzK7g=
github.com/knadh/koanf/v2 v2.0.1/go.mod h1:ZeiIlIDXTE7w1lMT6UVcNiRAS2/rCeLn/GdLNvY1Dus=
github.com/lmittmann/tint v1.0.4 h1:LeYihpJ9hyGvE0w+K2okPTGUdVLfng1+nDNVR4vWISc=
github.com/lmittmann/tint v1.0.4/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g=
github.com/maxatome/go-testdeep v1.12.0/go.mod h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM=
github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw=
github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355 h1:EmgK+IGUz2m42bFKteLY5SYJLn/CyBrz6nkgS22K8Bk=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
github.com/tlinden/yadu v0.1.0 h1:qtCi1jxg392qVRLFyrJ2LYu6/PiKSp1LT02EX+mNLME=
github.com/tlinden/yadu v0.1.0/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
github.com/tlinden/yadu v0.1.1 h1:116oEUy9b4PcMF5wLL2dCFA/sn/praYutOnao07MROw=
github.com/tlinden/yadu v0.1.1/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

129
http.go Normal file
View File

@@ -0,0 +1,129 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"io"
"log/slog"
"math"
"math/rand"
"net/http"
"time"
)
// I add an artificial "ID" to each HTTP request and the corresponding
// respose for debugging purposes so that the pair of them can be
// easier associated in debug output
var letters = []rune("ABCDEF0123456789")
func getid() string {
b := make([]rune, 8)
for i := range b {
b[i] = letters[rand.Intn(len(letters))]
}
return string(b)
}
// retry after HTTP 50x errors or err!=nil
const RetryCount = 3
// used to inject debug log and implement retries
type loggingTransport struct{}
// escalating timeout, $retry^2 seconds
func backoff(retries int) time.Duration {
return time.Duration(math.Pow(2, float64(retries))) * time.Second
}
// only retry in case of errors or certain non 200 HTTP codes
func shouldRetry(err error, resp *http.Response) bool {
if err != nil {
return true
}
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout {
return true
}
return false
}
// Body needs to be drained, otherwise we can't reuse the http.Response
func drainBody(resp *http.Response) {
if resp != nil {
if resp.Body != nil {
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
// unable to copy data? uff!
panic(err)
}
resp.Body.Close()
}
}
}
// the actual logging transport with retries
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
// just requred for debugging
id := getid()
// clone the request body, put into request on retry
var bodyBytes []byte
if req.Body != nil {
bodyBytes, _ = io.ReadAll(req.Body)
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
slog.Debug("REQUEST", "id", id, "uri", req.URL, "host", req.Host)
// first try
resp, err := http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength)
}
// enter retry check and loop, if first req were successfull, leave loop immediately
retries := 0
for shouldRetry(err, resp) && retries < RetryCount {
time.Sleep(backoff(retries))
// consume any response to reuse the connection.
drainBody(resp)
// clone the request body again
if req.Body != nil {
req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
}
// actual retry
resp, err = http.DefaultTransport.RoundTrip(req)
if err == nil {
slog.Debug("RESPONSE", "id", id, "status", resp.StatusCode,
"contentlength", resp.ContentLength, "retry", retries)
}
retries++
}
return resp, err
}

142
image.go Normal file
View File

@@ -0,0 +1,142 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"image/jpeg"
"log/slog"
"os"
"path/filepath"
"github.com/corona10/goimagehash"
)
const MaxDistance = 3
type Image struct {
Filename string
Hash *goimagehash.ImageHash
Data *bytes.Buffer
Uri string
}
// used for logging to avoid printing Data
func (img *Image) LogValue() slog.Value {
return slog.GroupValue(
slog.String("filename", img.Filename),
slog.String("uri", img.Uri),
slog.String("hash", img.Hash.ToString()),
)
}
// holds all images of an ad
type Cache []*goimagehash.ImageHash
func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
img := &Image{
Filename: filename,
Uri: uri,
Data: buf,
}
return img
}
// Calculate diff hash of the image
func (img *Image) CalcHash() error {
jpgdata, err := jpeg.Decode(img.Data)
if err != nil {
return err
}
hash1, err := goimagehash.DifferenceHash(jpgdata)
if err != nil {
return err
}
img.Hash = hash1
return nil
}
// checks if 2 images are similar enough to be considered the same
func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
distance, err := img.Hash.Distance(hash)
if err != nil {
slog.Debug("failed to compute diff hash distance", "error", err)
return false
}
if distance < MaxDistance {
slog.Debug("distance computation", "image-A", img.Hash.ToString(),
"image-B", hash.ToString(), "distance", distance)
return true
} else {
return false
}
}
// check current image against all known hashes.
func (img *Image) SimilarExists(cache Cache) bool {
for _, otherimg := range cache {
if img.Similar(otherimg) {
return true
}
}
return false
}
// read all JPG images in a ad directory, compute diff hashes and
// store the results in the slice Images
func ReadImages(addir string, dont bool) (Cache, error) {
files, err := os.ReadDir(addir)
if err != nil {
return nil, err
}
cache := Cache{}
if dont {
// forced download, -f given
return cache, nil
}
for _, file := range files {
ext := filepath.Ext(file.Name())
if !file.IsDir() && (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG") {
filename := filepath.Join(addir, file.Name())
data, err := ReadImage(filename)
if err != nil {
return nil, err
}
img := NewImage(data, filename, "")
if err = img.CalcHash(); err != nil {
return nil, err
}
slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
cache = append(cache, img.Hash)
}
}
//return nil, errors.New("ende")
return cache, nil
}

273
kleingebaeck.1 Normal file
View File

@@ -0,0 +1,273 @@
.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.42)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings. \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote. \*(C+ will
.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
. ds -- \(*W-
. ds PI pi
. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
. ds L" ""
. ds R" ""
. ds C` ""
. ds C' ""
'br\}
.el\{\
. ds -- \|\(em\|
. ds PI \(*p
. ds L" ``
. ds R" ''
. ds C`
. ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
. if \nF \{\
. de IX
. tm Index:\\$1\t\\n%\t"\\$2"
..
. if !\nF==2 \{\
. nr % 0
. nr F 2
. \}
. \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
. \" fudge factors for nroff and troff
.if n \{\
. ds #H 0
. ds #V .8m
. ds #F .3m
. ds #[ \f1
. ds #] \fP
.\}
.if t \{\
. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
. ds #V .6m
. ds #F 0
. ds #[ \&
. ds #] \&
.\}
. \" simple accents for nroff and troff
.if n \{\
. ds ' \&
. ds ` \&
. ds ^ \&
. ds , \&
. ds ~ ~
. ds /
.\}
.if t \{\
. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
. \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
. \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
. \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
. ds : e
. ds 8 ss
. ds o a
. ds d- d\h'-1'\(ga
. ds D- D\h'-1'\(hy
. ds th \o'bp'
. ds Th \o'LP'
. ds ae ae
. ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS"
.IX Header "SYNOPSYS"
.Vb 10
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options:
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
\& \-d \-\-debug Enable debug output.
\& \-v \-\-verbose Enable verbose output.
\& \-o \-\-outdir <dir> Set output dir (default: current directory)
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-f \-\-force Download images even if they already exist.
\& \-m \-\-manual Show manual.
\& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
This tool can be used to backup ads on the german ad page <https://kleinanzeigen.de>.
.PP
It downloads all (or only the specified ones) ads of one user into a
directory, each ad into its own subdirectory. The backup will contain
a textfile \fBAdlisting.txt\fR which contains the ad contents such as
title, body, price etc. All images will be downloaded as well.
.SH "CONFIGURATION"
.IX Header "CONFIGURATION"
You can create a config file to save typing. By default
\&\f(CW\*(C`~/.kleingebaeck\*(C'\fR is being used but you can specify one with \f(CW\*(C`\-c\*(C'\fR as
well. We use \s-1TOML\s0 as our configuration language. See
<https://toml.io/en/>.
.PP
Format is pretty simple:
.PP
.Vb 10
\& user = 1010101
\& loglevel = verbose
\& outdir = "test"
\& template = """
\& Title: {{.Title}}
\& Price: {{.Price}}
\& Id: {{.Id}}
\& Category: {{.Category}}
\& Condition: {{.Condition}}
\& Created: {{.Created}}
\&
\& {{.Text}}
\& """
.Ve
.PP
Be carefull if you want to change the template. The variable is a
multiline string surrounded by three double quotes. You can left out
certain fields and use any formatting you like. Refer to
<https://pkg.go.dev/text/template> for details how to write a
template.
.PP
If you're on windows and want to customize the output directory, put
it into single quotes to avoid the backslashes interpreted as escape
chars like this:
.PP
.Vb 1
\& outdir = \*(AqC:\eData\eAds\*(Aq
.Ve
.SH "SETUP"
.IX Header "SETUP"
To setup the tool, you need to lookup your userid on
kleinanzeigen.de. Go to your ad overview page while \s-1NOT\s0 being logged
in:
.PP
.Vb 1
\& https://www.kleinanzeigen.de/s\-bestandsliste.html?userId=XXXXXX
.Ve
.PP
The \fB\s-1XXXXX\s0\fR part is your userid.
.PP
Put it into the configfile as outlined above. Also specify an output
directory. Then just execute \f(CW\*(C`kleingebaeck\*(C'\fR.
.PP
You can use the \fB\-v\fR option to get verbose output or \fB\-d\fR to enable
debugging.
.SH "ENVIRONMENT VARIABLES"
.IX Header "ENVIRONMENT VARIABLES"
The following environment variables are considered:
.PP
.Vb 7
\& KLEINGEBAECK_USER
\& KLEINGEBAECK_DEBUG
\& KLEINGEBAECK_VERBOSE
\& KLEINGEBAECK_OUTDIR
\& KLEINGEBAECK_LIMIT
\& KLEINGEBAECK_CONFIG
\& KLEINGEBAECK_IGNOREERRORS
.Ve
.PP
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
.SH "BUGS"
.IX Header "BUGS"
In order to report a bug, unexpected behavior, feature requests
or to submit a patch, please open an issue on github:
<https://github.com/TLINDEN/kleingebaeck/issues>.
.PP
Please repeat the failing command with debugging enabled \f(CW\*(C`\-d\*(C'\fR and
include the output in the issue.
.SH "LIMITATIONS"
.IX Header "LIMITATIONS"
The \f(CW\*(C`kleingebaeck\*(C'\fR doesn't currently check if it has downloaded a
file already, so it downloads everything again every time you execute
it. Be aware of it. This will change in the future.
.PP
Also there's currently no parallelization implemented. This will
change in the future.
.SH "LICENSE"
.IX Header "LICENSE"
Copyright 2023\-2024 Thomas von Dein
.PP
This program is free software: you can redistribute it and/or modify
it under the terms of the \s-1GNU\s0 General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
.PP
This program is distributed in the hope that it will be useful,
but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE.\s0 See the
\&\s-1GNU\s0 General Public License for more details.
.PP
You should have received a copy of the \s-1GNU\s0 General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.SH "Author"
.IX Header "Author"
T.v.Dein <tom \s-1AT\s0 vondein \s-1DOT\s0 org>

127
kleingebaeck.go Normal file
View File

@@ -0,0 +1,127 @@
package main
var manpage = `
NAME
kleingebaeck - kleinanzeigen.de backup tool
SYNOPSYS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
DESCRIPTION
This tool can be used to backup ads on the german ad page
<https://kleinanzeigen.de>.
It downloads all (or only the specified ones) ads of one user into a
directory, each ad into its own subdirectory. The backup will contain a
textfile Adlisting.txt which contains the ad contents such as title,
body, price etc. All images will be downloaded as well.
CONFIGURATION
You can create a config file to save typing. By default
"~/.kleingebaeck" is being used but you can specify one with "-c" as
well. We use TOML as our configuration language. See
<https://toml.io/en/>.
Format is pretty simple:
user = 1010101
loglevel = verbose
outdir = "test"
template = """
Title: {{.Title}}
Price: {{.Price}}
Id: {{.Id}}
Category: {{.Category}}
Condition: {{.Condition}}
Created: {{.Created}}
{{.Text}}
"""
Be carefull if you want to change the template. The variable is a
multiline string surrounded by three double quotes. You can left out
certain fields and use any formatting you like. Refer to
<https://pkg.go.dev/text/template> for details how to write a template.
If you're on windows and want to customize the output directory, put it
into single quotes to avoid the backslashes interpreted as escape chars
like this:
outdir = 'C:\Data\Ads'
SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de.
Go to your ad overview page while NOT being logged in:
https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
The XXXXX part is your userid.
Put it into the configfile as outlined above. Also specify an output
directory. Then just execute "kleingebaeck".
You can use the -v option to get verbose output or -d to enable
debugging.
ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but commandline
flags take precedence over env!
BUGS
In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github:
<https://github.com/TLINDEN/kleingebaeck/issues>.
Please repeat the failing command with debugging enabled "-d" and
include the output in the issue.
LIMITATIONS
The "kleingebaeck" doesn't currently check if it has downloaded a file
already, so it downloads everything again every time you execute it. Be
aware of it. This will change in the future.
Also there's currently no parallelization implemented. This will change
in the future.
LICENSE
Copyright 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
Author
T.v.Dein <tom AT vondein DOT org>
`

139
kleingebaeck.pod Normal file
View File

@@ -0,0 +1,139 @@
=head1 NAME
kleingebaeck - kleinanzeigen.de backup tool
=head1 SYNOPSYS
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
Options:
-u --user <uid> Backup ads from user with uid <uid>.
-d --debug Enable debug output.
-v --verbose Enable verbose output.
-o --outdir <dir> Set output dir (default: current directory)
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
=head1 DESCRIPTION
This tool can be used to backup ads on the german ad page L<https://kleinanzeigen.de>.
It downloads all (or only the specified ones) ads of one user into a
directory, each ad into its own subdirectory. The backup will contain
a textfile B<Adlisting.txt> which contains the ad contents such as
title, body, price etc. All images will be downloaded as well.
=head1 CONFIGURATION
You can create a config file to save typing. By default
C<~/.kleingebaeck> is being used but you can specify one with C<-c> as
well. We use TOML as our configuration language. See
L<https://toml.io/en/>.
Format is pretty simple:
user = 1010101
loglevel = verbose
outdir = "test"
template = """
Title: {{.Title}}
Price: {{.Price}}
Id: {{.Id}}
Category: {{.Category}}
Condition: {{.Condition}}
Created: {{.Created}}
{{.Text}}
"""
Be carefull if you want to change the template. The variable is a
multiline string surrounded by three double quotes. You can left out
certain fields and use any formatting you like. Refer to
L<https://pkg.go.dev/text/template> for details how to write a
template.
If you're on windows and want to customize the output directory, put
it into single quotes to avoid the backslashes interpreted as escape
chars like this:
outdir = 'C:\Data\Ads'
=head1 SETUP
To setup the tool, you need to lookup your userid on
kleinanzeigen.de. Go to your ad overview page while NOT being logged
in:
https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
The B<XXXXX> part is your userid.
Put it into the configfile as outlined above. Also specify an output
directory. Then just execute C<kleingebaeck>.
You can use the B<-v> option to get verbose output or B<-d> to enable
debugging.
=head1 ENVIRONMENT VARIABLES
The following environment variables are considered:
KLEINGEBAECK_USER
KLEINGEBAECK_DEBUG
KLEINGEBAECK_VERBOSE
KLEINGEBAECK_OUTDIR
KLEINGEBAECK_LIMIT
KLEINGEBAECK_CONFIG
KLEINGEBAECK_IGNOREERRORS
Please note, that they take precedence over config file, but
commandline flags take precedence over env!
=head1 BUGS
In order to report a bug, unexpected behavior, feature requests
or to submit a patch, please open an issue on github:
L<https://github.com/TLINDEN/kleingebaeck/issues>.
Please repeat the failing command with debugging enabled C<-d> and
include the output in the issue.
=head1 LIMITATIONS
The C<kleingebaeck> doesn't currently check if it has downloaded a
file already, so it downloads everything again every time you execute
it. Be aware of it. This will change in the future.
Also there's currently no parallelization implemented. This will
change in the future.
=head1 LICENSE
Copyright 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=head1 Author
T.v.Dein <tom AT vondein DOT org>
=cut

151
main.go Normal file
View File

@@ -0,0 +1,151 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"errors"
"fmt"
"io"
"log/slog"
"os"
"runtime/debug"
"github.com/lmittmann/tint"
"github.com/tlinden/yadu"
)
const LevelNotice = slog.Level(2)
func main() {
os.Exit(Main(os.Stdout))
}
func Main(w io.Writer) int {
logLevel := &slog.LevelVar{}
opts := &tint.Options{
Level: logLevel,
AddSource: false,
ReplaceAttr: func(groups []string, a slog.Attr) slog.Attr {
// Remove time from the output
if a.Key == slog.TimeKey {
return slog.Attr{}
}
return a
},
NoColor: IsNoTty(),
}
logLevel.Set(LevelNotice)
handler := tint.NewHandler(w, opts)
logger := slog.New(handler)
slog.SetDefault(logger)
conf, err := InitConfig(w)
if err != nil {
return Die(err)
}
if conf.Showversion {
fmt.Fprintf(w, "This is kleingebaeck version %s\n", VERSION)
return 0
}
if conf.Showhelp {
fmt.Fprintln(w, Usage)
return 0
}
if conf.Showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
if conf.Verbose {
logLevel.Set(slog.LevelInfo)
}
if conf.Debug {
// we're using a more verbose logger in debug mode
buildInfo, _ := debug.ReadBuildInfo()
opts := &yadu.Options{
Level: logLevel,
AddSource: true,
//NoColor: IsNoTty(),
}
logLevel.Set(slog.LevelDebug)
handler := yadu.NewHandler(w, opts)
debuglogger := slog.New(handler).With(
slog.Group("program_info",
slog.Int("pid", os.Getpid()),
slog.String("go_version", buildInfo.GoVersion),
),
)
slog.SetDefault(debuglogger)
}
slog.Debug("config", "conf", conf)
// prepare output dir
err = Mkdir(conf.Outdir)
if err != nil {
return Die(err)
}
// used for all HTTP requests
fetch := NewFetcher(conf)
if len(conf.Adlinks) >= 1 {
// directly backup ad listing[s]
for _, uri := range conf.Adlinks {
err := ScrapeAd(fetch, uri)
if err != nil {
return Die(err)
}
}
} else if conf.User > 0 {
// backup all ads of the given user (via config or cmdline)
err := ScrapeUser(fetch)
if err != nil {
return Die(err)
}
} else {
return Die(errors.New("invalid or no user id or no ad link specified"))
}
if conf.StatsCountAds > 0 {
adstr := "ads"
if conf.StatsCountAds == 1 {
adstr = "ad"
}
fmt.Fprintf(w, "Successfully downloaded %d %s with %d images to %s.\n",
conf.StatsCountAds, adstr, conf.StatsCountImages, conf.Outdir)
} else {
fmt.Fprintf(w, "No ads found.")
}
return 0
}
func Die(err error) int {
slog.Error("Failure", "error", err.Error())
return 1
}

555
main_test.go Normal file
View File

@@ -0,0 +1,555 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"errors"
"fmt"
"os"
"strings"
"testing"
tpl "text/template"
"github.com/jarcoal/httpmock"
)
// the ad list, aka:
// https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
// Note, that this HTML code is reduced to the max, so that it only
// contains the stuff required to satisfy goquery
const LISTTPL string = `<!DOCTYPE html>
<html lang="de" >
<head>
<title>Ads</title>
</head>
<body>
{{ range . }}
<h2 class="text-module-begin">
<a class="ellipsis"
href="/s-anzeige/{{ .Slug }}/{{ .Id }}">{{ .Title }}</a>
</h2>
{{ end }}
</body>
</html>
`
// an actual ad listing, aka:
// https://www.kleinanzeigen.de/s-anzeige/ad-text-slug/1010101010
// Note, that this HTML code is reduced to the max, so that it only
// contains the stuff required to satisfy goquery
const ADTPL string = `DOCTYPE html>
<html lang="de">
<head>
<title>Ad Listing</title>
</head>
<body>
<div class="l-container-row">
<div id="vap-brdcrmb" class="breadcrump">
<a class="breadcrump-link" itemprop="url" href="/" title="Kleinanzeigen ">
<span itemprop="title">Kleinanzeigen </span>
</a>
<a class="breadcrump-link" itemprop="url" href="/egal">
<span itemprop="title">{{ .Category }}</span></a>
</div>
</div>
{{ range $image := .Images }}
<div class="galleryimage-element" data-ix="3">
<img src="{{ $image }}"/>
</div>
{{ end }}
<h1 id="viewad-title" class="boxedarticle--title" itemprop="name" data-soldlabel="Verkauft">
{{ .Title }}</h1>
<div class="boxedarticle--flex--container">
<h2 class="boxedarticle--price" id="viewad-price">
{{ .Price }}</h2>
</div>
<div id="viewad-extra-info" class="boxedarticle--details--full">
<div><i class="icon icon-small icon-calendar-gray-simple"></i><span>{{ .Created }}</span></div>
</div>
<div class="splitlinebox l-container-row" id="viewad-details">
<ul class="addetailslist">
<li class="addetailslist--detail">
Zustand<span class="addetailslist--detail--value" >
{{ .Condition }}</span>
</li>
</ul>
</div>
<div class="l-container last-paragraph-no-margin-bottom">
<p id="viewad-description-text" class="text-force-linebreak " itemprop="description">
{{ .Text }}
</p>
</div>
</body>
</html>
`
const EMPTYPAGE string = `DOCTYPE html>
<html lang="de">
<head></head>
<body></body>
</html>
`
const (
EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`
INVALID503URI string = `https://www.kleinanzeigen.de/s-anzeige/503/1`
INVALIDPATHURI string = `https://www.kleinanzeigen.de/anzeige/name/1`
INVALID404URI string = `https://www.kleinanzeigen.de/anzeige/name/1/foo/bar`
INVALIDURI string = `https://foo.bar/weird/things`
)
var base = "kleingebaeck -c t/config-empty.conf"
type Tests struct {
name string
args string
expect string
exitcode int
}
var tests = []Tests{
{
name: "version",
args: base + " -V",
expect: "This is",
exitcode: 0,
},
{
name: "help",
args: base + " -h",
expect: "Usage:",
exitcode: 0,
},
{
name: "debug",
args: base + " -d",
expect: "error: invalid or no user id or no ad link specified",
exitcode: 1,
},
{
name: "debug-check-programinfo",
args: base + " -d",
expect: "pid:",
exitcode: 1,
},
{
name: "no-args-no-user",
args: base,
expect: "invalid or no user id",
exitcode: 1,
},
{
name: "download-single-ad",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1",
expect: "Successfully downloaded 1 ad with 2 images to t/out",
exitcode: 0,
},
{
name: "download-single-ad-verbose",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -v",
expect: "wrote ad listing",
exitcode: 0,
},
{
name: "download-single-ad-debug",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -d",
expect: "DEBUG: extracted ad listing",
exitcode: 0,
},
{
name: "download-all-ads",
args: base + " -o t/out -u 1",
expect: "Successfully downloaded 6 ads with 12 images to t/out",
exitcode: 0,
},
{
name: "download-all-ads-using-config",
args: "kleingebaeck -c t/fullconfig.conf",
expect: "Successfully downloaded 6 ads with 12 images to t/out",
exitcode: 0,
},
}
var invalidtests = []Tests{
{
name: "empty-ad",
args: base + " " + EMPTYURI,
expect: "could not extract ad data from page, got empty struct",
exitcode: 1,
},
{
name: "invalid-ad",
args: base + " " + INVALIDURI,
expect: "invalid uri",
exitcode: 1,
},
{
name: "invalid-path",
args: base + " " + INVALIDPATHURI,
expect: "could not extract ad data from page, got empty struct",
exitcode: 1,
},
{
name: "404",
args: base + " " + INVALID404URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
{
name: "outdir-no-exists",
args: base + " -o t/foo/bar/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -v",
expect: "Failure",
exitcode: 1,
},
{
name: "wrong-flag",
args: base + " -X",
expect: "unknown shorthand flag: 'X' in -X",
exitcode: 1,
},
{
name: "no-config",
args: "kleingebaeck -c t/invalid.conf",
expect: "error loading config file",
exitcode: 1,
},
{
name: "503",
args: base + " " + INVALID503URI,
expect: "could not get page via HTTP",
exitcode: 1,
},
}
type AdConfig struct {
Title string
Slug string
Id string
Price string
Category string
Condition string
Created string
Text string
Images []string // files in ./t/
}
var adsrc = []AdConfig{
{
Title: "First Ad",
Id: "1", Price: "5€",
Category: "Klimbim",
Text: "Thing to sale",
Slug: "first-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Secnd Ad",
Id: "2", Price: "5€",
Category: "Kram",
Text: "Thing to sale",
Slug: "second-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Third Ad",
Id: "3",
Price: "5€",
Category: "Kuddelmuddel",
Text: "Thing to sale",
Slug: "third-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Forth Ad",
Id: "4",
Price: "5€",
Category: "Krempel",
Text: "Thing to sale",
Slug: "fourth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Fifth Ad",
Id: "5",
Price: "5€",
Category: "Kladderadatsch",
Text: "Thing to sale",
Slug: "fifth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
{
Title: "Sixth Ad",
Id: "6",
Price: "5€",
Category: "Klunker",
Text: "Thing to sale",
Slug: "sixth-ad",
Condition: "works",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
}
// An Adsource is used to construct a httpmock responder for a
// particular url. So, the code (scrape.go) scrapes
// https://kleinanzeigen.de, but in reality httpmock captures the
// request and responds with our mock data
type Adsource struct {
uri string
content string
status int
}
// Render a HTML template for an adlisting or an ad
func GetTemplate(l []AdConfig, a AdConfig, htmltemplate string) string {
tmpl, err := tpl.New("template").Parse(htmltemplate)
if err != nil {
panic(err)
}
var out bytes.Buffer
if len(a.Id) == 0 {
err = tmpl.Execute(&out, l)
} else {
err = tmpl.Execute(&out, a)
}
if err != nil {
panic(err)
}
return out.String()
}
// Initialize the valid sources for the httpmock responder
func InitValidSources() []Adsource {
// valid ad listing page 1
list1 := []AdConfig{
adsrc[0], adsrc[1], adsrc[2],
}
// valid ad listing page 2
list2 := []AdConfig{
adsrc[3], adsrc[4], adsrc[5],
}
// valid ad listing page 3, which is empty
list3 := []AdConfig{}
// used to signal GetTemplate() to render a listing
empty := AdConfig{}
// prepare urls for the listing pages
ads := []Adsource{
{
uri: fmt.Sprintf("%s%s?userId=1", Baseuri, Listuri),
content: GetTemplate(list1, empty, LISTTPL),
},
{
uri: fmt.Sprintf("%s%s?userId=1&pageNum=2", Baseuri, Listuri),
content: GetTemplate(list2, empty, LISTTPL),
},
{
uri: fmt.Sprintf("%s%s?userId=1&pageNum=3", Baseuri, Listuri),
content: GetTemplate(list3, empty, LISTTPL),
},
}
// prepare urls for the ads
for _, ad := range adsrc {
ads = append(ads, Adsource{
uri: fmt.Sprintf("%s/s-anzeige/%s/%s", Baseuri, ad.Slug, ad.Id),
content: GetTemplate(nil, ad, ADTPL),
})
//panic(GetTemplate(nil, ad, ADTPL))
}
return ads
}
func InitInvalidSources() []Adsource {
empty := AdConfig{}
ads := []Adsource{
{
// valid ad page but without content
uri: fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri),
content: GetTemplate(nil, empty, EMPTYPAGE),
},
{
// some random foreign webpage
uri: INVALIDURI,
content: GetTemplate(nil, empty, "<html>foo</html>"),
},
{
// some invalid page path
uri: fmt.Sprintf("%s/anzeige/name/1", Baseuri),
content: GetTemplate(nil, empty, "<html></html>"),
},
{
// some none-ad page
uri: fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
status: 404,
},
{
// valid ad page but 503
uri: fmt.Sprintf("%s/s-anzeige/503/1", Baseuri),
content: GetTemplate(nil, empty, "<html>HTTP 503: service unavailable</html>"),
status: 503,
},
}
return ads
}
// load a test image from disk
func GetImage(path string) []byte {
dat, err := os.ReadFile(path)
if err != nil {
panic(err)
}
return dat
}
// setup httpmock
func SetIntercept(ads []Adsource) {
for _, ad := range ads {
if ad.status == 0 {
ad.status = 200
}
httpmock.RegisterResponder("GET", ad.uri,
httpmock.NewStringResponder(ad.status, ad.content))
}
// we just use 2 images, put this here
for _, image := range []string{"t/1.jpg", "t/2.jpg"} {
httpmock.RegisterResponder("GET", image,
httpmock.NewBytesResponder(200, GetImage(image)))
}
}
func VerifyAd(ad AdConfig) error {
body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " +
ad.Category + ad.Condition + ad.Created
// prepare ad dir name using DefaultAdNameTemplate
c := Config{Adnametemplate: "{{ .Slug }}"}
adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
addir, err := AdDirName(&c, &adstruct)
if err != nil {
return err
}
file := fmt.Sprintf("t/out/%s/Adlisting.txt", addir)
content, err := os.ReadFile(file)
if err != nil {
return err
}
if body != strings.TrimSpace(string(content)) {
msg := fmt.Sprintf("ad content doesn't match.\nExpect: %s\n Got: %s\n", body, content)
return errors.New(msg)
}
return nil
}
func TestMain(t *testing.T) {
oldargs := os.Args
defer func() { os.Args = oldargs }()
httpmock.Activate()
defer httpmock.DeactivateAndReset()
// prepare httpmock responders
SetIntercept(InitValidSources())
// run commandline tests
for _, tt := range tests {
var buf bytes.Buffer
os.Args = strings.Split(tt.args, " ")
ret := Main(&buf)
if ret != tt.exitcode {
t.Errorf("%s with cmd <%s> did not exit with %d but %d",
tt.name, tt.args, tt.exitcode, ret)
}
if !strings.Contains(buf.String(), tt.expect) {
t.Errorf("%s with cmd <%s> output did not match.\nExpect: %s\n Got: %s\n",
tt.name, tt.args, tt.expect, buf.String())
}
}
// verify if downloaded ads match
for _, ad := range adsrc {
if err := VerifyAd(ad); err != nil {
t.Errorf(err.Error())
}
}
}
func TestMainInvalids(t *testing.T) {
oldargs := os.Args
defer func() { os.Args = oldargs }()
httpmock.Activate()
defer httpmock.DeactivateAndReset()
// prepare httpmock responders
SetIntercept(InitInvalidSources())
// run commandline tests
for _, tt := range invalidtests {
var buf bytes.Buffer
os.Args = strings.Split(tt.args, " ")
ret := Main(&buf)
if ret != tt.exitcode {
t.Errorf("%s with cmd <%s> did not exit with %d but %d",
tt.name, tt.args, tt.exitcode, ret)
}
if !strings.Contains(buf.String(), tt.expect) {
t.Errorf("%s with cmd <%s> output did not match.\nExpect: %s\n Got: %s\n",
tt.name, tt.args, tt.expect, buf.String())
}
}
}

70
mkrel.sh Executable file
View File

@@ -0,0 +1,70 @@
#!/bin/bash
# Copyright © 2023 Thomas von Dein
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# get list with: go tool dist list
DIST="darwin/amd64
freebsd/amd64
linux/amd64
netbsd/amd64
openbsd/amd64
windows/amd64"
tool="$1"
version="$2"
if test -z "$version"; then
echo "Usage: $0 <tool name> <release version>"
exit 1
fi
rm -rf releases
mkdir -p releases
for D in $DIST; do
os=${D/\/*/}
arch=${D/*\//}
binfile="releases/${tool}-${os}-${arch}-${version}"
if test "$os" = "windows"; then
binfile="${binfile}.exe"
fi
tardir="${tool}-${os}-${arch}-${version}"
tarfile="releases/${tool}-${os}-${arch}-${version}.tar.gz"
set -x
GOOS=${os} GOARCH=${arch} go build -tags osusergo,netgo -ldflags "-extldflags=-static" -o ${binfile}
mkdir -p ${tardir}
cp ${binfile} README.md LICENSE ${tardir}/
echo 'tool = kleingebaeck
PREFIX = /usr/local
UID = root
GID = 0
install:
install -d -o $(UID) -g $(GID) $(PREFIX)/bin
install -d -o $(UID) -g $(GID) $(PREFIX)/man/man1
install -o $(UID) -g $(GID) -m 555 $(tool) $(PREFIX)/sbin/
install -o $(UID) -g $(GID) -m 444 $(tool).1 $(PREFIX)/man/man1/' > ${tardir}/Makefile
tar cpzf ${tarfile} ${tardir}
sha256sum ${binfile} | cut -d' ' -f1 > ${binfile}.sha256
sha256sum ${tarfile} | cut -d' ' -f1 > ${tarfile}.sha256
rm -rf ${tardir}
set +x
done

197
scrape.go Normal file
View File

@@ -0,0 +1,197 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"errors"
"fmt"
"log/slog"
"path/filepath"
"strings"
"astuart.co/goq"
"golang.org/x/sync/errgroup"
)
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func ScrapeUser(fetch *Fetcher) error {
adlinks := []string{}
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)
page := 1
uri := baseuri
slog.Info("fetching ad pages", "user", fetch.Config.User)
for {
var index Index
slog.Debug("fetching page", "uri", uri)
body, err := fetch.Get(uri)
if err != nil {
return err
}
defer body.Close()
err = goq.NewDecoder(body).Decode(&index)
if err != nil {
return err
}
if len(index.Links) == 0 {
break
}
slog.Debug("extracted ad links", "count", len(index.Links))
for _, href := range index.Links {
adlinks = append(adlinks, href)
slog.Debug("ad link", "href", href)
}
page++
uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)
}
for i, adlink := range adlinks {
err := ScrapeAd(fetch, Baseuri+adlink)
if err != nil {
return err
}
if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
break
}
}
return nil
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func ScrapeAd(fetch *Fetcher, uri string) error {
ad := &Ad{}
// extract slug and id from uri
uriparts := strings.Split(uri, "/")
if len(uriparts) < 6 {
return errors.New("invalid uri: " + uri)
}
ad.Slug = uriparts[4]
ad.Id = uriparts[5]
// get the ad
slog.Debug("fetching ad page", "uri", uri)
body, err := fetch.Get(uri)
if err != nil {
return err
}
defer body.Close()
// extract ad contents with goquery/goq
err = goq.NewDecoder(body).Decode(&ad)
if err != nil {
return err
}
if len(ad.CategoryTree) > 0 {
ad.Category = strings.Join(ad.CategoryTree, " => ")
}
if ad.Incomplete() {
slog.Debug("got ad", "ad", ad)
return errors.New("could not extract ad data from page, got empty struct")
}
ad.CalculateExpire()
// write listing
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
slog.Debug("extracted ad listing", "ad", ad)
fetch.Config.IncrAds()
return ScrapeImages(fetch, ad, addir)
}
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
adpath := filepath.Join(fetch.Config.Outdir, addir)
// scan existing images, if any
cache, err := ReadImages(adpath, fetch.Config.ForceDownload)
if err != nil {
return err
}
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
buf := new(bytes.Buffer)
_, err = buf.ReadFrom(body)
if err != nil {
return err
}
buf2 := buf.Bytes() // needed for image writing
image := NewImage(buf, "", imguri)
err = image.CalcHash()
if err != nil {
return err
}
if !fetch.Config.ForceDownload {
if image.SimilarExists(cache) {
slog.Debug("similar image exists, not written", "uri", image.Uri)
return nil
}
}
err = WriteImage(file, buf2)
if err != nil {
return err
}
slog.Debug("wrote image", "image", image, "size", len(buf2))
return nil
})
img++
}
if err := g.Wait(); err != nil {
return err
}
fetch.Config.IncrImgs(len(ad.Images))
return nil
}

131
store.go Normal file
View File

@@ -0,0 +1,131 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
tpl "text/template"
)
func AdDirName(c *Config, ad *Ad) (string, error) {
tmpl, err := tpl.New("adname").Parse(c.Adnametemplate)
if err != nil {
return "", err
}
buf := bytes.Buffer{}
err = tmpl.Execute(&buf, ad)
if err != nil {
return "", err
}
return buf.String(), nil
}
func WriteAd(c *Config, ad *Ad) (string, error) {
// prepare ad dir name
addir, err := AdDirName(c, ad)
if err != nil {
return "", err
}
// prepare output dir
dir := filepath.Join(c.Outdir, addir)
err = Mkdir(dir)
if err != nil {
return "", err
}
// write ad file
listingfile := filepath.Join(dir, "Adlisting.txt")
f, err := os.Create(listingfile)
if err != nil {
return "", err
}
defer f.Close()
if runtime.GOOS == "windows" {
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\r\n")
} else {
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
}
tmpl, err := tpl.New("adlisting").Parse(c.Template)
if err != nil {
return "", err
}
err = tmpl.Execute(f, ad)
if err != nil {
return "", err
}
slog.Info("wrote ad listing", "listingfile", listingfile)
return addir, nil
}
func WriteImage(filename string, buf []byte) error {
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = file.Write(buf)
if err != nil {
return err
}
return nil
}
func ReadImage(filename string) (*bytes.Buffer, error) {
var buf bytes.Buffer
if !fileExists(filename) {
return nil, fmt.Errorf("image %s does not exist", filename)
}
data, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
_, err = buf.Write(data)
if err != nil {
return nil, err
}
return &buf, nil
}
func fileExists(filename string) bool {
info, err := os.Stat(filename)
if os.IsNotExist(err) {
return false
}
return !info.IsDir()
}

BIN
t/1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1001 B

BIN
t/2.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1002 B

6
t/config-empty.conf Normal file
View File

@@ -0,0 +1,6 @@
# empty config for Main() unit tests to force unit tests NOT to use an
# eventually existing ~/.kleingebaeck!
template="""
{{.Title}}{{.Price}}{{.Id}}{{.Category}}{{.Condition}}{{.Created}}
"""

6
t/fullconfig.conf Normal file
View File

@@ -0,0 +1,6 @@
user = 1
loglevel = "verbose"
outdir = "t/out"
template="""
{{.Title}}{{.Price}}{{.Id}}{{.Category}}{{.Condition}}{{.Created}}
"""

13
t/httproot/README.md Normal file
View File

@@ -0,0 +1,13 @@
# Mock http server
Install ehfs from https://github.com/mjpclab/extra-http-file-server/.
Install p2cli from https://github.com/wrouesnel/p2cli.
Run `templates/render.sh` to build the file structure.
Run `server.sh` to start the http server.
To scrape an ad from it, use such a URL:
http://localhost:8080/s-anzeige/first-ad/111-11-111

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

4
t/httproot/serve.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/sh
ehfs -a :/s-anzeige:./kleinanzeigen \
-a :/api/v1/prod-ads/images/fc:./img \
-l localhost:8080 -I index.html

View File

@@ -0,0 +1,50 @@
<!DOCTYPE html>
<html lang="de">
<head>
<title>Ad Listing</title>
</head>
<body>
<div class="l-container-row">
<div id="vap-brdcrmb" class="breadcrump">
<a class="breadcrump-link" itemprop="url" href="/" title="Kleinanzeigen ">
<span itemprop="title">Kleinanzeigen </span>
</a>
<a class="breadcrump-link" itemprop="url" href="/egal">
<span itemprop="title">{{ category }}</span></a>
</div>
</div>
{% for image in images %}
<div class="galleryimage-element" data-ix="3">
<img src="http://localhost:8080/api/v1/prod-ads/images/fc/{{ image.id }}?rule=$_59.JPG"/>
</div>
{% endfor %}
<h1 id="viewad-title" class="boxedarticle--title" itemprop="name" data-soldlabel="Verkauft">
{{ title }}</h1>
<div class="boxedarticle--flex--container">
<h2 class="boxedarticle--price" id="viewad-price">
{{ price }}</h2>
</div>
<div id="viewad-extra-info" class="boxedarticle--details--full">
<div><i class="icon icon-small icon-calendar-gray-simple"></i><span>{{ created }}</span></div>
</div>
<div class="splitlinebox l-container-row" id="viewad-details">
<ul class="addetailslist">
<li class="addetailslist--detail">
Zustand<span class="addetailslist--detail--value" >
{{ condition }}</span>
</li>
</ul>
</div>
<div class="l-container last-paragraph-no-margin-bottom">
<p id="viewad-description-text" class="text-force-linebreak " itemprop="description">
{{ text }}
</p>
</div>
</body>
</html>

View File

@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="de" >
<head>
<title>Ads</title>
</head>
<body>
{% for ad in ads %}
<h2 class="text-module-begin">
<a class="ellipsis"
href="/s-anzeige/{{ ad.slug }}/{{ ad.id }}">{{ ad.title }}</a>
</h2>
{% endfor %}
</body>
</html>

13
t/httproot/templates/render.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/sh -x
base="../kleinanzeigen"
mkdir -p $base
echo "Generating /s-bestandsliste.html"
p2cli -t index.tpl -i vars.yaml > $base/s-bestandsliste.html
for idx in 0 1; do
slug=$(cat vars.yaml | yq ".ads[$idx].slug")
id=$(cat vars.yaml | yq ".ads[$idx].id")
mkdir -p $base/$slug/$id
cat vars.yaml | yq ".ads[$idx]" | p2cli -t ad.tpl -f yaml > $base/$slug/$id/index.html
done

View File

@@ -0,0 +1,27 @@
ads:
- slug: first-ad
id: 111-11-111
title: First Ad
price: "19 €"
condition: "Sehr gut"
category: "Weitere Elektronik"
created: 21.12.2023
images:
- id: fcf6d664-5258-42c2-bf58-d1b8e9221574
- id: fcf6d664-5258-42c2-bf58-as43as5d43as
text: |
Zu Verkaufen.
Zahlung nur Paypal.
- slug: second-ad
id: 222-22-222
title: Second Ad
price: "200 €"
condition: "Sehr gut"
category: "Elektronik"
created: 21.12.2023
images:
- id: cdas4sd5-5258-42c2-bf58-d1b8e9221574
- id: cdas4sd5-5258-42c2-bf58-as43as5d43as
text: |
Zu Verkaufen.
Zahlung nur Überweisung.

1
t/invalid.conf Normal file
View File

@@ -0,0 +1 @@
user = "

68
util.go Normal file
View File

@@ -0,0 +1,68 @@
/*
Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"errors"
"os"
"os/exec"
"runtime"
"github.com/mattn/go-isatty"
)
func Mkdir(dir string) error {
if _, err := os.Stat(dir); errors.Is(err, os.ErrNotExist) {
err := os.Mkdir(dir, os.ModePerm)
if err != nil {
return err
}
}
return nil
}
func man() error {
man := exec.Command("less", "-")
var b bytes.Buffer
b.Write([]byte(manpage))
man.Stdout = os.Stdout
man.Stdin = &b
man.Stderr = os.Stderr
err := man.Run()
if err != nil {
return err
}
return nil
}
// returns TRUE if stdout is NOT a tty or windows
func IsNoTty() bool {
if runtime.GOOS == "windows" || !isatty.IsTerminal(os.Stdout.Fd()) {
return true
}
// it is a tty
return false
}