Compare commits

..

44 Commits

Author SHA1 Message Date
0726b95fd4 added docker instructions 2024-01-17 14:15:46 +01:00
21ca5c626f added environment variable support 2024-01-17 14:03:43 +01:00
7e2161d4d4 no bak 2024-01-17 13:58:21 +01:00
ef4d2dab16 added docker image support 2024-01-17 13:58:09 +01:00
T.v.Dein
3fd75fa53d refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00
T.v.Dein
78e5de61d2 Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
2024-01-16 13:20:16 +01:00
T.v.Dein
f4a9a9895c Enhancement/http (#32)
* added HTTP debug logging using `-d` or `DEBUGHTTP=1` (headers only)
2024-01-16 13:20:16 +01:00
T.v.Dein
ac5b0608d8 fix #30: revert default adnamedir to just use the slug as before (#31) 2024-01-16 13:20:16 +01:00
T.v.Dein
239e253057 Make ad directory name tunable, adapt to kleinanzeigen.de site changes
Add template for ad name, adapt kleinanzeigen.de changes
2024-01-12 14:56:08 +01:00
cdf58efd45 fixed changes on kleinanzeigen.de:
- Meta did not contain condition and category together anymore, they
removed  the category. Therefore fetching (that is, validation)
failed.
- Now we extract the condition and category directly.
- On top, category now includes the whole category tree.
- unit tests had to be tweaked for this measure.
2024-01-12 14:11:02 +01:00
110ee17091 fixed utf8 2024-01-12 13:31:24 +01:00
8321d3c343 added template for ad directory, by default include id now 2024-01-12 13:29:59 +01:00
T.v.Dein
56f53bb777 remove duplicate license badge (#28)
* remove duplicate license badge

* fix badges
2024-01-06 18:07:33 +01:00
T.v.Dein
9e7f9a2821 Merge pull request #27 from TLINDEN/test/add-main-tests
Test/add main tests
2024-01-02 12:51:51 +01:00
577f9d983e portable error check 2024-01-02 12:36:00 +01:00
114f6b16d9 also added coverage report+badge 2024-01-02 12:31:19 +01:00
a06c730fe4 put all tests into main_test.go, more failure mode tests and verify 2024-01-02 12:22:52 +01:00
d8e968ed6d better error message on 404 2024-01-02 12:22:26 +01:00
5f450e54ea add commandline main() test units 2024-01-01 20:53:39 +01:00
3e6349cf36 pass a io.Writer to loggers and outputs so we can test the cmdline 2024-01-01 20:53:05 +01:00
T.v.Dein
bf8e074034 Merge pull request #26 from TLINDEN/test/enhancements
Enhanced error checking, added more failure tests
2024-01-01 16:27:20 +01:00
6dea8d78ed added more invalid tests 2024-01-01 16:24:43 +01:00
ea76b98445 upd httpmock+deps 2024-01-01 16:24:33 +01:00
9f688b7692 put ad code into separate file, enhance error checking 2024-01-01 16:24:07 +01:00
T.v.Dein
d8baa34c54 Test/add mock tests (#24)
* add scrape unit test using httpmock lib
2023-12-29 13:47:18 +01:00
T.v.Dein
c1cbce32e1 fix linter errors (#23) 2023-12-23 22:51:50 +01:00
T.v.Dein
dc4d3d7f9c add ci pipeline (#22)
Co-authored-by: Thomas von Dein <tom@izb.net>
2023-12-23 22:36:21 +01:00
T.v.Dein
b28f544416 Doc/add prior art (#21)
* add mor prior art
2023-12-23 22:20:39 +01:00
T.v.Dein
8cdefe457b added windows screenshots (#20)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-23 18:48:10 +01:00
T.v.Dein
1e4b406aa4 Revert "Fix/newline windows (#18)" (#19)
This reverts commit eaf4db6cef.
2023-12-23 18:00:51 +01:00
T.v.Dein
eaf4db6cef Fix/newline windows (#18)
* fix #17: use fmt.Println() after stats
* bump version
2023-12-22 18:59:08 +01:00
T.v.Dein
825649bb3b added screenshots and a section about prior work (#16) 2023-12-21 12:25:04 +01:00
T.v.Dein
6aa9c658b6 add doc link (#15)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-19 18:36:11 +01:00
T.v.Dein
2c62f9eb17 fix invalid mod load (#14)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-19 18:27:20 +01:00
T.v.Dein
bff0ae553e Bugfixes (#13)
* several fixes:

- fix #9 + #10: switched to koanf module and dropped support for HCL
- fix #11: disabling colors on windows
- fix #12: fixed race condition in go routine call inside for loop,
  images had been downloaded multiple times
- remove hcl support and use toml format (same thing, better parser)
- update documentation and example config on TOML format of config file
- use Config as arg instead of singular args
- use x/errgroup instead of sync.Waitgroup inside image download loop

---------

Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-19 18:23:41 +01:00
T.v.Dein
450d44d129 Dev (#8)
* fixed conf parsing: variables can now be omitted from the config
* fix newlines: use CRLF on windows
* bump version

---------

Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-18 20:18:37 +01:00
T.v.Dein
18f7e0fe49 added proper install instructions (#7)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2023-12-18 09:48:00 +01:00
T.v.Dein
def063afe9 Merge pull request #6 from TLINDEN/dev 2023-12-18 09:23:55 +01:00
f1908f02cb bump version 2023-12-18 09:23:18 +01:00
4a528ad9d1 fix #5: add exe extension to built windows binaries 2023-12-18 09:22:08 +01:00
5c1161f227 fix #4, use filepath.Join to create portable path's 2023-12-18 09:21:26 +01:00
bd9d8fdb2c fix version finding 2023-12-17 17:53:01 +01:00
T.v.Dein
1ee886c504 Merge pull request #2 from TLINDEN/dev
re-orgainzied code a little, using go templates instead format string
2023-12-17 17:49:27 +01:00
T.v.Dein
d7b13e8a9a Merge pull request #1 from TLINDEN/dev
added custom template support, added more ad data, use concurrency
2023-12-16 20:35:18 +01:00
14 changed files with 31 additions and 294 deletions

View File

@@ -16,6 +16,8 @@ RUN make
FROM alpine:latest
LABEL maintainer="Thomas von Dein <git@daemon.de>"
#RUN install -o 1001 -g 1001 -d /data
WORKDIR /app
COPY --from=builder /work/kleingebaeck /app/kleingebaeck

View File

@@ -106,16 +106,9 @@ USER_ID=$(id -u) GROUP_ID=$(id -g) OUTDIR=./kleinanzeigen-backup docker-compose
```
`USER_ID` and `GROUP_ID` need to be specified so that you are the
owner of the created backups. The backup directory `OUTDIR` must exist
prior to the execution, otherwise docker will create it as root, then
kleingebaeck will fail. You may also use a `.env` file in the same
directory containing the variables, such as:
```
USER_ID=1000
GROUP_ID=1000
OUTDIR=./kleinanzeigen-backup
```
owner of the created backups. The backup directory must exist prior to
the execution, otherwise docker will create it as root, then
kleingebaeck will fail.
You may of course also modify the `docker-compose.yaml` to suit your needs.

13
ad.go
View File

@@ -20,7 +20,6 @@ package main
import (
"log/slog"
"strings"
"time"
)
type Index struct {
@@ -38,7 +37,6 @@ type Ad struct {
Created string `goquery:"#viewad-extra-info,text"`
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Expire string
}
// Used by slog to pretty print an ad
@@ -51,8 +49,6 @@ func (ad *Ad) LogValue() slog.Value {
slog.Int("bodysize", len(ad.Text)),
slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
slog.String("condition", ad.Condition),
slog.String("created", ad.Created),
slog.String("expire", ad.Expire),
)
}
@@ -71,12 +67,3 @@ func (ad *Ad) Incomplete() bool {
return false
}
// CalculateExpire derives the Expire date of the ad from its Created date.
//
// Created is parsed with the layout "02.01.2006". On success Expire is set
// to that date plus two months and one day, formatted with the same
// layout. If Created is empty or cannot be parsed, Expire is left
// untouched.
func (ad *Ad) CalculateExpire() {
	if ad.Created == "" {
		return
	}

	created, err := time.Parse("02.01.2006", ad.Created)
	if err != nil {
		// unparsable creation date: nothing to calculate
		return
	}

	ad.Expire = created.AddDate(0, 2, 1).Format("02.01.2006")
}

View File

@@ -35,16 +35,14 @@ import (
)
const (
VERSION string = "0.2.0"
VERSION string = "0.1.2"
Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html"
Defaultdir string = "."
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\n" +
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
"Category: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n"
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" +
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\n" +
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}"
@@ -62,7 +60,6 @@ Options:
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
@@ -83,7 +80,6 @@ type Config struct {
Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
ForceDownload bool `koanf:"force"`
Adlinks []string
StatsCountAds int
StatsCountImages int
@@ -135,7 +131,6 @@ func InitConfig(w io.Writer) (*Config, error) {
f.BoolP("version", "V", false, "show program version")
f.BoolP("help", "h", false, "show usage")
f.BoolP("manual", "m", false, "show manual")
f.BoolP("force", "f", false, "force")
if err := f.Parse(os.Args[1:]); err != nil {
return nil, err

16
go.mod
View File

@@ -7,32 +7,26 @@ require (
github.com/jarcoal/httpmock v1.3.1
github.com/knadh/koanf/parsers/toml v0.1.0
github.com/knadh/koanf/providers/confmap v0.1.0
github.com/knadh/koanf/providers/env v0.1.0
github.com/knadh/koanf/providers/file v0.1.0
github.com/knadh/koanf/providers/posflag v0.1.0
github.com/knadh/koanf/v2 v2.0.1
github.com/lmittmann/tint v1.0.4
github.com/lmittmann/tint v1.0.3
github.com/mattn/go-isatty v0.0.20
github.com/spf13/pflag v1.0.5
github.com/tlinden/yadu v0.1.0
golang.org/x/sync v0.5.0
)
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/corona10/goimagehash v1.1.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/PuerkitoBio/goquery v1.5.0 // indirect
github.com/andybalholm/cascadia v1.0.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/knadh/koanf/providers/env v0.1.0 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.14.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
golang.org/x/sys v0.6.0 // indirect
)

30
go.sum
View File

@@ -1,18 +1,12 @@
astuart.co/goq v1.0.0 h1:nnYIhu/Z/j0VaX9Dp+pmh2Uh7ldEz6XfgSg+bAY5Yrw=
astuart.co/goq v1.0.0/go.mod h1:+fokcnFrO8Pw2fj8drdStJvzoMFebJH69rw8IC21rno=
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww=
@@ -31,11 +25,8 @@ github.com/knadh/koanf/providers/posflag v0.1.0 h1:mKJlLrKPcAP7Ootf4pBZWJ6J+4wHY
github.com/knadh/koanf/providers/posflag v0.1.0/go.mod h1:SYg03v/t8ISBNrMBRMlojH8OsKowbkXV7giIbBVgbz0=
github.com/knadh/koanf/v2 v2.0.1 h1:1dYGITt1I23x8cfx8ZnldtezdyaZtfAuRtIFOiRzK7g=
github.com/knadh/koanf/v2 v2.0.1/go.mod h1:ZeiIlIDXTE7w1lMT6UVcNiRAS2/rCeLn/GdLNvY1Dus=
github.com/lmittmann/tint v1.0.4 h1:LeYihpJ9hyGvE0w+K2okPTGUdVLfng1+nDNVR4vWISc=
github.com/lmittmann/tint v1.0.4/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/lmittmann/tint v1.0.3 h1:W5PHeA2D8bBJVvabNfQD/XW9HPLZK1XoPZH0cq8NouQ=
github.com/lmittmann/tint v1.0.3/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g=
@@ -46,8 +37,6 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -58,27 +47,18 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355 h1:EmgK+IGUz2m42bFKteLY5SYJLn/CyBrz6nkgS22K8Bk=
github.com/tlinden/yadu v0.0.0-20240118202225-ec3f0b7fc355/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
github.com/tlinden/yadu v0.1.0 h1:qtCi1jxg392qVRLFyrJ2LYu6/PiKSp1LT02EX+mNLME=
github.com/tlinden/yadu v0.1.0/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

142
image.go
View File

@@ -1,142 +0,0 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"image/jpeg"
"log/slog"
"os"
"path/filepath"
"github.com/corona10/goimagehash"
)
// MaxDistance is the exclusive upper bound of the hash distance: Similar
// treats two images as the same only if their distance is below it.
const MaxDistance = 3
// Image bundles the content of a single ad image with its hash and the
// names it is known by.
type Image struct {
// Filename is the file name handed to NewImage.
Filename string
// Hash is the difference hash of the image, set by CalcHash.
Hash *goimagehash.ImageHash
// Data is the image content; CalcHash decodes it as JPEG.
Data *bytes.Buffer
// Uri is the uri handed to NewImage.
Uri string
}
// LogValue is used by slog to log an image: it returns a group made of
// the file name, the uri and the hash, which avoids printing Data.
func (img *Image) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("filename", img.Filename),
		slog.String("uri", img.Uri),
		slog.String("hash", img.Hash.ToString()),
	}

	return slog.GroupValue(attrs...)
}
// Cache holds the hashes of all images of an ad. ReadImages fills it and
// SimilarExists compares an image against its entries.
type Cache []*goimagehash.ImageHash
// NewImage returns an Image referring to the given buffer, file name and
// uri. The Hash field is left unset; CalcHash fills it.
func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
	return &Image{
		Filename: filename,
		Uri:      uri,
		Data:     buf,
	}
}
// CalcHash decodes img.Data as JPEG and stores the difference hash of the
// decoded picture in img.Hash.
//
// The decoder reads directly from img.Data. If decoding or hashing fails,
// the error is returned as is and img.Hash is not modified.
func (img *Image) CalcHash() error {
	decoded, err := jpeg.Decode(img.Data)
	if err != nil {
		return err
	}

	diffhash, err := goimagehash.DifferenceHash(decoded)
	if err != nil {
		return err
	}

	img.Hash = diffhash

	return nil
}
// Similar reports whether the hash of img and the given hash are close
// enough for both images to be considered the same, that is, whether
// their distance is below MaxDistance.
//
// If the distance cannot be computed, the failure is logged on debug
// level and false is returned.
func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
	distance, err := img.Hash.Distance(hash)
	if err != nil {
		slog.Debug("failed to compute diff hash distance", "error", err)

		return false
	}

	if distance >= MaxDistance {
		return false
	}

	// only matches are logged
	slog.Debug("distance computation", "image-A", img.Hash.ToString(),
		"image-B", hash.ToString(), "distance", distance)

	return true
}
// SimilarExists checks the current image against all known hashes and
// reports whether at least one of them is Similar to it. An empty cache
// yields false.
func (img *Image) SimilarExists(cache Cache) bool {
	for idx := range cache {
		if img.Similar(cache[idx]) {
			return true
		}
	}

	return false
}
// ReadImages reads all JPG images in the ad directory addir, computes
// their diff hashes and returns them as a Cache.
//
// Only non-directory entries with the extension .jpg, .jpeg, .JPG or
// .JPEG are taken into account. If dont is true, an empty cache is
// returned without hashing anything; the directory is read first in
// either case, so an unreadable addir is always reported. The first image
// which cannot be read or hashed aborts the scan with its error.
func ReadImages(addir string, dont bool) (Cache, error) {
	entries, err := os.ReadDir(addir)
	if err != nil {
		return nil, err
	}

	hashes := Cache{}

	if dont {
		// forced download, -f given
		return hashes, nil
	}

	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}

		switch filepath.Ext(entry.Name()) {
		case ".jpg", ".jpeg", ".JPG", ".JPEG":
			// a JPG image, hashed below
		default:
			continue
		}

		path := filepath.Join(addir, entry.Name())

		data, err := ReadImage(path)
		if err != nil {
			return nil, err
		}

		img := NewImage(data, path, "")
		if err := img.CalcHash(); err != nil {
			return nil, err
		}

		slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())

		hashes = append(hashes, img.Hash)
	}

	return hashes, nil
}

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands"
.TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -142,7 +142,7 @@
kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS"
.IX Header "SYNOPSYS"
.Vb 10
.Vb 12
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options:
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
@@ -152,7 +152,6 @@ kleingebaeck \- kleinanzeigen.de backup tool
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-f \-\-force Download images even if they already exist.
\& \-m \-\-manual Show manual.
\& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.

View File

@@ -14,7 +14,6 @@ SYNOPSYS
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.

View File

@@ -13,7 +13,6 @@ kleingebaeck - kleinanzeigen.de backup tool
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.

View File

@@ -26,7 +26,6 @@ import (
"runtime/debug"
"github.com/lmittmann/tint"
"github.com/tlinden/yadu"
)
const LevelNotice = slog.Level(2)
@@ -85,14 +84,14 @@ func Main(w io.Writer) int {
if conf.Debug {
// we're using a more verbose logger in debug mode
buildInfo, _ := debug.ReadBuildInfo()
opts := &yadu.Options{
opts := &tint.Options{
Level: logLevel,
AddSource: true,
//NoColor: IsNoTty(),
NoColor: IsNoTty(),
}
logLevel.Set(slog.LevelDebug)
handler := yadu.NewHandler(w, opts)
handler := tint.NewHandler(w, opts)
debuglogger := slog.New(handler).With(
slog.Group("program_info",
slog.Int("pid", os.Getpid()),

View File

@@ -145,13 +145,7 @@ var tests = []Tests{
{
name: "debug",
args: base + " -d",
expect: "error: invalid or no user id or no ad link specified",
exitcode: 1,
},
{
name: "debug-check-programinfo",
args: base + " -d",
expect: "pid:",
expect: "program_info",
exitcode: 1,
},
{
@@ -175,7 +169,7 @@ var tests = []Tests{
{
name: "download-single-ad-debug",
args: base + " -o t/out https://www.kleinanzeigen.de/s-anzeige/first-ad/1 -d",
expect: "DEBUG: extracted ad listing",
expect: "extracted ad listing program_info.pid=",
exitcode: 0,
},
{

View File

@@ -18,7 +18,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main
import (
"bytes"
"errors"
"fmt"
"log/slog"
@@ -118,7 +117,7 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
return errors.New("could not extract ad data from page, got empty struct")
}
ad.CalculateExpire()
slog.Debug("extracted ad listing", "ad", ad)
// write listing
addir, err := WriteAd(fetch.Config, ad)
@@ -126,8 +125,6 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
return err
}
slog.Debug("extracted ad listing", "ad", ad)
fetch.Config.IncrAds()
return ScrapeImages(fetch, ad, addir)
@@ -136,52 +133,22 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
adpath := filepath.Join(fetch.Config.Outdir, addir)
// scan existing images, if any
cache, err := ReadImages(adpath, fetch.Config.ForceDownload)
if err != nil {
return err
}
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
buf := new(bytes.Buffer)
_, err = buf.ReadFrom(body)
err = WriteImage(file, body)
if err != nil {
return err
}
buf2 := buf.Bytes() // needed for image writing
image := NewImage(buf, "", imguri)
err = image.CalcHash()
if err != nil {
return err
}
if !fetch.Config.ForceDownload {
if image.SimilarExists(cache) {
slog.Debug("similar image exists, not written", "uri", image.Uri)
return nil
}
}
err = WriteImage(file, buf2)
if err != nil {
return err
}
slog.Debug("wrote image", "image", image, "size", len(buf2))
return nil
})
img++

View File

@@ -19,7 +19,7 @@ package main
import (
"bytes"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
@@ -86,46 +86,17 @@ func WriteAd(c *Config, ad *Ad) (string, error) {
return addir, nil
}
func WriteImage(filename string, buf []byte) error {
func WriteImage(filename string, reader io.ReadCloser) error {
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = file.Write(buf)
_, err = io.Copy(file, reader)
if err != nil {
return err
}
return nil
}
// ReadImage loads the file filename completely into memory and returns
// its content as a bytes.Buffer.
//
// An error is returned if fileExists does not accept filename or if the
// file cannot be read.
func ReadImage(filename string) (*bytes.Buffer, error) {
	if !fileExists(filename) {
		return nil, fmt.Errorf("image %s does not exist", filename)
	}

	content, err := os.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	return bytes.NewBuffer(content), nil
}
// fileExists reports whether filename refers to an existing file system
// entry which is not a directory.
//
// Every error of os.Stat yields false, not only "does not exist": on any
// error (e.g. permission denied, a path component which is not a
// directory, an invalid name) the returned FileInfo is nil and must not
// be used.
func fileExists(filename string) bool {
	info, err := os.Stat(filename)
	if err != nil {
		return false
	}

	return !info.IsDir()
}