Compare commits

..

39 Commits

Author SHA1 Message Date
04aa2d07a3 fix linter error 2024-01-22 14:51:26 +01:00
f41b89f6fe added -f to override d-hash, better debug and error handling 2024-01-22 14:30:54 +01:00
4b0066d6ce added image diff hash distance caching to not overwrite similar images 2024-01-21 12:38:14 +01:00
T.v.Dein
8771ec1108 added support to calculate and store the ad expire date (#43) 2024-01-19 18:38:35 +01:00
T.v.Dein
1896209b96 use new yadu log handler, +tests, +upd modules, +version 2024-01-19 18:38:35 +01:00
T.v.Dein
3c93c9fce0 added docker image support
* added environment variable support
* added docker instructions
* added .env hint
2024-01-19 18:38:35 +01:00
T.v.Dein
42a958fc4c refactored out http fetching code into Fetcher{}/fetch.go 2024-01-19 18:38:35 +01:00
T.v.Dein
5fa46ff106 Add HTTP retries and the possibility to ignore image download errors (#33)
added HTTP retry and --ignoreerrors which ignores image download errors, fix #30
2024-01-19 18:38:35 +01:00
T.v.Dein
cca3211023 Enhancement/http (#32)
* added HTTP debug logging using `-d` or `DEBUGHTTP=1` (headers only)
2024-01-19 18:38:35 +01:00
T.v.Dein
dce7604afb fix #30: revert default adnamedir to just use the slug as before (#31) 2024-01-19 18:38:35 +01:00
0fd9b519d1 fixed changes on kleinanzeigen.de:
- Meta did not contain condition and category together anymore, they
removed  the category. Therefore fetching (that is, validation)
failed.
- Now we extract the condition and category directly.
- On top, category now includes the whole category tree.
- unit tests had to be tweaked for this measure.
2024-01-19 18:38:35 +01:00
6b7f727449 fixed utf8 2024-01-19 18:38:35 +01:00
5abbab9527 added template for ad directory, by default include id now 2024-01-19 18:38:35 +01:00
T.v.Dein
e03c7debb6 remove duplicate license badge (#28)
* remove duplicate license badge

* fix badges
2024-01-19 18:38:35 +01:00
1d2483d18f portable error check 2024-01-19 18:38:35 +01:00
b17f4f0f3e also added coverage report+badge 2024-01-19 18:38:35 +01:00
4a91167871 put all tests into main_test.go, more failure mode tests and verify 2024-01-19 18:38:35 +01:00
0baaf6f38b better error message on 404 2024-01-19 18:38:35 +01:00
42182bb6c9 add commandline main() test units 2024-01-19 18:38:35 +01:00
8455c193eb pass a io.Writer to loggers and outputs so we can test the cmdline 2024-01-19 18:38:35 +01:00
d1faa10a52 added more invalid tests 2024-01-19 18:38:35 +01:00
e28137bf9b upd httpmock+deps 2024-01-19 18:38:35 +01:00
1ff5c240c8 put ad code into separate file, enhance error checking 2024-01-19 18:38:35 +01:00
T.v.Dein
f893f9c3d7 Test/add mock tests (#24)
* add scrape unit test using httpmock lib
2024-01-19 18:38:35 +01:00
T.v.Dein
c4e88d98f2 fix linter errors (#23) 2024-01-19 18:38:35 +01:00
T.v.Dein
0cca387982 add ci pipeline (#22)
Co-authored-by: Thomas von Dein <tom@izb.net>
2024-01-19 18:38:35 +01:00
T.v.Dein
9e619fb3c5 Doc/add prior art (#21)
* add more prior art
2024-01-19 18:38:35 +01:00
T.v.Dein
0fdfed2929 added windows screenshots (#20)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
T.v.Dein
73c09ec38b Revert "Fix/newline windows (#18)" (#19)
This reverts commit eaf4db6cef.
2024-01-19 18:38:35 +01:00
T.v.Dein
f901af4f0c Fix/newline windows (#18)
* fix #17: use fmt.Println() after stats
* bump version
2024-01-19 18:38:35 +01:00
T.v.Dein
2a8f53ca98 added screenshots and a section about prior work (#16) 2024-01-19 18:38:35 +01:00
T.v.Dein
4a95cb1f5e add doc link (#15)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
T.v.Dein
482612f889 fix invalid mod load (#14)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
T.v.Dein
b8977df986 Bugfixes (#13)
* several fixes:

- fix #9 + #10: switched to koanf module and dropped support for HCL
- fix #11: disabling colors on windows
- fix #12: fixed race condition in go routine call inside for loop,
  images had been downloaded multiple times
- remove hcl support and use toml format (same thing, better parser)
- update documentation and example config on TOML format of config file
- use Config as arg instead of singular args
- use x/errgroup instead of sync.Waitgroup inside image download loop

---------

Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
T.v.Dein
ae5e3daea3 Dev (#8)
* fixed conf parsing: variables can now be omitted from the config
* fix newlines: use CRLF on windows
* bump version

---------

Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
T.v.Dein
1c6d832b20 added proper install instructions (#7)
Co-authored-by: Thomas von Dein <tom@vondein.org>
2024-01-19 18:38:35 +01:00
52b39d91a3 fix version finding + bump version 2024-01-19 18:38:27 +01:00
3748cd35e5 fix #5: add exe extension to built windows binaries 2024-01-19 18:38:27 +01:00
4d4577c9f8 fix #4, use filepath.Join to create portable paths 2024-01-19 18:38:27 +01:00
9 changed files with 223 additions and 9 deletions

View File

@@ -62,6 +62,7 @@ Options:
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.
@@ -82,6 +83,7 @@ type Config struct {
Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"`
IgnoreErrors bool `koanf:"ignoreerrors"`
ForceDownload bool `koanf:"force"`
Adlinks []string
StatsCountAds int
StatsCountImages int
@@ -133,6 +135,7 @@ func InitConfig(w io.Writer) (*Config, error) {
f.BoolP("version", "V", false, "show program version")
f.BoolP("help", "h", false, "show usage")
f.BoolP("manual", "m", false, "show manual")
f.BoolP("force", "f", false, "force")
if err := f.Parse(os.Args[1:]); err != nil {
return nil, err

2
go.mod
View File

@@ -21,6 +21,7 @@ require (
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/corona10/goimagehash v1.1.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
@@ -28,6 +29,7 @@ require (
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.14.0 // indirect

4
go.sum
View File

@@ -6,6 +6,8 @@ github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBK
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -44,6 +46,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

142
image.go Normal file
View File

@@ -0,0 +1,142 @@
/*
Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"bytes"
"image/jpeg"
"log/slog"
"os"
"path/filepath"
"github.com/corona10/goimagehash"
)
// MaxDistance is the exclusive upper bound for the difference hash
// distance: Similar() treats two images as the same picture only if
// their distance is strictly below this value.
const MaxDistance = 3
// Image bundles the raw data of one ad image with its difference hash
// and the names it is known by.
type Image struct {
// Filename is the file name handed to NewImage().
Filename string
// Hash is the difference hash set by CalcHash(); NewImage() leaves it nil.
Hash *goimagehash.ImageHash
// Data is the buffer handed to NewImage(); CalcHash() decodes it as JPEG.
Data *bytes.Buffer
// Uri is the URI handed to NewImage().
Uri string
}
// LogValue implements slog.LogValuer. It renders the image as a group
// made of filename, uri and hash, so that logging an *Image never dumps
// the raw bytes held in Data.
//
// The hash attribute is the empty string as long as no hash has been
// computed (Hash is nil); that way logging an image before CalcHash()
// has run does not dereference a nil pointer.
func (img *Image) LogValue() slog.Value {
	hash := ""
	if img.Hash != nil {
		hash = img.Hash.ToString()
	}

	return slog.GroupValue(
		slog.String("filename", img.Filename),
		slog.String("uri", img.Uri),
		slog.String("hash", hash),
	)
}
// Cache holds the difference hashes of the images of one ad. It stores
// only the hashes, not the images themselves.
type Cache []*goimagehash.ImageHash
// NewImage wraps buf into an Image labelled with filename and uri. No
// hash is computed here: Hash stays nil until CalcHash() is called.
func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
	return &Image{
		Data:     buf,
		Filename: filename,
		Uri:      uri,
	}
}
// CalcHash decodes Data as JPEG, computes the difference hash of the
// decoded picture and stores it in Hash.
//
// The decoder works on a separate reader over the buffer's bytes, so
// Data is not drained and remains usable after the call. A nil Data is
// handled like empty input: it yields a decode error instead of a
// panic. On any error Hash is left untouched and the error is returned.
func (img *Image) CalcHash() error {
	var raw []byte
	if img.Data != nil {
		raw = img.Data.Bytes()
	}

	decoded, err := jpeg.Decode(bytes.NewReader(raw))
	if err != nil {
		return err
	}

	hash, err := goimagehash.DifferenceHash(decoded)
	if err != nil {
		return err
	}

	img.Hash = hash

	return nil
}
// Similar reports whether hash is close enough to the image's own hash
// for both to be considered the same picture, that is whether their
// distance is below MaxDistance.
//
// It returns false if either hash is missing or if the distance cannot
// be computed; the latter is logged at debug level.
func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
	if img.Hash == nil || hash == nil {
		// nothing to compare, e.g. CalcHash() has not been run yet
		return false
	}

	distance, err := img.Hash.Distance(hash)
	if err != nil {
		slog.Debug("failed to compute diff hash distance", "error", err)
		return false
	}

	if distance >= MaxDistance {
		return false
	}

	slog.Debug("distance computation", "image-A", img.Hash.ToString(),
		"image-B", hash.ToString(), "distance", distance)

	return true
}
// SimilarExists reports whether at least one hash in cache is similar
// to the image as judged by Similar(). An empty cache yields false.
func (img *Image) SimilarExists(cache Cache) bool {
	for idx := 0; idx < len(cache); idx++ {
		if !img.Similar(cache[idx]) {
			continue
		}

		return true
	}

	return false
}
// ReadImages builds the hash cache for the ad directory addir. Every
// non-directory entry with the extension .jpg, .jpeg, .JPG or .JPEG is
// read, hashed with CalcHash() and its hash appended to the returned
// Cache.
//
// If dont is true (forced download, -f given) the directory is not
// touched at all and an empty cache is returned, so no downloaded image
// can match an existing one.
//
// A failure to list the directory, to read a file or to hash it aborts
// the scan and is returned as is.
func ReadImages(addir string, dont bool) (Cache, error) {
	cache := Cache{}

	if dont {
		// forced download: existing images are irrelevant, skip the scan
		return cache, nil
	}

	files, err := os.ReadDir(addir)
	if err != nil {
		return nil, err
	}

	for _, file := range files {
		if file.IsDir() {
			continue
		}

		switch filepath.Ext(file.Name()) {
		case ".jpg", ".jpeg", ".JPG", ".JPEG":
		default:
			continue
		}

		filename := filepath.Join(addir, file.Name())

		data, err := ReadImage(filename)
		if err != nil {
			return nil, err
		}

		img := NewImage(data, filename, "")
		if err := img.CalcHash(); err != nil {
			return nil, err
		}

		slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())

		cache = append(cache, img.Hash)
	}

	return cache, nil
}

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -142,7 +142,7 @@
kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS"
.IX Header "SYNOPSYS"
.Vb 12
.Vb 10
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options:
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
@@ -152,6 +152,7 @@ kleingebaeck \- kleinanzeigen.de backup tool
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
\& \-f \-\-force Download images even if they already exist.
\& \-m \-\-manual Show manual.
\& \-h \-\-help Show usage.
\& \-V \-\-version Show program version.

View File

@@ -14,6 +14,7 @@ SYNOPSYS
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.

View File

@@ -13,6 +13,7 @@ kleingebaeck - kleinanzeigen.de backup tool
-l --limit <num> Limit the ads to download to <num>, default: load all.
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
-f --force Download images even if they already exist.
-m --manual Show manual.
-h --help Show usage.
-V --version Show program version.

View File

@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main
import (
"bytes"
"errors"
"fmt"
"log/slog"
@@ -119,14 +120,14 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
ad.CalculateExpire()
slog.Debug("extracted ad listing", "ad", ad)
// write listing
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
slog.Debug("extracted ad listing", "ad", ad)
fetch.Config.IncrAds()
return ScrapeImages(fetch, ad, addir)
@@ -135,22 +136,52 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
adpath := filepath.Join(fetch.Config.Outdir, addir)
// scan existing images, if any
cache, err := ReadImages(adpath, fetch.Config.ForceDownload)
if err != nil {
return err
}
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
err = WriteImage(file, body)
buf := new(bytes.Buffer)
_, err = buf.ReadFrom(body)
if err != nil {
return err
}
buf2 := buf.Bytes() // needed for image writing
image := NewImage(buf, "", imguri)
err = image.CalcHash()
if err != nil {
return err
}
if !fetch.Config.ForceDownload {
if image.SimilarExists(cache) {
slog.Debug("similar image exists, not written", "uri", image.Uri)
return nil
}
}
err = WriteImage(file, buf2)
if err != nil {
return err
}
slog.Debug("wrote image", "image", image, "size", len(buf2))
return nil
})
img++

View File

@@ -19,7 +19,7 @@ package main
import (
"bytes"
"io"
"fmt"
"log/slog"
"os"
"path/filepath"
@@ -86,17 +86,46 @@ func WriteAd(c *Config, ad *Ad) (string, error) {
return addir, nil
}
func WriteImage(filename string, reader io.ReadCloser) error {
func WriteImage(filename string, buf []byte) error {
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, reader)
_, err = file.Write(buf)
if err != nil {
return err
}
return nil
}
// ReadImage loads the file filename completely into memory and returns
// its content in a fresh buffer. It fails with an "image ... does not
// exist" error if fileExists() rejects the name, and passes on any
// error from reading the file.
func ReadImage(filename string) (*bytes.Buffer, error) {
	if exists := fileExists(filename); !exists {
		return nil, fmt.Errorf("image %s does not exist", filename)
	}

	content, err := os.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	result := new(bytes.Buffer)
	if _, err := result.Write(content); err != nil {
		return nil, err
	}

	return result, nil
}
// fileExists reports whether filename can be stat'ed and is not a
// directory.
//
// Every stat failure counts as "does not exist", not only a missing
// file. os.Stat returns no usable FileInfo along with an error, so
// failures such as an invalid name or a permission problem must not
// fall through to info.IsDir().
func fileExists(filename string) bool {
	info, err := os.Stat(filename)
	if err != nil {
		return false
	}

	return !info.IsDir()
}