mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-17 20:41:01 +01:00
Compare commits
40 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0387d55624 | |||
| a94ef63a90 | |||
| 90f5e86fdb | |||
| 587529e314 | |||
|
|
8771ec1108 | ||
|
|
1896209b96 | ||
|
|
3c93c9fce0 | ||
|
|
42a958fc4c | ||
|
|
5fa46ff106 | ||
|
|
cca3211023 | ||
|
|
dce7604afb | ||
| 0fd9b519d1 | |||
| 6b7f727449 | |||
| 5abbab9527 | |||
|
|
e03c7debb6 | ||
| 1d2483d18f | |||
| b17f4f0f3e | |||
| 4a91167871 | |||
| 0baaf6f38b | |||
| 42182bb6c9 | |||
| 8455c193eb | |||
| d1faa10a52 | |||
| e28137bf9b | |||
| 1ff5c240c8 | |||
|
|
f893f9c3d7 | ||
|
|
c4e88d98f2 | ||
|
|
0cca387982 | ||
|
|
9e619fb3c5 | ||
|
|
0fdfed2929 | ||
|
|
73c09ec38b | ||
|
|
f901af4f0c | ||
|
|
2a8f53ca98 | ||
|
|
4a95cb1f5e | ||
|
|
482612f889 | ||
|
|
b8977df986 | ||
|
|
ae5e3daea3 | ||
|
|
1c6d832b20 | ||
| 52b39d91a3 | |||
| 3748cd35e5 | |||
| 4d4577c9f8 |
@@ -35,7 +35,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
VERSION string = "0.2.0"
|
||||
VERSION string = "0.3.0"
|
||||
Baseuri string = "https://www.kleinanzeigen.de"
|
||||
Listuri string = "/s-bestandsliste.html"
|
||||
Defaultdir string = "."
|
||||
@@ -62,6 +62,7 @@ Options:
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-f --force Download images even if they already exist.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
@@ -82,6 +83,7 @@ type Config struct {
|
||||
Loglevel string `koanf:"loglevel"`
|
||||
Limit int `koanf:"limit"`
|
||||
IgnoreErrors bool `koanf:"ignoreerrors"`
|
||||
ForceDownload bool `koanf:"force"`
|
||||
Adlinks []string
|
||||
StatsCountAds int
|
||||
StatsCountImages int
|
||||
@@ -133,6 +135,7 @@ func InitConfig(w io.Writer) (*Config, error) {
|
||||
f.BoolP("version", "V", false, "show program version")
|
||||
f.BoolP("help", "h", false, "show usage")
|
||||
f.BoolP("manual", "m", false, "show manual")
|
||||
f.BoolP("force", "f", false, "force")
|
||||
|
||||
if err := f.Parse(os.Args[1:]); err != nil {
|
||||
return nil, err
|
||||
|
||||
2
go.mod
2
go.mod
@@ -21,6 +21,7 @@ require (
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.1 // indirect
|
||||
github.com/andybalholm/cascadia v1.1.0 // indirect
|
||||
github.com/corona10/goimagehash v1.1.0 // indirect
|
||||
github.com/fatih/color v1.16.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/knadh/koanf/maps v0.1.1 // indirect
|
||||
@@ -28,6 +29,7 @@ require (
|
||||
github.com/mitchellh/copystructure v1.2.0 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/mitchellh/reflectwalk v1.0.2 // indirect
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
|
||||
github.com/pelletier/go-toml v1.9.5 // indirect
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
|
||||
golang.org/x/sys v0.14.0 // indirect
|
||||
|
||||
4
go.sum
4
go.sum
@@ -6,6 +6,8 @@ github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBK
|
||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
|
||||
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
@@ -44,6 +46,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
|
||||
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
|
||||
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
|
||||
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
||||
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
|
||||
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
|
||||
142
image.go
Normal file
142
image.go
Normal file
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright © 2023-2024 Thomas von Dein
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image/jpeg"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/corona10/goimagehash"
|
||||
)
|
||||
|
||||
// MaxDistance is the exclusive upper bound on the hash distance between two
// images: Similar treats them as the same picture only when the computed
// distance is strictly below this value.
const MaxDistance = 3
|
||||
|
||||
type Image struct {
|
||||
Filename string
|
||||
Hash *goimagehash.ImageHash
|
||||
Data *bytes.Buffer
|
||||
Uri string
|
||||
}
|
||||
|
||||
// used for logging to avoid printing Data
|
||||
func (img *Image) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.String("filename", img.Filename),
|
||||
slog.String("uri", img.Uri),
|
||||
slog.String("hash", img.Hash.ToString()),
|
||||
)
|
||||
}
|
||||
|
||||
// Cache holds the difference hashes of the images already known for an ad
|
||||
type Cache []*goimagehash.ImageHash
|
||||
|
||||
func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
|
||||
img := &Image{
|
||||
Filename: filename,
|
||||
Uri: uri,
|
||||
Data: buf,
|
||||
}
|
||||
|
||||
return img
|
||||
}
|
||||
|
||||
// Calculate diff hash of the image
|
||||
func (img *Image) CalcHash() error {
|
||||
jpgdata, err := jpeg.Decode(img.Data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hash1, err := goimagehash.DifferenceHash(jpgdata)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
img.Hash = hash1
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// checks if 2 images are similar enough to be considered the same
|
||||
func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
|
||||
distance, err := img.Hash.Distance(hash)
|
||||
if err != nil {
|
||||
slog.Debug("failed to compute diff hash distance", "error", err)
|
||||
return false
|
||||
}
|
||||
|
||||
if distance < MaxDistance {
|
||||
slog.Debug("distance computation", "image-A", img.Hash.ToString(),
|
||||
"image-B", hash.ToString(), "distance", distance)
|
||||
return true
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// check current image against all known hashes.
|
||||
func (img *Image) SimilarExists(cache Cache) bool {
|
||||
for _, otherimg := range cache {
|
||||
if img.Similar(otherimg) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// read all JPG images in a ad directory, compute diff hashes and
|
||||
// store the results in the slice Images
|
||||
func ReadImages(addir string, dont bool) (Cache, error) {
|
||||
files, err := os.ReadDir(addir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cache := Cache{}
|
||||
|
||||
if dont {
|
||||
// forced download, -f given
|
||||
return cache, nil
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
ext := filepath.Ext(file.Name())
|
||||
if !file.IsDir() && (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG") {
|
||||
filename := filepath.Join(addir, file.Name())
|
||||
data, err := ReadImage(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
img := NewImage(data, filename, "")
|
||||
if err = img.CalcHash(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
|
||||
cache = append(cache, img.Hash)
|
||||
}
|
||||
}
|
||||
|
||||
//return nil, errors.New("ende")
|
||||
return cache, nil
|
||||
}
|
||||
@@ -133,7 +133,7 @@
|
||||
.\" ========================================================================
|
||||
.\"
|
||||
.IX Title "KLEINGEBAECK 1"
|
||||
.TH KLEINGEBAECK 1 "2024-01-17" "1" "User Commands"
|
||||
.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands"
|
||||
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
||||
.\" way too many mistakes in technical documents.
|
||||
.if n .ad l
|
||||
@@ -142,7 +142,7 @@
|
||||
kleingebaeck \- kleinanzeigen.de backup tool
|
||||
.SH "SYNOPSYS"
|
||||
.IX Header "SYNOPSYS"
|
||||
.Vb 12
|
||||
.Vb 10
|
||||
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
|
||||
\& Options:
|
||||
\& \-u \-\-user <uid> Backup ads from user with uid <uid>.
|
||||
@@ -152,6 +152,7 @@ kleingebaeck \- kleinanzeigen.de backup tool
|
||||
\& \-l \-\-limit <num> Limit the ads to download to <num>, default: load all.
|
||||
\& \-c \-\-config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
\& \-\-ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
\& \-f \-\-force Download images even if they already exist.
|
||||
\& \-m \-\-manual Show manual.
|
||||
\& \-h \-\-help Show usage.
|
||||
\& \-V \-\-version Show program version.
|
||||
|
||||
@@ -14,6 +14,7 @@ SYNOPSYS
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-f --force Download images even if they already exist.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
@@ -13,6 +13,7 @@ kleingebaeck - kleinanzeigen.de backup tool
|
||||
-l --limit <num> Limit the ads to download to <num>, default: load all.
|
||||
-c --config <file> Use config file <file> (default: ~/.kleingebaeck).
|
||||
--ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup.
|
||||
-f --force Download images even if they already exist.
|
||||
-m --manual Show manual.
|
||||
-h --help Show usage.
|
||||
-V --version Show program version.
|
||||
|
||||
39
scrape.go
39
scrape.go
@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
@@ -119,14 +120,14 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
|
||||
|
||||
ad.CalculateExpire()
|
||||
|
||||
slog.Debug("extracted ad listing", "ad", ad)
|
||||
|
||||
// write listing
|
||||
addir, err := WriteAd(fetch.Config, ad)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Debug("extracted ad listing", "ad", ad)
|
||||
|
||||
fetch.Config.IncrAds()
|
||||
|
||||
return ScrapeImages(fetch, ad, addir)
|
||||
@@ -135,22 +136,52 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
|
||||
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
|
||||
// fetch images
|
||||
img := 1
|
||||
adpath := filepath.Join(fetch.Config.Outdir, addir)
|
||||
|
||||
// scan existing images, if any
|
||||
cache, err := ReadImages(adpath, fetch.Config.ForceDownload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
g := new(errgroup.Group)
|
||||
|
||||
for _, imguri := range ad.Images {
|
||||
imguri := imguri
|
||||
file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
|
||||
file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
|
||||
g.Go(func() error {
|
||||
body, err := fetch.Getimage(imguri)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = WriteImage(file, body)
|
||||
buf := new(bytes.Buffer)
|
||||
_, err = buf.ReadFrom(body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
buf2 := buf.Bytes() // needed for image writing
|
||||
|
||||
image := NewImage(buf, "", imguri)
|
||||
err = image.CalcHash()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !fetch.Config.ForceDownload {
|
||||
if image.SimilarExists(cache) {
|
||||
slog.Debug("similar image exists, not written", "uri", image.Uri)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
err = WriteImage(file, buf2)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Debug("wrote image", "image", image, "size", len(buf2))
|
||||
return nil
|
||||
})
|
||||
img++
|
||||
|
||||
35
store.go
35
store.go
@@ -19,7 +19,7 @@ package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -86,17 +86,46 @@ func WriteAd(c *Config, ad *Ad) (string, error) {
|
||||
return addir, nil
|
||||
}
|
||||
|
||||
// WriteImage stores buf in the file filename. The file is created if it does
// not exist and truncated if it does.
//
// os.WriteFile is used instead of a manual Create/Write/deferred Close
// sequence so that an error reported while closing the file is returned to
// the caller instead of being silently dropped.
func WriteImage(filename string, buf []byte) error {
	// 0o666 (before umask) is the same mode os.Create would have used.
	return os.WriteFile(filename, buf, 0o666)
}
|
||||
|
||||
func ReadImage(filename string) (*bytes.Buffer, error) {
|
||||
var buf bytes.Buffer
|
||||
|
||||
if !fileExists(filename) {
|
||||
return nil, fmt.Errorf("image %s does not exist", filename)
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
_, err = buf.Write(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &buf, nil
|
||||
}
|
||||
|
||||
// fileExists reports whether filename refers to an existing file system
// entry that is not a directory.
//
// Every os.Stat failure yields false, not only "does not exist": for other
// errors (for example a path component that is a regular file, an invalid
// name or missing permissions) os.Stat returns a nil FileInfo, which must not
// be dereferenced.
func fileExists(filename string) bool {
	info, err := os.Stat(filename)
	if err != nil {
		return false
	}

	return !info.IsDir()
}
|
||||
|
||||
Reference in New Issue
Block a user