diff --git a/go.mod b/go.mod
index 003e6eb..55e7757 100644
--- a/go.mod
+++ b/go.mod
@@ -21,6 +21,7 @@ require (
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
+ github.com/corona10/goimagehash v1.1.0
github.com/fatih/color v1.16.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
@@ -28,6 +29,7 @@ require (
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
+ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/sys v0.14.0 // indirect
diff --git a/go.sum b/go.sum
index 76737e9..eb536d8 100644
--- a/go.sum
+++ b/go.sum
@@ -6,6 +6,8 @@ github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBK
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
+github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -44,6 +46,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
+github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
+github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/image.go b/image.go
new file mode 100644
index 0000000..f162534
--- /dev/null
+++ b/image.go
@@ -0,0 +1,136 @@
+/*
+Copyright © 2023-2024 Thomas von Dein
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package main
+
+import (
+ "bytes"
+ "image/jpeg"
+ "log/slog"
+ "os"
+ "path/filepath"
+
+ "github.com/corona10/goimagehash"
+)
+
+// MaxDistance is the exclusive upper bound on the difference-hash
+// distance between two images: Similar treats them as the same picture
+// only when their distance is strictly below this value.
+const MaxDistance = 3
+
+// Image is one picture of an ad together with its perceptual hash.
+type Image struct {
+ // Filename is the path the image was read from by ReadImages; it is
+ // empty for images built from a download in ScrapeImages.
+ Filename string
+ // Hash is the difference hash stored by CalcHash; nil until then.
+ Hash *goimagehash.ImageHash
+ // Data holds the encoded image bytes handed to NewImage.
+ Data *bytes.Buffer
+ // Uri is the address a downloaded image was fetched from; it is
+ // empty for images loaded from disk by ReadImages.
+ Uri string
+}
+
+// LogValue implements slog.LogValuer so that logging an *Image prints
+// only its filename, URI and hash instead of dumping the Data buffer.
+//
+// An image whose hash has not been computed yet is logged with an empty
+// "hash" attribute instead of dereferencing the nil Hash.
+func (img *Image) LogValue() slog.Value {
+	hash := ""
+	if img.Hash != nil {
+		hash = img.Hash.ToString()
+	}
+
+	return slog.GroupValue(
+		slog.String("filename", img.Filename),
+		slog.String("uri", img.Uri),
+		slog.String("hash", hash),
+	)
+}
+
+// Images holds all images of an ad.
+type Images []*Image
+
+// NewImage wraps the encoded image held in buf together with the file
+// name and source URI it is known by. The hash is left unset; call
+// CalcHash to compute it.
+func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
+	return &Image{
+		Data:     buf,
+		Filename: filename,
+		Uri:      uri,
+	}
+}
+
+// CalcHash decodes img.Data as JPEG and stores the picture's difference
+// hash in img.Hash.
+//
+// The decoder reads from a separate reader over the buffer's bytes, so
+// img.Data is not consumed and can still be written out or hashed again
+// afterwards. It returns os.ErrInvalid when img.Data is nil, otherwise
+// any error reported by JPEG decoding or hashing. img.Hash is only
+// assigned on success.
+func (img *Image) CalcHash() error {
+	if img.Data == nil {
+		return os.ErrInvalid
+	}
+
+	// Decoding straight from the *bytes.Buffer would drain it and leave
+	// Data empty for every later user.
+	decoded, err := jpeg.Decode(bytes.NewReader(img.Data.Bytes()))
+	if err != nil {
+		return err
+	}
+
+	hash, err := goimagehash.DifferenceHash(decoded)
+	if err != nil {
+		return err
+	}
+
+	img.Hash = hash
+
+	return nil
+}
+
+// Similar reports whether img and otherimg are close enough to be
+// considered the same picture, i.e. whether the distance between their
+// hashes is strictly below MaxDistance.
+//
+// It returns false when either image has no hash yet, when otherimg is
+// nil, or when the distance cannot be computed (that failure is logged
+// at debug level).
+func (img *Image) Similar(otherimg *Image) bool {
+	// Distance dereferences both hashes, so refuse to compare images
+	// that have not been hashed.
+	if img.Hash == nil || otherimg == nil || otherimg.Hash == nil {
+		return false
+	}
+
+	distance, err := img.Hash.Distance(otherimg.Hash)
+	if err != nil {
+		slog.Debug("failed to compute diff hash distance", "error", err)
+		return false
+	}
+
+	if distance >= MaxDistance {
+		return false
+	}
+
+	slog.Debug("distance computation", "image-A", img, "image-B", otherimg, "distance", distance)
+
+	return true
+}
+
+// SimilarExists reports whether at least one entry of images is similar
+// to img according to Similar. The scan stops at the first match.
+func (img *Image) SimilarExists(images Images) bool {
+	for idx := range images {
+		if candidate := images[idx]; img.Similar(candidate) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// ReadImages loads every JPEG file located directly inside the ad
+// directory addir, computes its difference hash and returns the results
+// as Images.
+//
+// Subdirectories are skipped. Files are selected by extension (.jpg or
+// .jpeg, compared case-insensitively). The first file that cannot be
+// read, decoded or hashed aborts the scan and its error is returned.
+func ReadImages(addir string) (Images, error) {
+	entries, err := os.ReadDir(addir)
+	if err != nil {
+		return nil, err
+	}
+
+	imgs := Images{}
+
+	for _, entry := range entries {
+		if entry.IsDir() {
+			continue
+		}
+
+		// Fold the extension to lower case so mixed-case names such as
+		// "1.Jpg" are picked up too. bytes is used for the folding
+		// because it is already imported by this file.
+		switch string(bytes.ToLower([]byte(filepath.Ext(entry.Name())))) {
+		case ".jpg", ".jpeg":
+		default:
+			continue
+		}
+
+		filename := filepath.Join(addir, entry.Name())
+
+		data, err := ReadImage(filename)
+		if err != nil {
+			return nil, err
+		}
+
+		img := NewImage(data, filename, "")
+		if err = img.CalcHash(); err != nil {
+			return nil, err
+		}
+
+		slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
+		imgs = append(imgs, img)
+	}
+
+	return imgs, nil
+}
diff --git a/scrape.go b/scrape.go
index 71a6e2e..6b38a13 100644
--- a/scrape.go
+++ b/scrape.go
@@ -18,6 +18,7 @@ along with this program. If not, see .
package main
import (
+ "bytes"
"errors"
"fmt"
"log/slog"
@@ -119,14 +120,14 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
ad.CalculateExpire()
- slog.Debug("extracted ad listing", "ad", ad)
-
// write listing
addir, err := WriteAd(fetch.Config, ad)
if err != nil {
return err
}
+ slog.Debug("extracted ad listing", "ad", ad)
+
fetch.Config.IncrAds()
return ScrapeImages(fetch, ad, addir)
@@ -135,22 +136,48 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
// fetch images
img := 1
+
+ adpath := filepath.Join(fetch.Config.Outdir, addir)
+
+ // scan existing images, if any
+ images, err := ReadImages(adpath)
+ if err != nil {
+ return err
+ }
+
g := new(errgroup.Group)
for _, imguri := range ad.Images {
imguri := imguri
- file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))
+ file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
body, err := fetch.Getimage(imguri)
if err != nil {
return err
}
- err = WriteImage(file, body)
+ buf := new(bytes.Buffer)
+ buf.ReadFrom(body)
+
+ buf2 := buf.Bytes() // needed for image writing
+
+ image := NewImage(buf, "", imguri)
+ err = image.CalcHash()
if err != nil {
return err
}
+ if image.SimilarExists(images) {
+ slog.Debug("similar image exists, not written", "image", image)
+ return nil
+ }
+
+ err = WriteImage(file, buf2)
+ if err != nil {
+ return err
+ }
+
+ slog.Debug("wrote image", "image", image, "size", len(buf2))
return nil
})
img++
diff --git a/store.go b/store.go
index ad71c3e..90d6249 100644
--- a/store.go
+++ b/store.go
@@ -19,7 +19,7 @@ package main
import (
"bytes"
- "io"
+ "fmt"
"log/slog"
"os"
"path/filepath"
@@ -86,17 +86,46 @@ func WriteAd(c *Config, ad *Ad) (string, error) {
return addir, nil
}
+// WriteImage creates (or truncates) the file filename and writes the
+// complete content of buf to it. It returns the error from creating the
+// file or from the write, if any.
-func WriteImage(filename string, reader io.ReadCloser) error {
+func WriteImage(filename string, buf []byte) error {
file, err := os.Create(filename)
if err != nil {
return err
}
+ // NOTE(review): the error returned by Close is discarded here, so a
+ // failure reported only at close time would go unnoticed.
defer file.Close()
- _, err = io.Copy(file, reader)
+ _, err = file.Write(buf)
+
if err != nil {
return err
}
return nil
}
+
+// ReadImage loads the file filename into memory and returns its content
+// wrapped in a buffer.
+//
+// It returns an error when fileExists rejects filename (missing, or a
+// directory) or when the file cannot be read.
+func ReadImage(filename string) (*bytes.Buffer, error) {
+	if !fileExists(filename) {
+		return nil, fmt.Errorf("image %s does not exist", filename)
+	}
+
+	data, err := os.ReadFile(filename)
+	if err != nil {
+		return nil, err
+	}
+
+	// The slice returned by os.ReadFile is not shared with anyone, so
+	// the buffer can adopt it directly instead of copying it through
+	// Write, whose error result is always nil anyway.
+	return bytes.NewBuffer(data), nil
+}
+
+// fileExists reports whether filename can be stat'ed and is not a
+// directory.
+//
+// Every Stat failure counts as "does not exist", not only a not-exist
+// error: for other failures (a path component that is a regular file,
+// missing permissions, an invalid name) no FileInfo is available to
+// inspect.
+func fileExists(filename string) bool {
+	info, err := os.Stat(filename)
+	if err != nil {
+		return false
+	}
+
+	return !info.IsDir()
+}