From 2239a83f760bfa2810d990705fbcb24a71fc484c Mon Sep 17 00:00:00 2001
From: Thomas von Dein
Date: Mon, 6 Jan 2025 10:19:34 +0100
Subject: [PATCH] properly check image format for storing and distance hashing

---
Review note (not part of the commit message; ignored by git am):

* Similar() now compares img.Mime against "jpg" instead of "jpeg".
  NewImage() rewrites the decoded format name "jpeg" to "jpg" before
  storing it in Mime, so for every Image built by NewImage() the former
  guard was always true and Similar() always returned false.
* Because of that one-token change the post-image blob id on the
  image.go "index" line no longer describes the resulting file.
* Line breaks and whitespace inside the hunks were reconstructed
  gofmt-style from a whitespace-collapsed copy; verify them against the
  tree (or apply with --ignore-whitespace).

 config.go |  2 +-
 go.mod    |  1 +
 go.sum    |  2 ++
 image.go  | 59 ++++++++++++++++++++++++++++++++++++++++++++++++------
 scrape.go | 12 ++++++++---
 5 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/config.go b/config.go
index 81e5034..c0d16e9 100644
--- a/config.go
+++ b/config.go
@@ -34,7 +34,7 @@ import (
 )
 
 const (
-	VERSION    string = "0.3.10"
+	VERSION    string = "0.3.11"
 	Baseuri    string = "https://www.kleinanzeigen.de"
 	Listuri    string = "/s-bestandsliste.html"
 	Defaultdir string = "."
diff --git a/go.mod b/go.mod
index 30e9346..a088e33 100644
--- a/go.mod
+++ b/go.mod
@@ -32,6 +32,7 @@ require (
 	github.com/mitchellh/reflectwalk v1.0.2 // indirect
 	github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
 	github.com/pelletier/go-toml v1.9.5 // indirect
+	golang.org/x/image v0.23.0 // indirect
 	golang.org/x/net v0.23.0 // indirect
 	golang.org/x/sys v0.21.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
diff --git a/go.sum b/go.sum
index 2941152..d8bf1aa 100644
--- a/go.sum
+++ b/go.sum
@@ -67,6 +67,8 @@ github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXl
 github.com/tlinden/yadu v0.1.3 h1:5cRCUmj+l5yvlM2irtpFBIJwVV2DPEgYSaWvF19FtcY=
 github.com/tlinden/yadu v0.1.3/go.mod h1:l3bRmHKL9zGAR6pnBHY2HRPxBecf7L74BoBgOOpTcUA=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/image v0.23.0 h1:HseQ7c2OpPKTPVzNjG5fwJsOTCiiwS4QdsYi5XU6H68=
+golang.org/x/image v0.23.0/go.mod h1:wJJBTdLfCCf3tiHa1fNxpZmUI4mmoZvwMCPP0ddoNKY=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
diff --git a/image.go b/image.go
index 7a19847..d60065b 100644
--- a/image.go
+++ b/image.go
@@ -20,11 +20,16 @@ package main
 import (
 	"bytes"
 	"fmt"
-	"image/jpeg"
+	"image"
+	_ "image/gif"
+	_ "image/jpeg"
+	_ "image/png"
 	"log/slog"
 	"os"
 	"path/filepath"
 
+	_ "golang.org/x/image/webp"
+
 	"github.com/corona10/goimagehash"
 )
 
@@ -35,6 +40,7 @@ type Image struct {
 	Hash     *goimagehash.ImageHash
 	Data     *bytes.Reader
 	URI      string
+	Mime     string
 }
 
 // used for logging to avoid printing Data
@@ -49,21 +55,49 @@ func (img *Image) LogValue() slog.Value {
 // holds all images of an ad
 type Cache []*goimagehash.ImageHash
 
-func NewImage(buf *bytes.Reader, filename, uri string) *Image {
+// filename comes from the scraper, it contains directory/base w/o suffix
+func NewImage(buf *bytes.Reader, filename, uri string) (*Image, error) {
+	_, imgconfig, err := image.DecodeConfig(buf)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	buf.Seek(0, 0)
+
+	if imgconfig == "jpeg" {
+		// we're using the format as file extension, but have used
+		// "jpg" in the past, so to be backwards compatible, stay with
+		// it.
+		imgconfig = "jpg"
+	}
+
+	if imgconfig == "" {
+		return nil, fmt.Errorf("failed to process image: unknown or unsupported image format (supported: jpg,png,gif,webp)")
+	}
+
+	filename += "." + imgconfig
+
 	img := &Image{
 		Filename: filename,
 		URI:      uri,
 		Data:     buf,
+		Mime:     imgconfig,
 	}
 
-	return img
+	slog.Debug("image MIME", "mime", img.Mime)
+
+	return img, nil
 }
 
 // Calculate diff hash of the image
 func (img *Image) CalcHash() error {
-	jpgdata, err := jpeg.Decode(img.Data)
+	jpgdata, format, err := image.Decode(img.Data)
 	if err != nil {
-		return fmt.Errorf("failed to decode JPEG image: %w", err)
+		return fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	if format == "" {
+		return fmt.Errorf("failed to decode image: unknown or unsupported image format (supported: jpg,png,gif,webp)")
 	}
 
 	hash1, err := goimagehash.DifferenceHash(jpgdata)
@@ -78,6 +112,10 @@ func (img *Image) CalcHash() error {
 
 // checks if 2 images are similar enough to be considered the same
 func (img *Image) Similar(hash *goimagehash.ImageHash) bool {
+	if img.Mime != "jpg" {
+		return false
+	}
+
 	distance, err := img.Hash.Distance(hash)
 	if err != nil {
 		slog.Debug("failed to compute diff hash distance", "error", err)
@@ -133,12 +171,19 @@ func ReadImages(addir string, dont bool) (Cache, error) {
 
 			reader := bytes.NewReader(data.Bytes())
 
-			img := NewImage(reader, filename, "")
+			img, err := NewImage(reader, filename, "")
+			if err != nil {
+				return nil, err
+			}
+
 			if err := img.CalcHash(); err != nil {
 				return nil, err
 			}
 
-			slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
+			if img.Hash != nil {
+				slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
+			}
+
 			cache = append(cache, img.Hash)
 		}
 	}
diff --git a/scrape.go b/scrape.go
index 985176a..9d81b44 100644
--- a/scrape.go
+++ b/scrape.go
@@ -170,7 +170,9 @@ func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error {
 
 	for _, imguri := range advertisement.Images {
 		imguri := imguri
-		file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img))
+
+		// we append the suffix later in NewImage() based on image format
+		file := filepath.Join(adpath, fmt.Sprintf("%d", img))
 
 		egroup.Go(func() error {
 			// wait a little
@@ -192,7 +194,11 @@ func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error {
 
 			reader := bytes.NewReader(buf.Bytes())
 
-			image := NewImage(reader, file, imguri)
+			image, err := NewImage(reader, file, imguri)
+			if err != nil {
+				return err
+			}
+
 			err = image.CalcHash()
 			if err != nil {
 				return err
@@ -211,7 +217,7 @@ func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error {
 				return fmt.Errorf("failed to seek(0) on image reader: %w", err)
 			}
 
-			err = WriteImage(file, reader)
+			err = WriteImage(image.Filename, reader)
 			if err != nil {
 				return err
 			}