diff --git a/go.mod b/go.mod index 003e6eb..55e7757 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( require ( github.com/PuerkitoBio/goquery v1.5.1 // indirect github.com/andybalholm/cascadia v1.1.0 // indirect + github.com/corona10/goimagehash v1.1.0 // indirect github.com/fatih/color v1.16.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect @@ -28,6 +29,7 @@ require ( github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect github.com/pelletier/go-toml v1.9.5 // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/sys v0.14.0 // indirect diff --git a/go.sum b/go.sum index 76737e9..eb536d8 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBK github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI= +github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -44,6 +46,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/reflectwalk v1.0.2 
h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ= +github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/image.go b/image.go new file mode 100644 index 0000000..f162534 --- /dev/null +++ b/image.go @@ -0,0 +1,136 @@ +/* +Copyright © 2023-2024 Thomas von Dein + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/
+
+package main
+
+import (
+	"bytes"
+	"image/jpeg"
+	"log/slog"
+	"os"
+	"path/filepath"
+
+	"github.com/corona10/goimagehash"
+)
+
+// MaxDistance is the hash distance below which two images count as equal.
+const MaxDistance = 3
+
+// Image is a picture of an ad along with its difference hash.
+type Image struct {
+	Filename string
+	Hash     *goimagehash.ImageHash
+	Data     *bytes.Buffer
+	Uri      string
+}
+
+// LogValue implements slog.LogValuer; it logs everything except Data.
+func (img *Image) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.String("filename", img.Filename),
+		slog.String("uri", img.Uri),
+		slog.String("hash", img.Hash.ToString()),
+	)
+}
+
+// Images holds all images of an ad.
+type Images []*Image
+
+// NewImage wraps buf in an Image; Hash stays nil until CalcHash succeeds.
+func NewImage(buf *bytes.Buffer, filename string, uri string) *Image {
+	return &Image{
+		Filename: filename,
+		Uri:      uri,
+		Data:     buf,
+	}
+}
+
+// CalcHash stores the JPEG difference hash of Data in Hash without draining Data.
+func (img *Image) CalcHash() error {
+	decoded, err := jpeg.Decode(bytes.NewReader(img.Data.Bytes()))
+	if err != nil {
+		return err
+	}
+
+	hash, err := goimagehash.DifferenceHash(decoded)
+	if err != nil {
+		return err
+	}
+
+	img.Hash = hash
+	return nil
+}
+
+// Similar reports whether img and otherimg hash closer than MaxDistance.
+func (img *Image) Similar(otherimg *Image) bool {
+	distance, err := img.Hash.Distance(otherimg.Hash)
+	if err != nil {
+		slog.Debug("failed to compute diff hash distance", "error", err)
+		return false
+	}
+
+	if distance < MaxDistance {
+		slog.Debug("distance computation", "image-A", img, "image-B", otherimg, "distance", distance)
+		return true
+	}
+
+	return false
+}
+
+// SimilarExists reports whether img is similar to any image in images.
+func (img *Image) SimilarExists(images Images) bool {
+	for _, known := range images {
+		if img.Similar(known) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// ReadImages loads every .jpg/.jpeg/.JPG/.JPEG file found directly in
+// addir, hashes it and returns the images; the first error aborts.
+func ReadImages(addir string) (Images, error) {
+	entries, err := os.ReadDir(addir)
+	if err != nil {
+		return nil, err
+	}
+
+	imgs := make(Images, 0, len(entries))
+	for _, entry := range entries {
+		ext := filepath.Ext(entry.Name())
+		if entry.IsDir() || (ext != ".jpg" && ext != ".jpeg" && ext != ".JPG" && ext != ".JPEG") {
+			continue
+		}
+
+		filename := filepath.Join(addir, entry.Name())
+		data, err := ReadImage(filename)
+		if err != nil {
+			return nil, err
+		}
+
+		img := NewImage(data, filename, "")
+		if err = img.CalcHash(); err != nil {
+			return nil, err
+		}
+
+		slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString())
+		imgs = append(imgs, img)
+	}
+
+	return imgs, nil
+}
diff --git a/scrape.go b/scrape.go
index 71a6e2e..6b38a13 100644
--- a/scrape.go
+++ b/scrape.go
@@ -18,6 +18,7 @@ along with this program. If not, see .
package main import ( + "bytes" "errors" "fmt" "log/slog" @@ -119,14 +120,14 @@ func ScrapeAd(fetch *Fetcher, uri string) error { ad.CalculateExpire() - slog.Debug("extracted ad listing", "ad", ad) - // write listing addir, err := WriteAd(fetch.Config, ad) if err != nil { return err } + slog.Debug("extracted ad listing", "ad", ad) + fetch.Config.IncrAds() return ScrapeImages(fetch, ad, addir) @@ -135,22 +136,48 @@ func ScrapeAd(fetch *Fetcher, uri string) error { func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error { // fetch images img := 1 + + adpath := filepath.Join(fetch.Config.Outdir, addir) + + // scan existing images, if any + images, err := ReadImages(adpath) + if err != nil { + return err + } + g := new(errgroup.Group) for _, imguri := range ad.Images { imguri := imguri - file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img)) + file := filepath.Join(adpath, fmt.Sprintf("%d.jpg", img)) g.Go(func() error { body, err := fetch.Getimage(imguri) if err != nil { return err } - err = WriteImage(file, body) + buf := new(bytes.Buffer) + buf.ReadFrom(body) + + buf2 := buf.Bytes() // needed for image writing + + image := NewImage(buf, "", imguri) + err = image.CalcHash() if err != nil { return err } + if image.SimilarExists(images) { + slog.Debug("similar image exists, not written", "image", image) + return nil + } + + err = WriteImage(file, buf2) + if err != nil { + return err + } + + slog.Debug("wrote image", "image", image, "size", len(buf2)) return nil }) img++ diff --git a/store.go b/store.go index ad71c3e..90d6249 100644 --- a/store.go +++ b/store.go @@ -19,7 +19,7 @@ package main import ( "bytes" - "io" + "fmt" "log/slog" "os" "path/filepath" @@ -86,17 +86,46 @@ func WriteAd(c *Config, ad *Ad) (string, error) { return addir, nil } -func WriteImage(filename string, reader io.ReadCloser) error { +func WriteImage(filename string, buf []byte) error { file, err := os.Create(filename) if err != nil { return err } defer file.Close() - 
_, err = io.Copy(file, reader)
+	_, err = file.Write(buf)
+
 	if err != nil {
 		return err
 	}
 
 	return nil
 }
+
+// ReadImage returns the content of the file named filename wrapped in a
+// buffer. It fails when filename cannot be stat'ed, is a directory, or
+// cannot be read.
+func ReadImage(filename string) (*bytes.Buffer, error) {
+	if !fileExists(filename) {
+		return nil, fmt.Errorf("image %s does not exist", filename)
+	}
+
+	data, err := os.ReadFile(filename)
+	if err != nil {
+		return nil, err
+	}
+
+	return bytes.NewBuffer(data), nil
+}
+
+// fileExists reports whether filename can be stat'ed and is not a
+// directory. os.Stat returns a nil FileInfo alongside any error, so
+// every error (not only "does not exist") must yield false before info
+// is touched.
+func fileExists(filename string) bool {
+	info, err := os.Stat(filename)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir()
+}