diff --git a/ad.go b/ad.go new file mode 100644 index 0000000..a9de619 --- /dev/null +++ b/ad.go @@ -0,0 +1,66 @@ +/* +Copyright © 2023 Thomas von Dein + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package main + +import ( + "log/slog" +) + +type Index struct { + Links []string `goquery:".text-module-begin a,[href]"` +} + +type Ad struct { + Title string `goquery:"h1"` + Slug string + Id string + Condition string + Category string + Price string `goquery:"h2#viewad-price"` + Created string `goquery:"#viewad-extra-info,text"` + Text string `goquery:"p#viewad-description-text,html"` + Images []string `goquery:".galleryimage-element img,[src]"` + Meta []string `goquery:".addetailslist--detail--value,text"` +} + +// Used by slog to pretty print an ad +func (ad *Ad) LogValue() slog.Value { + return slog.GroupValue( + slog.String("title", ad.Title), + slog.String("price", ad.Price), + slog.String("id", ad.Id), + slog.Int("imagecount", len(ad.Images)), + slog.Int("bodysize", len(ad.Text)), + ) +} + +// check for completeness. I declared these fields to be mandatory +// (though I really don't know if they really are). I consider images +// and meta optional. So, if any of the checked fields here is +// empty the ad counts as incomplete. All the checked fields are extracted +// using goquery. However, I think price is optional since there are +// ads for gifts as well. 
+// +// Note: we return true for "ad is incomplete" and false for "ad is complete"! +func (ad *Ad) Incomplete() bool { + if ad.Category == "" || ad.Condition == "" || ad.Created == "" || ad.Text == "" { + return true + } + + return false +} diff --git a/go.mod b/go.mod index 5a4e11a..9560640 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.21 require ( astuart.co/goq v1.0.0 + github.com/jarcoal/httpmock v1.3.1 github.com/knadh/koanf/parsers/toml v0.1.0 github.com/knadh/koanf/providers/confmap v0.1.0 github.com/knadh/koanf/providers/file v0.1.0 @@ -12,20 +13,19 @@ require ( github.com/lmittmann/tint v1.0.3 github.com/mattn/go-isatty v0.0.20 github.com/spf13/pflag v1.0.5 + golang.org/x/sync v0.5.0 ) require ( github.com/PuerkitoBio/goquery v1.5.0 // indirect github.com/andybalholm/cascadia v1.0.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/jarcoal/httpmock v1.3.1 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/pelletier/go-toml v1.9.5 // indirect - golang.org/x/net v0.0.0-20190606173856-1492cefac77f // indirect - golang.org/x/sync v0.5.0 // indirect + golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/sys v0.6.0 // indirect ) diff --git a/go.sum b/go.sum index fa9fff7..5d13c4b 100644 --- a/go.sum +++ b/go.sum @@ -27,6 +27,8 @@ github.com/lmittmann/tint v1.0.3 h1:W5PHeA2D8bBJVvabNfQD/XW9HPLZK1XoPZH0cq8NouQ= github.com/lmittmann/tint v1.0.3/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g= +github.com/maxatome/go-testdeep v1.12.0/go.mod 
h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -46,8 +48,9 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190606173856-1492cefac77f h1:IWHgpgFqnL5AhBUBZSgBdjl2vkQUEzcY+JNKWfcgAU0= golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/scrape.go b/scrape.go index fc3c3c1..7ab0bfb 100644 --- a/scrape.go +++ b/scrape.go @@ -30,33 +30,6 @@ import ( "golang.org/x/sync/errgroup" ) -type Index struct { - Links []string `goquery:".text-module-begin a,[href]"` -} - -type Ad struct { - Title string `goquery:"h1"` - Slug string - Id string - Condition string - Category string - Price string `goquery:"h2#viewad-price"` - Created string `goquery:"#viewad-extra-info,text"` - Text string `goquery:"p#viewad-description-text,html"` - Images []string `goquery:".galleryimage-element img,[src]"` - Meta 
[]string `goquery:".addetailslist--detail--value,text"` -} - -func (ad *Ad) LogValue() slog.Value { - return slog.GroupValue( - slog.String("title", ad.Title), - slog.String("price", ad.Price), - slog.String("id", ad.Id), - slog.Int("imagecount", len(ad.Images)), - slog.Int("bodysize", len(ad.Text)), - ) -} - // fetch some web page content func Get(uri string, client *http.Client) (io.ReadCloser, error) { req, err := http.NewRequest("GET", uri, nil) @@ -74,6 +47,10 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) { slog.Debug("response", "code", res.StatusCode, "status", res.Status, "size", res.ContentLength) + if res.StatusCode != 200 { + return nil, errors.New("could not get page via HTTP") + } + return res.Body, nil } @@ -162,6 +139,11 @@ func Scrape(c *Config, uri string) error { ad.Category = ad.Meta[0] ad.Condition = ad.Meta[1] } + + if ad.Incomplete() { + return errors.New("could not extract ad data from page, got empty struct") + } + slog.Debug("extracted ad listing", "ad", ad) // write listing diff --git a/scrape_test.go b/scrape_test.go index f96506a..fd3c20f 100644 --- a/scrape_test.go +++ b/scrape_test.go @@ -111,6 +111,16 @@ const ADTPL string = `DOCTYPE html> ` +const EMPTYPAGE string = `DOCTYPE html> + + + + +` + +const EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` +const INVALIDURI string = `https://foo.bar/weird/things` + // An Adsource is used to construct a httpmock responder for a // particular url. 
So, the code (scrape.go) scrapes // https://kleinanzeigen.de, but in reality httpmock captures the @@ -118,6 +128,7 @@ const ADTPL string = `DOCTYPE html> type Adsource struct { uri string content string + status int } // Render a HTML template for an adlisting or an ad @@ -207,6 +218,35 @@ func InitValidSources(conf *Config) []Adsource { return ads } +func InitInvalidSources(conf *Config) []Adsource { + empty := AdConfig{} + ads := []Adsource{ + { + // valid ad page but without content + uri: fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri), + content: GetTemplate(nil, empty, EMPTYPAGE), + }, + { + // some random foreign webpage + uri: INVALIDURI, + content: GetTemplate(nil, empty, "foo"), + }, + { + // some invalid page path + uri: fmt.Sprintf("%s/anzeige/name/1", Baseuri), + content: GetTemplate(nil, empty, ""), + }, + { + // some non-ad page + uri: fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri), + content: GetTemplate(nil, empty, "HTTP 404: /eine-anzeige/ does not exist!"), + status: 404, + }, + } + + return ads +} + // load a test image from disk func GetImage(path string) []byte { dat, err := os.ReadFile(path) @@ -220,10 +260,17 @@ func GetImage(path string) []byte { // setup httpmock func SetIntercept(conf *Config) { ads := InitValidSources(conf) + eads := InitInvalidSources(conf) + + ads = append(ads, eads...) 
for _, ad := range ads { + if ad.status == 0 { + ad.status = 200 + } + httpmock.RegisterResponder("GET", ad.uri, - httpmock.NewStringResponder(200, ad.content)) + httpmock.NewStringResponder(ad.status, ad.content)) } // we just use 2 images, put this here @@ -266,3 +313,33 @@ func TestStart(t *testing.T) { // uncomment to see slogs //t.Errorf("debug") } + +func TestSingleFail(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + + // fake config + conf := &Config{Outdir: "t/out", Template: DefaultTemplate, Adlinks: []string{EMPTYURI}} + + SetIntercept(conf) + + // check empty ad + if err := Scrape(conf, EMPTYURI); err == nil { + t.Errorf("scrape returned empty ad") + } + + // wrong uri + if err := Scrape(conf, INVALIDURI); err == nil { + t.Errorf("scrape returned ad from invalid web site") + } + + // wrong path + if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1", Baseuri)); err == nil { + t.Errorf("scrape returned ad from invalid page") + } + + // page answering with HTTP 404 + if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri)); err == nil { + t.Errorf("scrape returned ad from 404 page") + } +}