mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-18 13:01:06 +01:00
Merge pull request #26 from TLINDEN/test/enhancements
Enhanced error checking, added more failure tests
This commit is contained in:
66
ad.go
Normal file
66
ad.go
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
Copyright © 2023 Thomas von Dein
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
// Index collects the href attribute of every anchor matched by the
// goquery selector in the tag below. Presumably it is filled from an
// ad listing page (confirm against the caller).
type Index struct {
	Links []string `goquery:".text-module-begin a,[href]"`
}
|
||||
|
||||
// Ad is a single advertisement. Fields that carry a goquery tag are
// extracted from the ad page by the selector in the tag; the untagged
// fields have to be assigned by the surrounding code.
type Ad struct {
	// headline of the ad page
	Title string `goquery:"h1"`

	// no goquery tag: not extracted by selector
	Slug      string
	Id        string
	Condition string
	Category  string

	// extracted by selector
	Price   string   `goquery:"h2#viewad-price"`
	Created string   `goquery:"#viewad-extra-info,text"`
	Text    string   `goquery:"p#viewad-description-text,html"`
	Images  []string `goquery:".galleryimage-element img,[src]"`
	Meta    []string `goquery:".addetailslist--detail--value,text"`
}
|
||||
|
||||
// Used by slog to pretty print an ad
|
||||
func (ad *Ad) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.String("title", ad.Title),
|
||||
slog.String("price", ad.Price),
|
||||
slog.String("id", ad.Id),
|
||||
slog.Int("imagecount", len(ad.Images)),
|
||||
slog.Int("bodysize", len(ad.Text)),
|
||||
)
|
||||
}
|
||||
|
||||
// check for completeness. I erected these fields to be mandatory
|
||||
// (though I really don't know if they really are). I consider images
|
||||
// and meta optional. So, if either of the checked fields here is
|
||||
// empty we return an error. All the checked fields are extracted
|
||||
// using goquery. However, I think price is optional since there are
|
||||
// ads for gifts as well.
|
||||
//
|
||||
// Note: we return true for "ad is incomplete" and false for "ad is complete"!
|
||||
func (ad *Ad) Incomplete() bool {
|
||||
if ad.Category == "" || ad.Condition == "" || ad.Created == "" || ad.Text == "" {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
6
go.mod
6
go.mod
@@ -4,6 +4,7 @@ go 1.21
|
||||
|
||||
require (
|
||||
astuart.co/goq v1.0.0
|
||||
github.com/jarcoal/httpmock v1.3.1
|
||||
github.com/knadh/koanf/parsers/toml v0.1.0
|
||||
github.com/knadh/koanf/providers/confmap v0.1.0
|
||||
github.com/knadh/koanf/providers/file v0.1.0
|
||||
@@ -12,20 +13,19 @@ require (
|
||||
github.com/lmittmann/tint v1.0.3
|
||||
github.com/mattn/go-isatty v0.0.20
|
||||
github.com/spf13/pflag v1.0.5
|
||||
golang.org/x/sync v0.5.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.0 // indirect
|
||||
github.com/andybalholm/cascadia v1.0.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/jarcoal/httpmock v1.3.1 // indirect
|
||||
github.com/knadh/koanf/maps v0.1.1 // indirect
|
||||
github.com/mitchellh/copystructure v1.2.0 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/mitchellh/reflectwalk v1.0.2 // indirect
|
||||
github.com/pelletier/go-toml v1.9.5 // indirect
|
||||
golang.org/x/net v0.0.0-20190606173856-1492cefac77f // indirect
|
||||
golang.org/x/sync v0.5.0 // indirect
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
|
||||
golang.org/x/sys v0.6.0 // indirect
|
||||
|
||||
)
|
||||
|
||||
5
go.sum
5
go.sum
@@ -27,6 +27,8 @@ github.com/lmittmann/tint v1.0.3 h1:W5PHeA2D8bBJVvabNfQD/XW9HPLZK1XoPZH0cq8NouQ=
|
||||
github.com/lmittmann/tint v1.0.3/go.mod h1:HIS3gSy7qNwGCj+5oRjAutErFBl4BzdQP6cJZ0NfMwE=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g=
|
||||
github.com/maxatome/go-testdeep v1.12.0/go.mod h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM=
|
||||
github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw=
|
||||
github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s=
|
||||
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
|
||||
@@ -46,8 +48,9 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20190606173856-1492cefac77f h1:IWHgpgFqnL5AhBUBZSgBdjl2vkQUEzcY+JNKWfcgAU0=
|
||||
golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
|
||||
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
|
||||
36
scrape.go
36
scrape.go
@@ -30,33 +30,6 @@ import (
|
||||
"golang.org/x/sync/errgroup"
|
||||
)
|
||||
|
||||
// Index collects the href attribute of every anchor matched by the
// goquery selector in the tag below. Presumably it is filled from an
// ad listing page (confirm against the caller).
type Index struct {
	Links []string `goquery:".text-module-begin a,[href]"`
}
|
||||
|
||||
// Ad is a single advertisement. Fields that carry a goquery tag are
// extracted from the ad page by the selector in the tag; the untagged
// fields have to be assigned by the surrounding code.
type Ad struct {
	// headline of the ad page
	Title string `goquery:"h1"`

	// no goquery tag: not extracted by selector
	Slug      string
	Id        string
	Condition string
	Category  string

	// extracted by selector
	Price   string   `goquery:"h2#viewad-price"`
	Created string   `goquery:"#viewad-extra-info,text"`
	Text    string   `goquery:"p#viewad-description-text,html"`
	Images  []string `goquery:".galleryimage-element img,[src]"`
	Meta    []string `goquery:".addetailslist--detail--value,text"`
}
|
||||
|
||||
func (ad *Ad) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.String("title", ad.Title),
|
||||
slog.String("price", ad.Price),
|
||||
slog.String("id", ad.Id),
|
||||
slog.Int("imagecount", len(ad.Images)),
|
||||
slog.Int("bodysize", len(ad.Text)),
|
||||
)
|
||||
}
|
||||
|
||||
// fetch some web page content
|
||||
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
||||
req, err := http.NewRequest("GET", uri, nil)
|
||||
@@ -74,6 +47,10 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
||||
slog.Debug("response", "code", res.StatusCode, "status",
|
||||
res.Status, "size", res.ContentLength)
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, errors.New("could not get page via HTTP")
|
||||
}
|
||||
|
||||
return res.Body, nil
|
||||
}
|
||||
|
||||
@@ -162,6 +139,11 @@ func Scrape(c *Config, uri string) error {
|
||||
ad.Category = ad.Meta[0]
|
||||
ad.Condition = ad.Meta[1]
|
||||
}
|
||||
|
||||
if ad.Incomplete() {
|
||||
return errors.New("could not extract ad data from page, got empty struct")
|
||||
}
|
||||
|
||||
slog.Debug("extracted ad listing", "ad", ad)
|
||||
|
||||
// write listing
|
||||
|
||||
@@ -111,6 +111,16 @@ const ADTPL string = `DOCTYPE html>
|
||||
</html>
|
||||
`
|
||||
|
||||
// EMPTYPAGE is a well-formed HTML document with an empty head and
// body. It is served for EMPTYURI so that scraping it yields an ad
// without any content. The doctype must be written as "<!DOCTYPE";
// without the "<!" an HTML parser treats the line as body text and the
// page is no longer empty.
const EMPTYPAGE string = `<!DOCTYPE html>
<html lang="de">
<head></head>
<body></body>
</html>
`

// EMPTYURI is an ad URL whose mocked page is EMPTYPAGE.
const EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1`

// INVALIDURI points to a foreign web site that serves no ad at all.
const INVALIDURI string = `https://foo.bar/weird/things`
|
||||
|
||||
// An Adsource is used to construct a httpmock responder for a
|
||||
// particular url. So, the code (scrape.go) scrapes
|
||||
// https://kleinanzeigen.de, but in reality httpmock captures the
|
||||
@@ -118,6 +128,7 @@ const ADTPL string = `DOCTYPE html>
|
||||
type Adsource struct {
	// URL the mock responder gets registered for
	uri string

	// body served for that URL
	content string

	// HTTP status to answer with; SetIntercept replaces 0 by 200
	status int
}
|
||||
|
||||
// Render a HTML template for an adlisting or an ad
|
||||
@@ -207,6 +218,35 @@ func InitValidSources(conf *Config) []Adsource {
|
||||
return ads
|
||||
}
|
||||
|
||||
func InitInvalidSources(conf *Config) []Adsource {
|
||||
empty := AdConfig{}
|
||||
ads := []Adsource{
|
||||
{
|
||||
// valid ad page but without content
|
||||
uri: fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri),
|
||||
content: GetTemplate(nil, empty, EMPTYPAGE),
|
||||
},
|
||||
{
|
||||
// some random foreign webpage
|
||||
uri: INVALIDURI,
|
||||
content: GetTemplate(nil, empty, "<html>foo</html>"),
|
||||
},
|
||||
{
|
||||
// some invalid page path
|
||||
uri: fmt.Sprintf("%s/anzeige/name/1", Baseuri),
|
||||
content: GetTemplate(nil, empty, "<html></html>"),
|
||||
},
|
||||
{
|
||||
// some none-ad page
|
||||
uri: fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri),
|
||||
content: GetTemplate(nil, empty, "<html>HTTP 404: /eine-anzeige/ does not exist!</html>"),
|
||||
status: 404,
|
||||
},
|
||||
}
|
||||
|
||||
return ads
|
||||
}
|
||||
|
||||
// load a test image from disk
|
||||
func GetImage(path string) []byte {
|
||||
dat, err := os.ReadFile(path)
|
||||
@@ -220,10 +260,17 @@ func GetImage(path string) []byte {
|
||||
// setup httpmock
|
||||
func SetIntercept(conf *Config) {
|
||||
ads := InitValidSources(conf)
|
||||
eads := InitInvalidSources(conf)
|
||||
|
||||
ads = append(ads, eads...)
|
||||
|
||||
for _, ad := range ads {
|
||||
if ad.status == 0 {
|
||||
ad.status = 200
|
||||
}
|
||||
|
||||
httpmock.RegisterResponder("GET", ad.uri,
|
||||
httpmock.NewStringResponder(200, ad.content))
|
||||
httpmock.NewStringResponder(ad.status, ad.content))
|
||||
}
|
||||
|
||||
// we just use 2 images, put this here
|
||||
@@ -266,3 +313,33 @@ func TestStart(t *testing.T) {
|
||||
// uncomment to see slogs
|
||||
//t.Errorf("debug")
|
||||
}
|
||||
|
||||
func TestSingleFail(t *testing.T) {
|
||||
httpmock.Activate()
|
||||
defer httpmock.DeactivateAndReset()
|
||||
|
||||
// fake config
|
||||
conf := &Config{Outdir: "t/out", Template: DefaultTemplate, Adlinks: []string{EMPTYURI}}
|
||||
|
||||
SetIntercept(conf)
|
||||
|
||||
// check empty ad
|
||||
if err := Scrape(conf, EMPTYURI); err == nil {
|
||||
t.Errorf("scrape returned empty ad")
|
||||
}
|
||||
|
||||
// wrong uri
|
||||
if err := Scrape(conf, INVALIDURI); err == nil {
|
||||
t.Errorf("scrape returned ad from invalid web site")
|
||||
}
|
||||
|
||||
// wrong path
|
||||
if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1", Baseuri)); err == nil {
|
||||
t.Errorf("scrape returned ad from invalid page")
|
||||
}
|
||||
|
||||
// wrong path
|
||||
if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri)); err == nil {
|
||||
t.Errorf("scrape returned ad from 404 page")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user