From 0baaf6f38b3847b51b78dc3e716a255e7648be2c Mon Sep 17 00:00:00 2001 From: Thomas von Dein Date: Tue, 2 Jan 2024 12:22:26 +0100 Subject: [PATCH] better error message on 404 --- scrape.go | 2 +- scrape_test.go | 345 ---------------------------------------------- t/fullconfig.conf | 6 + t/invalid.conf | 1 + 4 files changed, 8 insertions(+), 346 deletions(-) delete mode 100644 scrape_test.go create mode 100644 t/fullconfig.conf create mode 100644 t/invalid.conf diff --git a/scrape.go b/scrape.go index 7ab0bfb..25e7d73 100644 --- a/scrape.go +++ b/scrape.go @@ -196,7 +196,7 @@ func Getimage(uri, fileName string) error { defer response.Body.Close() if response.StatusCode != 200 { - return errors.New("received non 200 response code") + return errors.New("could not get image via HTTP") } err = WriteImage(fileName, response.Body) diff --git a/scrape_test.go b/scrape_test.go deleted file mode 100644 index fd3c20f..0000000 --- a/scrape_test.go +++ /dev/null @@ -1,345 +0,0 @@ -/* -Copyright © 2023 Thomas von Dein - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -package main - -import ( - "bytes" - "fmt" - "os" - "strings" - "testing" - tpl "text/template" - - "github.com/jarcoal/httpmock" -) - -// used to fill an ad template and the ad listing page template -type AdConfig struct { - Title string - Slug string - Id string - Price string - Category string - Condition string - Created string - Text string - Images []string // files in ./t/ -} - -// the ad list, aka: -// https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX -// Note, that this HTML code is reduced to the max, so that it only -// contains the stuff required to satisfy goquery -const LISTTPL string = ` - - - Ads - - -{{ range . }} -

- {{ .Title }} -

-{{ end }} - - -` - -// an actual ad listing, aka: -// https://www.kleinanzeigen.de/s-anzeige/ad-text-slug/1010101010 -// Note, that this HTML code is reduced to the max, so that it only -// contains the stuff required to satisfy goquery -const ADTPL string = `DOCTYPE html> - - - Ad Listing - - - - {{ range $image := .Images }} -
- -
- {{ end }} - -

- {{ .Title }}

-
-

- {{ .Price }}

-
- -
-
{{ .Created }}
-
- -
-
    -
  • - Art - {{ .Category }} -
  • -
  • - Zustand - {{ .Condition }} -
  • -
-
- -
-

- {{ .Text }} -

-
- - -` - -const EMPTYPAGE string = `DOCTYPE html> - - - - -` - -const EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` -const INVALIDURI string = `https://foo.bar/weird/things` - -// An Adsource is used to construct a httpmock responder for a -// particular url. So, the code (scrape.go) scrapes -// https://kleinanzeigen.de, but in reality httpmock captures the -// request and responds with our mock data -type Adsource struct { - uri string - content string - status int -} - -// Render a HTML template for an adlisting or an ad -func GetTemplate(l []AdConfig, a AdConfig, htmltemplate string) string { - tmpl, err := tpl.New("template").Parse(htmltemplate) - if err != nil { - panic(err) - } - - var out bytes.Buffer - if len(a.Id) == 0 { - err = tmpl.Execute(&out, l) - } else { - err = tmpl.Execute(&out, a) - } - - if err != nil { - panic(err) - } - - return out.String() -} - -func InitAds() []AdConfig { - return []AdConfig{ - {Title: "First Ad", Id: "1", Price: "5€", Category: "Klimbim", Text: "Thing to sale", Slug: "first-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - {Title: "Secnd Ad", Id: "2", Price: "5€", Category: "Kram", Text: "Thing to sale", Slug: "second-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - {Title: "Third Ad", Id: "3", Price: "5€", Category: "Kuddelmuddel", Text: "Thing to sale", Slug: "third-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - {Title: "Forth Ad", Id: "4", Price: "5€", Category: "Krempel", Text: "Thing to sale", Slug: "fourth-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - {Title: "Fifth Ad", Id: "5", Price: "5€", Category: "Kladderadatsch", Text: "Thing to sale", Slug: "fifth-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - {Title: "Sixth Ad", Id: "6", Price: "5€", Category: "Klunker", Text: "Thing to sale", Slug: "sixth-ad", - Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, - } -} - -// Initialize the valid sources for the httpmock responder -func InitValidSources(conf *Config) []Adsource { - // all our valid ads - adsrc := InitAds() - - // valid ad listing page 1 - list1 := []AdConfig{ - adsrc[0], adsrc[1], adsrc[2], - } - - // valid ad listing page 2 - list2 := []AdConfig{ - adsrc[3], adsrc[4], adsrc[5], - } - - // valid ad listing page 3, which is empty - list3 := []AdConfig{} - - // used to signal GetTemplate() to render a listing - empty := AdConfig{} - - // prepare urls for the listing pages - ads := []Adsource{ - { - uri: fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User), - content: GetTemplate(list1, empty, LISTTPL), - }, - { - uri: fmt.Sprintf("%s%s?userId=%d&pageNum=2", Baseuri, Listuri, conf.User), - content: GetTemplate(list2, empty, LISTTPL), - }, - { - uri: fmt.Sprintf("%s%s?userId=%d&pageNum=3", Baseuri, Listuri, conf.User), - content: GetTemplate(list3, empty, LISTTPL), - }, - } - - // prepare urls for the ads - for _, ad := range adsrc { - ads = append(ads, Adsource{ - uri: fmt.Sprintf("%s/s-anzeige/%s/%s", Baseuri, ad.Slug, ad.Id), - content: GetTemplate(nil, ad, ADTPL), - }) - //panic(GetTemplate(nil, ad, ADTPL)) - } - - return ads -} - -func InitInvalidSources(conf *Config) []Adsource { - empty := AdConfig{} - ads := []Adsource{ - { - // valid ad page but without content - uri: fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri), - content: GetTemplate(nil, empty, EMPTYPAGE), - }, - { - // some random foreign webpage - uri: INVALIDURI, - content: GetTemplate(nil, empty, "foo"), - }, - { - // some invalid page path - uri: fmt.Sprintf("%s/anzeige/name/1", Baseuri), - content: GetTemplate(nil, empty, ""), - }, - { - // some none-ad page - uri: fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri), - content: GetTemplate(nil, empty, "HTTP 404: /eine-anzeige/ does not exist!"), - status: 404, - }, - } - - return ads -} - -// load a test image from disk -func GetImage(path string) []byte { - dat, err := os.ReadFile(path) - if err != nil { - panic(err) - } - - return dat -} - -// setup httpmock -func SetIntercept(conf *Config) { - ads := InitValidSources(conf) - eads := InitInvalidSources(conf) - - ads = append(ads, eads...) - - for _, ad := range ads { - if ad.status == 0 { - ad.status = 200 - } - - httpmock.RegisterResponder("GET", ad.uri, - httpmock.NewStringResponder(ad.status, ad.content)) - } - - // we just use 2 images, put this here - for _, image := range []string{"t/1.jpg", "t/2.jpg"} { - httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image))) - } - -} - -// the actual test, calls Start() from scrape, which recursively -// scrapes ads from a user -func TestStart(t *testing.T) { - httpmock.Activate() - defer httpmock.DeactivateAndReset() - - // fake config - conf := &Config{User: 1, Outdir: "t/out", Template: DefaultTemplate} - - // prepare httpmock responders - SetIntercept(conf) - - // run - if err := Start(conf); err != nil { - t.Errorf("failed to scrape: %s", err.Error()) - } - - // verify - for _, ad := range InitAds() { - file := fmt.Sprintf("t/out/%s/Adlisting.txt", ad.Slug) - content, err := os.ReadFile(file) - if err != nil { - t.Errorf("failed to read adlisting: %s", err.Error()) - } - - if !strings.Contains(string(content), ad.Category) && !strings.Contains(string(content), ad.Title) { - t.Errorf("failed to verify: %s content doesn't contain expected data", file) - } - } - - // uncomment to see slogs - //t.Errorf("debug") -} - -func TestSingleFail(t *testing.T) { - httpmock.Activate() - defer httpmock.DeactivateAndReset() - - // fake config - conf := &Config{Outdir: "t/out", Template: DefaultTemplate, Adlinks: []string{EMPTYURI}} - - SetIntercept(conf) - - // check empty ad - if err := Scrape(conf, EMPTYURI); err == nil { - t.Errorf("scrape returned empty ad") - } - - // wrong uri - if err := Scrape(conf, INVALIDURI); err == nil { - t.Errorf("scrape returned ad from invalid web site") - } - - // wrong path - if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1", Baseuri)); err == nil { - t.Errorf("scrape returned ad from invalid page") - } - - // wrong path - if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri)); err == nil { - t.Errorf("scrape returned ad from 404 page") - } -} diff --git a/t/fullconfig.conf b/t/fullconfig.conf new file mode 100644 index 0000000..9265883 --- /dev/null +++ b/t/fullconfig.conf @@ -0,0 +1,6 @@ +user = 1 +loglevel = "verbose" +outdir = "t/out" +template=""" +{{.Title}}{{.Price}}{{.Id}}{{.Category}}{{.Condition}}{{.Created}} +""" diff --git a/t/invalid.conf b/t/invalid.conf new file mode 100644 index 0000000..f7b6536 --- /dev/null +++ b/t/invalid.conf @@ -0,0 +1 @@ +user = "