/* Copyright © 2023 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ package main import ( "bytes" "fmt" "os" "strings" "testing" tpl "text/template" "github.com/jarcoal/httpmock" ) // used to fill an ad template and the ad listing page template type AdConfig struct { Title string Slug string Id string Price string Category string Condition string Created string Text string Images []string // files in ./t/ } // the ad list, aka: // https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX // Note, that this HTML code is reduced to the max, so that it only // contains the stuff required to satisfy goquery const LISTTPL string = ` Ads {{ range . }}

{{ .Title }}

{{ end }} ` // an actual ad listing, aka: // https://www.kleinanzeigen.de/s-anzeige/ad-text-slug/1010101010 // Note, that this HTML code is reduced to the max, so that it only // contains the stuff required to satisfy goquery const ADTPL string = `DOCTYPE html> Ad Listing {{ range $image := .Images }}
{{ end }}

{{ .Title }}

{{ .Price }}

{{ .Created }}

{{ .Text }}

` const EMPTYPAGE string = `DOCTYPE html> ` const EMPTYURI string = `https://www.kleinanzeigen.de/s-anzeige/empty/1` const INVALIDURI string = `https://foo.bar/weird/things` // An Adsource is used to construct a httpmock responder for a // particular url. So, the code (scrape.go) scrapes // https://kleinanzeigen.de, but in reality httpmock captures the // request and responds with our mock data type Adsource struct { uri string content string status int } // Render a HTML template for an adlisting or an ad func GetTemplate(l []AdConfig, a AdConfig, htmltemplate string) string { tmpl, err := tpl.New("template").Parse(htmltemplate) if err != nil { panic(err) } var out bytes.Buffer if len(a.Id) == 0 { err = tmpl.Execute(&out, l) } else { err = tmpl.Execute(&out, a) } if err != nil { panic(err) } return out.String() } func InitAds() []AdConfig { return []AdConfig{ {Title: "First Ad", Id: "1", Price: "5€", Category: "Klimbim", Text: "Thing to sale", Slug: "first-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, {Title: "Secnd Ad", Id: "2", Price: "5€", Category: "Kram", Text: "Thing to sale", Slug: "second-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, {Title: "Third Ad", Id: "3", Price: "5€", Category: "Kuddelmuddel", Text: "Thing to sale", Slug: "third-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, {Title: "Forth Ad", Id: "4", Price: "5€", Category: "Krempel", Text: "Thing to sale", Slug: "fourth-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, {Title: "Fifth Ad", Id: "5", Price: "5€", Category: "Kladderadatsch", Text: "Thing to sale", Slug: "fifth-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, {Title: "Sixth Ad", Id: "6", Price: "5€", Category: "Klunker", Text: "Thing to sale", Slug: "sixth-ad", Condition: "works", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}}, } } // Initialize the valid sources for the httpmock responder func InitValidSources(conf *Config) []Adsource { // all our valid ads adsrc := InitAds() // valid ad listing page 1 list1 := []AdConfig{ adsrc[0], adsrc[1], adsrc[2], } // valid ad listing page 2 list2 := []AdConfig{ adsrc[3], adsrc[4], adsrc[5], } // valid ad listing page 3, which is empty list3 := []AdConfig{} // used to signal GetTemplate() to render a listing empty := AdConfig{} // prepare urls for the listing pages ads := []Adsource{ { uri: fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User), content: GetTemplate(list1, empty, LISTTPL), }, { uri: fmt.Sprintf("%s%s?userId=%d&pageNum=2", Baseuri, Listuri, conf.User), content: GetTemplate(list2, empty, LISTTPL), }, { uri: fmt.Sprintf("%s%s?userId=%d&pageNum=3", Baseuri, Listuri, conf.User), content: GetTemplate(list3, empty, LISTTPL), }, } // prepare urls for the ads for _, ad := range adsrc { ads = append(ads, Adsource{ uri: fmt.Sprintf("%s/s-anzeige/%s/%s", Baseuri, ad.Slug, ad.Id), content: GetTemplate(nil, ad, ADTPL), }) //panic(GetTemplate(nil, ad, ADTPL)) } return ads } func InitInvalidSources(conf *Config) []Adsource { empty := AdConfig{} ads := []Adsource{ { // valid ad page but without content uri: fmt.Sprintf("%s/s-anzeige/empty/1", Baseuri), content: GetTemplate(nil, empty, EMPTYPAGE), }, { // some random foreign webpage uri: INVALIDURI, content: GetTemplate(nil, empty, "foo"), }, { // some invalid page path uri: fmt.Sprintf("%s/anzeige/name/1", Baseuri), content: GetTemplate(nil, empty, ""), }, { // some none-ad page uri: fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri), content: GetTemplate(nil, empty, "HTTP 404: /eine-anzeige/ does not exist!"), status: 404, }, } return ads } // load a test image from disk func GetImage(path string) []byte { dat, err := os.ReadFile(path) if err != nil { panic(err) } return dat } // setup httpmock func SetIntercept(conf *Config) { ads := InitValidSources(conf) eads := InitInvalidSources(conf) ads = append(ads, eads...) for _, ad := range ads { if ad.status == 0 { ad.status = 200 } httpmock.RegisterResponder("GET", ad.uri, httpmock.NewStringResponder(ad.status, ad.content)) } // we just use 2 images, put this here for _, image := range []string{"t/1.jpg", "t/2.jpg"} { httpmock.RegisterResponder("GET", image, httpmock.NewBytesResponder(200, GetImage(image))) } } // the actual test, calls Start() from scrape, which recursively // scrapes ads from a user func TestStart(t *testing.T) { httpmock.Activate() defer httpmock.DeactivateAndReset() // fake config conf := &Config{User: 1, Outdir: "t/out", Template: DefaultTemplate} // prepare httpmock responders SetIntercept(conf) // run if err := Start(conf); err != nil { t.Errorf("failed to scrape: %s", err.Error()) } // verify for _, ad := range InitAds() { file := fmt.Sprintf("t/out/%s/Adlisting.txt", ad.Slug) content, err := os.ReadFile(file) if err != nil { t.Errorf("failed to read adlisting: %s", err.Error()) } if !strings.Contains(string(content), ad.Category) && !strings.Contains(string(content), ad.Title) { t.Errorf("failed to verify: %s content doesn't contain expected data", file) } } // uncomment to see slogs //t.Errorf("debug") } func TestSingleFail(t *testing.T) { httpmock.Activate() defer httpmock.DeactivateAndReset() // fake config conf := &Config{Outdir: "t/out", Template: DefaultTemplate, Adlinks: []string{EMPTYURI}} SetIntercept(conf) // check empty ad if err := Scrape(conf, EMPTYURI); err == nil { t.Errorf("scrape returned empty ad") } // wrong uri if err := Scrape(conf, INVALIDURI); err == nil { t.Errorf("scrape returned ad from invalid web site") } // wrong path if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1", Baseuri)); err == nil { t.Errorf("scrape returned ad from invalid page") } // wrong path if err := Scrape(conf, fmt.Sprintf("%s/anzeige/name/1/foo/bar", Baseuri)); err == nil { t.Errorf("scrape returned ad from 404 page") } }