From 9cd1fc059654d84d8650a50d78e033ad5e04598d Mon Sep 17 00:00:00 2001 From: Thomas von Dein Date: Wed, 24 Jan 2024 19:19:31 +0100 Subject: [PATCH] behavior changes: UserAgent configurable, test cookies, check errors --- config.go | 16 +++++++++++----- fetch.go | 30 ++++++++++++++---------------- kleingebaeck.1 | 5 +++-- kleingebaeck.go | 1 + kleingebaeck.pod | 1 + main.go | 5 ++++- main_test.go | 8 ++++++-- store_test.go | 37 +++++++++++++++++++++++++++++++++++++ 8 files changed, 77 insertions(+), 26 deletions(-) create mode 100644 store_test.go diff --git a/config.go b/config.go index 31309ee..e6004a8 100644 --- a/config.go +++ b/config.go @@ -35,18 +35,22 @@ import ( ) const ( - VERSION string = "0.3.0" - Baseuri string = "https://www.kleinanzeigen.de" - Listuri string = "/s-bestandsliste.html" - Defaultdir string = "." + VERSION string = "0.3.1" + Baseuri string = "https://www.kleinanzeigen.de" + Listuri string = "/s-bestandsliste.html" + Defaultdir string = "." + DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" + "Category: {{.Category}}\nCondition: {{.Condition}}\n" + "Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n" + DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.Id}}\r\n" + "Category: {{.Category}}\r\nCondition: {{.Condition}}\r\n" + "Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n" - Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + + + DefaultUserAgent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + DefaultAdNameTemplate string = "{{.Slug}}" // for image download throttling @@ -88,6 +92,7 @@ type Config struct { Limit int `koanf:"limit"` IgnoreErrors bool `koanf:"ignoreerrors"` ForceDownload bool `koanf:"force"` + UserAgent string `koanf:"useragent"` // conf only Adlinks []string StatsCountAds int StatsCountImages int @@ -118,6 +123,7 @@ func InitConfig(w io.Writer) 
(*Config, error) { "loglevel": "notice", "userid": 0, "adnametemplate": DefaultAdNameTemplate, + "useragent": DefaultUserAgent, }, "."), nil); err != nil { return nil, err } diff --git a/fetch.go b/fetch.go index 59f79d4..2160ae7 100644 --- a/fetch.go +++ b/fetch.go @@ -20,7 +20,6 @@ package main import ( "errors" "io" - "log" "log/slog" "net/http" "net/http/cookiejar" @@ -29,28 +28,26 @@ import ( // convenient wrapper to fetch some web content type Fetcher struct { - Config *Config - Client *http.Client - Useragent string // FIXME: make configurable - Cookies []*http.Cookie + Config *Config + Client *http.Client + Cookies []*http.Cookie } -func NewFetcher(c *Config) *Fetcher { +func NewFetcher(c *Config) (*Fetcher, error) { jar, err := cookiejar.New(nil) if err != nil { - // cannot return error here, FIXME - log.Fatalf("Got error while creating cookie jar %s", err.Error()) + return nil, err } return &Fetcher{ - Client: &http.Client{ - Transport: &loggingTransport{}, // implemented in http.go - Jar: jar, + Client: &http.Client{ + Transport: &loggingTransport{}, // implemented in http.go + Jar: jar, + }, + Config: c, + Cookies: []*http.Cookie{}, }, - Useragent: Useragent, // default in config.go - Config: c, - Cookies: []*http.Cookie{}, - } + nil } func (f *Fetcher) Get(uri string) (io.ReadCloser, error) { @@ -59,7 +56,7 @@ func (f *Fetcher) Get(uri string) (io.ReadCloser, error) { return nil, err } - req.Header.Set("User-Agent", f.Useragent) + req.Header.Set("User-Agent", f.Config.UserAgent) if len(f.Cookies) > 0 { uriobj, _ := url.Parse(Baseuri) @@ -79,6 +76,7 @@ func (f *Fetcher) Get(uri string) (io.ReadCloser, error) { return nil, errors.New("could not get page via HTTP") } + slog.Debug("got cookies?", "cookies", res.Cookies()) f.Cookies = res.Cookies() return res.Body, nil diff --git a/kleingebaeck.1 b/kleingebaeck.1 index d8b4bec..b38c4f5 100644 --- a/kleingebaeck.1 +++ b/kleingebaeck.1 @@ -133,7 +133,7 @@ .\" 
======================================================================== .\" .IX Title "KLEINGEBAECK 1" -.TH KLEINGEBAECK 1 "2024-01-22" "1" "User Commands" +.TH KLEINGEBAECK 1 "2024-01-24" "1" "User Commands" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l @@ -174,10 +174,11 @@ well. We use \s-1TOML\s0 as our configuration language. See .PP Format is pretty simple: .PP -.Vb 10 +.Vb 11 \& user = 1010101 \& loglevel = verbose \& outdir = "test" +\& useragent = "Mozilla/5.0" \& template = """ \& Title: {{.Title}} \& Price: {{.Price}} diff --git a/kleingebaeck.go b/kleingebaeck.go index bf510b1..5da890d 100644 --- a/kleingebaeck.go +++ b/kleingebaeck.go @@ -39,6 +39,7 @@ CONFIGURATION user = 1010101 loglevel = verbose outdir = "test" + useragent = "Mozilla/5.0" template = """ Title: {{.Title}} Price: {{.Price}} diff --git a/kleingebaeck.pod b/kleingebaeck.pod index 664056b..a3cf39b 100644 --- a/kleingebaeck.pod +++ b/kleingebaeck.pod @@ -39,6 +39,7 @@ Format is pretty simple: user = 1010101 loglevel = verbose outdir = "test" + useragent = "Mozilla/5.0" template = """ Title: {{.Title}} Price: {{.Price}} diff --git a/main.go b/main.go index 7a4d0be..93857e8 100644 --- a/main.go +++ b/main.go @@ -113,7 +113,10 @@ func Main(w io.Writer) int { } // used for all HTTP requests - fetch := NewFetcher(conf) + fetch, err := NewFetcher(conf) + if err != nil { + return Die(err) + } // randomization needed here and there rand.Seed(time.Now().UnixNano()) diff --git a/main_test.go b/main_test.go index 5075912..20c1aa8 100644 --- a/main_test.go +++ b/main_test.go @@ -21,6 +21,7 @@ import ( "bytes" "errors" "fmt" + "net/http" "os" "strings" "testing" @@ -446,19 +447,22 @@ func GetImage(path string) []byte { // setup httpmock func SetIntercept(ads []Adsource) { + ch := http.Header{} + ch.Add("Set-Cookie", "session=permanent") + for _, ad := range ads { if ad.status == 0 { ad.status = 200 } 
httpmock.RegisterResponder("GET", ad.uri, - httpmock.NewStringResponder(ad.status, ad.content)) + httpmock.NewStringResponder(ad.status, ad.content).HeaderAdd(ch)) } // we just use 2 images, put this here for _, image := range []string{"t/1.jpg", "t/2.jpg"} { httpmock.RegisterResponder("GET", image, - httpmock.NewBytesResponder(200, GetImage(image))) + httpmock.NewBytesResponder(200, GetImage(image)).HeaderAdd(ch)) } } diff --git a/store_test.go b/store_test.go new file mode 100644 index 0000000..90a04b0 --- /dev/null +++ b/store_test.go @@ -0,0 +1,37 @@ +/* +Copyright © 2023-2024 Thomas von Dein + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package main + +import ( + "testing" +) + +// this is a weird thing. WriteImage() is being called in scrape.go +// which is being tested by TestMain() in main_test.go. However, it +// doesn't show up in the coverage report for unknown reasons, so +// here's a single test for it +func TestWriteImage(t *testing.T) { + buf := []byte{1, 2, 3, 4, 5, 6, 7, 8} + file := "t/out/t.jpg" + + err := WriteImage(file, buf) + if err != nil { + t.Errorf("Could not write mock image to %s: %s", file, err.Error()) + } + +}