From 0fd9b519d13dc83892a6d43fde20c78ee2b09c2d Mon Sep 17 00:00:00 2001 From: Thomas von Dein Date: Fri, 12 Jan 2024 14:11:02 +0100 Subject: [PATCH] adapt to changes on kleinanzeigen.de: - Meta no longer contained condition and category together because the site removed the category, so fetching (that is, validation) failed. - Now we extract the condition and category directly. - On top of that, category now includes the whole category tree. - Unit tests had to be adjusted for this change. --- ad.go | 25 ++++++++++++++----------- main_test.go | 16 +++++++++++----- scrape.go | 7 ++++--- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/ad.go b/ad.go index a9de619..c0889ae 100644 --- a/ad.go +++ b/ad.go @@ -19,6 +19,7 @@ package main import ( "log/slog" + "strings" ) type Index struct { @@ -26,16 +27,16 @@ type Index struct { } type Ad struct { - Title string `goquery:"h1"` - Slug string - Id string - Condition string - Category string - Price string `goquery:"h2#viewad-price"` - Created string `goquery:"#viewad-extra-info,text"` - Text string `goquery:"p#viewad-description-text,html"` - Images []string `goquery:".galleryimage-element img,[src]"` - Meta []string `goquery:".addetailslist--detail--value,text"` + Title string `goquery:"h1"` + Slug string + Id string + Condition string `goquery:".addetailslist--detail--value,text"` + Category string + CategoryTree []string `goquery:".breadcrump-link,text"` + Price string `goquery:"h2#viewad-price"` + Created string `goquery:"#viewad-extra-info,text"` + Text string `goquery:"p#viewad-description-text,html"` + Images []string `goquery:".galleryimage-element img,[src]"` } // Used by slog to pretty print an ad @@ -46,6 +47,8 @@ func (ad *Ad) LogValue() slog.Value { slog.String("id", ad.Id), slog.Int("imagecount", len(ad.Images)), slog.Int("bodysize", len(ad.Text)), + slog.String("categorytree", strings.Join(ad.CategoryTree, "+")), + slog.String("condition", ad.Condition), ) } @@ -58,7 +61,7 @@ func (ad *Ad) LogValue() 
slog.Value { // // Note: we return true for "ad is incomplete" and false for "ad is complete"! func (ad *Ad) Incomplete() bool { - if ad.Category == "" || ad.Condition == "" || ad.Created == "" || ad.Text == "" { + if ad.Category == "" || ad.Created == "" || ad.Text == "" { return true } diff --git a/main_test.go b/main_test.go index 9b7dc5e..4244175 100644 --- a/main_test.go +++ b/main_test.go @@ -60,6 +60,16 @@ const ADTPL string = `DOCTYPE html> +
+ +
+ {{ range $image := .Images }}
@@ -79,10 +89,6 @@ const ADTPL string = `DOCTYPE html>
    -
  • - Art - {{ .Category }} -
  • Zustand {{ .Condition }} @@ -438,7 +444,7 @@ func SetIntercept(ads []Adsource) { } func VerifyAd(ad AdConfig) error { - body := ad.Title + ad.Price + ad.Id + ad.Category + ad.Condition + ad.Created + body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created // prepare ad dir name using DefaultAdNameTemplate c := Config{Adnametemplate: DefaultAdNameTemplate} diff --git a/scrape.go b/scrape.go index 895fb20..6545b70 100644 --- a/scrape.go +++ b/scrape.go @@ -135,12 +135,13 @@ func Scrape(c *Config, uri string) error { if err != nil { return err } - if len(ad.Meta) == 2 { - ad.Category = ad.Meta[0] - ad.Condition = ad.Meta[1] + + if len(ad.CategoryTree) > 0 { + ad.Category = strings.Join(ad.CategoryTree, " => ") } if ad.Incomplete() { + slog.Debug("got ad", "ad", ad) return errors.New("could not extract ad data from page, got empty struct") }