From 46be48af38c8e9286f69b2acd9d924f1a18583b3 Mon Sep 17 00:00:00 2001 From: "T.v.Dein" Date: Mon, 10 Feb 2025 18:20:54 +0100 Subject: [PATCH] Generic attributes (#120) * fix #117: use a generic attribute parser, still support fixed attrs --- ad.go | 80 ++++++++++++++++++++++++++++++++++++++++-------- config.go | 10 +++++- example.conf | 38 +++++++++++++++-------- kleingebaeck.1 | 6 ++-- kleingebaeck.go | 2 -- kleingebaeck.pod | 2 -- main_test.go | 12 ++++++++ scrape.go | 13 +------- 8 files changed, 118 insertions(+), 45 deletions(-) diff --git a/ad.go b/ad.go index ac479ba..94fc38c 100644 --- a/ad.go +++ b/ad.go @@ -18,6 +18,7 @@ along with this program. If not, see . package main import ( + "bufio" "log/slog" "strings" "time" @@ -31,10 +32,12 @@ type Ad struct { Title string `goquery:"h1"` Slug string ID string - Details []string `goquery:".addetailslist--detail--value,text"` - Condition string // post processed from details - Type string // post processed from details - Color string // post processed from details + Details string `goquery:".addetailslist--detail,text"` + Attributes map[string]string // processed afterwards + Condition string // post processed from details for backward compatibility + Type string // post processed from details for backward compatibility + Color string // post processed from details for backward compatibility + Material string // post processed from details for backward compatibility Category string CategoryTree []string `goquery:".breadcrump-link,text"` Price string `goquery:"h2#viewad-price"` @@ -53,19 +56,11 @@ func (ad *Ad) LogValue() slog.Value { slog.Int("imagecount", len(ad.Images)), slog.Int("bodysize", len(ad.Text)), slog.String("categorytree", strings.Join(ad.CategoryTree, "+")), - slog.String("condition", ad.Condition), slog.String("created", ad.Created), slog.String("expire", ad.Expire), ) } -// static set of conditions available, used for post processing details -var CONDITIONS = []string{"Neu", "Gut", "Sehr Gut", 
"In Ordnung"} -var COLORS = []string{"Beige", "Blau", "Braun", "Bunt", "Burgunderrot", - "Creme", "Gelb", "Gold", "Grau", "Grün", "Holz", "Khaki", "Lavelndel", - "Lila", "Orange", "Pink", "Print", "Rot", "Schwarz", "Silber", - "Transparent", "Türkis", "Weiß", "Sonstige"} - // check for completeness. I erected these fields to be mandatory // (though I really don't know if they really are). I consider images // and meta optional. So, if either of the checked fields here is @@ -90,3 +85,64 @@ func (ad *Ad) CalculateExpire() { } } } + +/* +Decode attributes like color or condition. See +https://github.com/TLINDEN/kleingebaeck/issues/117 +for more details. In short: the HTML delivered by +kleinanzeigen.de has no css attribute for the keys +so we cannot extract key=>value mappings of the +ad details but have to parse them manually. + +The ad.Details member contains this after goq run: + +Art + + Weitere Kinderzimmermöbel + + Farbe + Holz + + Zustand + In Ordnung + +We parse this into ad.Attributes and fill in some +static members for backward compatibility reasons. 
+*/ +func (ad *Ad) DecodeAttributes() { + rd := strings.NewReader(ad.Details) + scanner := bufio.NewScanner(rd) + + isattr := true + attr := "" + attrmap := map[string]string{} + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + if line == "" { + continue + } + + if isattr { + attr = line + } else { + attrmap[attr] = line + } + + isattr = !isattr + } + + ad.Attributes = attrmap + + switch { + case Exists(ad.Attributes, "Zustand"): + ad.Condition = ad.Attributes["Zustand"] + case Exists(ad.Attributes, "Farbe"): + ad.Color = ad.Attributes["Farbe"] + case Exists(ad.Attributes, "Art"): + ad.Type = ad.Attributes["Type"] + case Exists(ad.Attributes, "Material"): + ad.Material = ad.Attributes["Material"] + } +} diff --git a/config.go b/config.go index 8f6613d..0b1bb84 100644 --- a/config.go +++ b/config.go @@ -34,11 +34,19 @@ import ( ) const ( - VERSION string = "0.3.15" + VERSION string = "0.3.16" Baseuri string = "https://www.kleinanzeigen.de" Listuri string = "/s-bestandsliste.html" Defaultdir string = "." + /* + Also possible: loop through .Attributes: + + DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" + + "Category: {{.Category}}\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\n{{ end }}" + + "Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n" + + */ DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" + "Category: {{.Category}}\nCondition: {{.Condition}}\nType: {{.Type}}\nColor: {{.Color}}\n" + "Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n" diff --git a/example.conf b/example.conf index 9b94fe2..d1433ac 100644 --- a/example.conf +++ b/example.conf @@ -15,17 +15,31 @@ loglevel = "verbose" # create it. must be a quoted string. outdir = "test" -# template for stored adlistings. 
To enable it, remove the comment -# chars up until the last #""" -#template=""" -#Title: {{.Title}} -#Price: {{.Price}} -#Id: {{.Id}} -#Category: {{.Category}} -#Condition: {{.Condition}} -#Type: {{.Type}} -#Created: {{.Created}} +# template for stored adlistings. +template=""" +Title: {{.Title}} +Price: {{.Price}} +Id: {{.ID}} +Category: {{.Category}} +Condition: {{.Condition}} +Type: {{.Type}} +Created: {{.Created}} -#{{.Text}} -# """ +{{.Text}} +""" +# Ads may contain more attributes than just the Condition. To print +# all of them, uncomment this template and remove the one above: + +#template=""" +#Title: {{.Title}} +#Price: {{.Price}} +#Id: {{.ID}} +#Category: {{.Category}} +#{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }} +#{{ end }} +#Type: {{.Type}} +#Created: {{.Created}} +# +#{{.Text}} +#""" diff --git a/kleingebaeck.1 b/kleingebaeck.1 index 96aaf50..0667b43 100644 --- a/kleingebaeck.1 +++ b/kleingebaeck.1 @@ -133,7 +133,7 @@ .\" ======================================================================== .\" .IX Title "KLEINGEBAECK 1" -.TH KLEINGEBAECK 1 "2025-02-06" "1" "User Commands" +.TH KLEINGEBAECK 1 "2025-02-10" "1" "User Commands" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l @@ -174,7 +174,7 @@ well. We use \s-1TOML\s0 as our configuration language. 
See .PP Format is pretty simple: .PP -.Vb 10 +.Vb 11 \& user = 1010101 \& loglevel = verbose \& outdir = "test" @@ -185,8 +185,6 @@ Format is pretty simple: \& Id: {{.ID}} \& Category: {{.Category}} \& Condition: {{.Condition}} -\& Type: {{.Type}} -\& Color: {{.Color}} \& Created: {{.Created}} \& \& {{.Text}} diff --git a/kleingebaeck.go b/kleingebaeck.go index 705fb4a..d15e567 100644 --- a/kleingebaeck.go +++ b/kleingebaeck.go @@ -46,8 +46,6 @@ CONFIGURATION Id: {{.ID}} Category: {{.Category}} Condition: {{.Condition}} - Type: {{.Type}} - Color: {{.Color}} Created: {{.Created}} {{.Text}} diff --git a/kleingebaeck.pod b/kleingebaeck.pod index 7737a30..3b8dfd4 100644 --- a/kleingebaeck.pod +++ b/kleingebaeck.pod @@ -46,8 +46,6 @@ Format is pretty simple: Id: {{.ID}} Category: {{.Category}} Condition: {{.Condition}} - Type: {{.Type}} - Color: {{.Color}} Created: {{.Created}} {{.Text}} diff --git a/main_test.go b/main_test.go index 07b5115..37c9da6 100644 --- a/main_test.go +++ b/main_test.go @@ -283,6 +283,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "second-ad", Condition: "Gut", + Color: "Lila", + Type: "Schoki", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}, }, @@ -294,6 +296,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "third-ad", Condition: "In Ordnung", + Color: "Blau", + Type: "Auto", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}, }, @@ -305,6 +309,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "fourth-ad", Condition: "Neu", + Color: "Rot", + Type: "Spielzeut", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}, }, @@ -316,6 +322,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "fifth-ad", Condition: "Sehr Gut", + Color: "Braun", + Type: "Parteibuch", Created: "Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}, }, @@ -327,6 +335,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "sixth-ad", Condition: "Sehr Gut", + Color: "Silber", + Type: "Ring", Created: 
"Yesterday", Images: []string{"t/1.jpg", "t/2.jpg"}, }, @@ -338,6 +348,8 @@ var adsrc = []AdConfig{ Text: "Thing to sale", Slug: "seventh-ad", Condition: "Sehr Gut", + Color: "Gelpb", + Type: "Schmuck", Created: "Yesterday", Images: []string{"t/1.png", "t/1.gif", "t/1.webp", "t/1.jpg"}, }, diff --git a/scrape.go b/scrape.go index 2774125..2b279e0 100644 --- a/scrape.go +++ b/scrape.go @@ -22,7 +22,6 @@ import ( "fmt" "log/slog" "path/filepath" - "slices" "strconv" "strings" "time" @@ -125,17 +124,7 @@ func ScrapeAd(fetch *Fetcher, uri string) error { return fmt.Errorf("could not extract ad data from page, got empty struct") } - for _, detail := range advertisement.Details { - switch { - case slices.Contains(CONDITIONS, detail): - advertisement.Condition = detail - case slices.Contains(COLORS, detail): - advertisement.Color = detail - default: - advertisement.Type = detail - } - } - + advertisement.DecodeAttributes() advertisement.CalculateExpire() // prepare ad dir name