mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-17 04:21:00 +01:00
fix #117: use a generic attribute parser, still support fixed attrs
This commit is contained in:
66
ad.go
66
ad.go
@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -33,6 +34,10 @@ type Ad struct {
|
|||||||
ID string
|
ID string
|
||||||
Details string `goquery:".addetailslist--detail,text"`
|
Details string `goquery:".addetailslist--detail,text"`
|
||||||
Attributes map[string]string // processed afterwards
|
Attributes map[string]string // processed afterwards
|
||||||
|
Condition string // post processed from details for backward compatibility
|
||||||
|
Type string // post processed from details for backward compatibility
|
||||||
|
Color string // post processed from details for backward compatibility
|
||||||
|
Material string // post processed from details for backward compatibility
|
||||||
Category string
|
Category string
|
||||||
CategoryTree []string `goquery:".breadcrump-link,text"`
|
CategoryTree []string `goquery:".breadcrump-link,text"`
|
||||||
Price string `goquery:"h2#viewad-price"`
|
Price string `goquery:"h2#viewad-price"`
|
||||||
@@ -80,3 +85,64 @@ func (ad *Ad) CalculateExpire() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Decode attributes like color or condition. See
|
||||||
|
https://github.com/TLINDEN/kleingebaeck/issues/117
|
||||||
|
for more details. In short: the HTML delivered by
|
||||||
|
kleinanzeigen.de has no css attribute for the keys
|
||||||
|
so we cannot extract key=>value mappings of the
|
||||||
|
ad details but have to parse them manually.
|
||||||
|
|
||||||
|
The ad.Details member contains this after goq run:
|
||||||
|
|
||||||
|
Art
|
||||||
|
|
||||||
|
Weitere Kinderzimmermöbel
|
||||||
|
|
||||||
|
Farbe
|
||||||
|
Holz
|
||||||
|
|
||||||
|
Zustand
|
||||||
|
In Ordnung
|
||||||
|
|
||||||
|
We parse this into ad.Attributes and fill in some
|
||||||
|
static members for backward compatibility reasons.
|
||||||
|
*/
|
||||||
|
func (ad *Ad) DecodeAttributes() {
|
||||||
|
rd := strings.NewReader(ad.Details)
|
||||||
|
scanner := bufio.NewScanner(rd)
|
||||||
|
|
||||||
|
isattr := true
|
||||||
|
attr := ""
|
||||||
|
attrmap := map[string]string{}
|
||||||
|
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if isattr {
|
||||||
|
attr = line
|
||||||
|
} else {
|
||||||
|
attrmap[attr] = line
|
||||||
|
}
|
||||||
|
|
||||||
|
isattr = !isattr
|
||||||
|
}
|
||||||
|
|
||||||
|
ad.Attributes = attrmap
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case Exists(ad.Attributes, "Zustand"):
|
||||||
|
ad.Condition = ad.Attributes["Zustand"]
|
||||||
|
case Exists(ad.Attributes, "Farbe"):
|
||||||
|
ad.Color = ad.Attributes["Farbe"]
|
||||||
|
case Exists(ad.Attributes, "Art"):
|
||||||
|
ad.Type = ad.Attributes["Type"]
|
||||||
|
case Exists(ad.Attributes, "Material"):
|
||||||
|
ad.Material = ad.Attributes["Material"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
12
config.go
12
config.go
@@ -34,17 +34,25 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
VERSION string = "0.3.15"
|
VERSION string = "0.3.16"
|
||||||
Baseuri string = "https://www.kleinanzeigen.de"
|
Baseuri string = "https://www.kleinanzeigen.de"
|
||||||
Listuri string = "/s-bestandsliste.html"
|
Listuri string = "/s-bestandsliste.html"
|
||||||
Defaultdir string = "."
|
Defaultdir string = "."
|
||||||
|
|
||||||
|
/*
|
||||||
|
Also possible: loop through .Attributes:
|
||||||
|
|
||||||
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" +
|
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" +
|
||||||
"Category: {{.Category}}\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\n{{ end }}" +
|
"Category: {{.Category}}\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\n{{ end }}" +
|
||||||
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
|
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
|
||||||
|
|
||||||
|
*/
|
||||||
|
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" +
|
||||||
|
"Category: {{.Category}}\nCondition: {{.Condition}}\nType: {{.Type}}\nColor: {{.Color}}\n" +
|
||||||
|
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
|
||||||
|
|
||||||
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.ID}}\r\n" +
|
DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.ID}}\r\n" +
|
||||||
"Category: {{.Category}}\r\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\r\n{{ end }}\r\n" +
|
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nType: {{.Type}}\r\nColor: {{.Color}}\r\n" +
|
||||||
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
|
"Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n"
|
||||||
|
|
||||||
DefaultUserAgent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
|
DefaultUserAgent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
|
||||||
|
|||||||
38
example.conf
38
example.conf
@@ -15,17 +15,31 @@ loglevel = "verbose"
|
|||||||
# create it. must be a quoted string.
|
# create it. must be a quoted string.
|
||||||
outdir = "test"
|
outdir = "test"
|
||||||
|
|
||||||
# template for stored adlistings. To enable it, remove the comment
|
# template for stored adlistings.
|
||||||
# chars up until the last #"""
|
template="""
|
||||||
#template="""
|
Title: {{.Title}}
|
||||||
#Title: {{.Title}}
|
Price: {{.Price}}
|
||||||
#Price: {{.Price}}
|
Id: {{.ID}}
|
||||||
#Id: {{.Id}}
|
Category: {{.Category}}
|
||||||
#Category: {{.Category}}
|
Condition: {{.Condition}}
|
||||||
#Condition: {{.Attributes.Condition}}
|
Type: {{.Type}}
|
||||||
#Type: {{.Type}}
|
Created: {{.Created}}
|
||||||
#Created: {{.Created}}
|
|
||||||
|
|
||||||
#{{.Text}}
|
{{.Text}}
|
||||||
# """
|
"""
|
||||||
|
|
||||||
|
# Ads may contain more attributes than just the Condition. To print
|
||||||
|
# all attributes, loop over all of them:
|
||||||
|
|
||||||
|
template="""
|
||||||
|
Title: {{.Title}}
|
||||||
|
Price: {{.Price}}
|
||||||
|
Id: {{.ID}}
|
||||||
|
Category: {{.Category}}
|
||||||
|
{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}
|
||||||
|
{{ end }}
|
||||||
|
Type: {{.Type}}
|
||||||
|
Created: {{.Created}}
|
||||||
|
|
||||||
|
{{.Text}}
|
||||||
|
"""
|
||||||
|
|||||||
@@ -133,7 +133,7 @@
|
|||||||
.\" ========================================================================
|
.\" ========================================================================
|
||||||
.\"
|
.\"
|
||||||
.IX Title "KLEINGEBAECK 1"
|
.IX Title "KLEINGEBAECK 1"
|
||||||
.TH KLEINGEBAECK 1 "2025-02-06" "1" "User Commands"
|
.TH KLEINGEBAECK 1 "2025-02-10" "1" "User Commands"
|
||||||
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
||||||
.\" way too many mistakes in technical documents.
|
.\" way too many mistakes in technical documents.
|
||||||
.if n .ad l
|
.if n .ad l
|
||||||
@@ -174,7 +174,7 @@ well. We use \s-1TOML\s0 as our configuration language. See
|
|||||||
.PP
|
.PP
|
||||||
Format is pretty simple:
|
Format is pretty simple:
|
||||||
.PP
|
.PP
|
||||||
.Vb 10
|
.Vb 11
|
||||||
\& user = 1010101
|
\& user = 1010101
|
||||||
\& loglevel = verbose
|
\& loglevel = verbose
|
||||||
\& outdir = "test"
|
\& outdir = "test"
|
||||||
@@ -185,8 +185,6 @@ Format is pretty simple:
|
|||||||
\& Id: {{.ID}}
|
\& Id: {{.ID}}
|
||||||
\& Category: {{.Category}}
|
\& Category: {{.Category}}
|
||||||
\& Condition: {{.Condition}}
|
\& Condition: {{.Condition}}
|
||||||
\& Type: {{.Type}}
|
|
||||||
\& Color: {{.Color}}
|
|
||||||
\& Created: {{.Created}}
|
\& Created: {{.Created}}
|
||||||
\&
|
\&
|
||||||
\& {{.Text}}
|
\& {{.Text}}
|
||||||
|
|||||||
@@ -46,8 +46,6 @@ CONFIGURATION
|
|||||||
Id: {{.ID}}
|
Id: {{.ID}}
|
||||||
Category: {{.Category}}
|
Category: {{.Category}}
|
||||||
Condition: {{.Condition}}
|
Condition: {{.Condition}}
|
||||||
Type: {{.Type}}
|
|
||||||
Color: {{.Color}}
|
|
||||||
Created: {{.Created}}
|
Created: {{.Created}}
|
||||||
|
|
||||||
{{.Text}}
|
{{.Text}}
|
||||||
|
|||||||
@@ -45,8 +45,7 @@ Format is pretty simple:
|
|||||||
Price: {{.Price}}
|
Price: {{.Price}}
|
||||||
Id: {{.ID}}
|
Id: {{.ID}}
|
||||||
Category: {{.Category}}
|
Category: {{.Category}}
|
||||||
{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}
|
Condition: {{.Condition}}
|
||||||
{{ end }}
|
|
||||||
Created: {{.Created}}
|
Created: {{.Created}}
|
||||||
|
|
||||||
{{.Text}}
|
{{.Text}}
|
||||||
|
|||||||
12
main_test.go
12
main_test.go
@@ -283,6 +283,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "second-ad",
|
Slug: "second-ad",
|
||||||
Condition: "Gut",
|
Condition: "Gut",
|
||||||
|
Color: "Lila",
|
||||||
|
Type: "Schoki",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.jpg", "t/2.jpg"},
|
Images: []string{"t/1.jpg", "t/2.jpg"},
|
||||||
},
|
},
|
||||||
@@ -294,6 +296,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "third-ad",
|
Slug: "third-ad",
|
||||||
Condition: "In Ordnung",
|
Condition: "In Ordnung",
|
||||||
|
Color: "Blau",
|
||||||
|
Type: "Auto",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.jpg", "t/2.jpg"},
|
Images: []string{"t/1.jpg", "t/2.jpg"},
|
||||||
},
|
},
|
||||||
@@ -305,6 +309,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "fourth-ad",
|
Slug: "fourth-ad",
|
||||||
Condition: "Neu",
|
Condition: "Neu",
|
||||||
|
Color: "Rot",
|
||||||
|
Type: "Spielzeut",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.jpg", "t/2.jpg"},
|
Images: []string{"t/1.jpg", "t/2.jpg"},
|
||||||
},
|
},
|
||||||
@@ -316,6 +322,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "fifth-ad",
|
Slug: "fifth-ad",
|
||||||
Condition: "Sehr Gut",
|
Condition: "Sehr Gut",
|
||||||
|
Color: "Braun",
|
||||||
|
Type: "Parteibuch",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.jpg", "t/2.jpg"},
|
Images: []string{"t/1.jpg", "t/2.jpg"},
|
||||||
},
|
},
|
||||||
@@ -327,6 +335,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "sixth-ad",
|
Slug: "sixth-ad",
|
||||||
Condition: "Sehr Gut",
|
Condition: "Sehr Gut",
|
||||||
|
Color: "Silber",
|
||||||
|
Type: "Ring",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.jpg", "t/2.jpg"},
|
Images: []string{"t/1.jpg", "t/2.jpg"},
|
||||||
},
|
},
|
||||||
@@ -338,6 +348,8 @@ var adsrc = []AdConfig{
|
|||||||
Text: "Thing to sale",
|
Text: "Thing to sale",
|
||||||
Slug: "seventh-ad",
|
Slug: "seventh-ad",
|
||||||
Condition: "Sehr Gut",
|
Condition: "Sehr Gut",
|
||||||
|
Color: "Gelpb",
|
||||||
|
Type: "Schmuck",
|
||||||
Created: "Yesterday",
|
Created: "Yesterday",
|
||||||
Images: []string{"t/1.png", "t/1.gif", "t/1.webp", "t/1.jpg"},
|
Images: []string{"t/1.png", "t/1.gif", "t/1.webp", "t/1.jpg"},
|
||||||
},
|
},
|
||||||
|
|||||||
33
scrape.go
33
scrape.go
@@ -18,7 +18,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
@@ -125,8 +124,7 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
|
|||||||
return fmt.Errorf("could not extract ad data from page, got empty struct")
|
return fmt.Errorf("could not extract ad data from page, got empty struct")
|
||||||
}
|
}
|
||||||
|
|
||||||
advertisement.Attributes = DecodeAttributes(advertisement.Details)
|
advertisement.DecodeAttributes()
|
||||||
|
|
||||||
advertisement.CalculateExpire()
|
advertisement.CalculateExpire()
|
||||||
|
|
||||||
// prepare ad dir name
|
// prepare ad dir name
|
||||||
@@ -158,35 +156,6 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
|
|||||||
return ScrapeImages(fetch, advertisement, addir)
|
return ScrapeImages(fetch, advertisement, addir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func DecodeAttributes(attributes string) map[string]string {
|
|
||||||
rd := strings.NewReader(attributes)
|
|
||||||
scanner := bufio.NewScanner(rd)
|
|
||||||
|
|
||||||
isattr := true
|
|
||||||
attr := ""
|
|
||||||
attrmap := map[string]string{}
|
|
||||||
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := strings.TrimSpace(scanner.Text())
|
|
||||||
|
|
||||||
if line == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if isattr {
|
|
||||||
attr = line
|
|
||||||
} else {
|
|
||||||
attrmap[attr] = line
|
|
||||||
}
|
|
||||||
|
|
||||||
isattr = !isattr
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println(attributes)
|
|
||||||
|
|
||||||
return attrmap
|
|
||||||
}
|
|
||||||
|
|
||||||
func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error {
|
func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error {
|
||||||
// fetch images
|
// fetch images
|
||||||
img := 1
|
img := 1
|
||||||
|
|||||||
Reference in New Issue
Block a user