Make ad directory name tunable, adapt to kleinanzeigen.de site changes

Add template for ad name, adapt to kleinanzeigen.de changes
This commit is contained in:
T.v.Dein
2024-01-12 14:56:08 +01:00
committed by GitHub
8 changed files with 132 additions and 48 deletions

25
ad.go
View File

@@ -19,6 +19,7 @@ package main
import ( import (
"log/slog" "log/slog"
"strings"
) )
type Index struct { type Index struct {
@@ -26,16 +27,16 @@ type Index struct {
} }
type Ad struct { type Ad struct {
Title string `goquery:"h1"` Title string `goquery:"h1"`
Slug string Slug string
Id string Id string
Condition string Condition string `goquery:".addetailslist--detail--value,text"`
Category string Category string
Price string `goquery:"h2#viewad-price"` CategoryTree []string `goquery:".breadcrump-link,text"`
Created string `goquery:"#viewad-extra-info,text"` Price string `goquery:"h2#viewad-price"`
Text string `goquery:"p#viewad-description-text,html"` Created string `goquery:"#viewad-extra-info,text"`
Images []string `goquery:".galleryimage-element img,[src]"` Text string `goquery:"p#viewad-description-text,html"`
Meta []string `goquery:".addetailslist--detail--value,text"` Images []string `goquery:".galleryimage-element img,[src]"`
} }
// Used by slog to pretty print an ad // Used by slog to pretty print an ad
@@ -46,6 +47,8 @@ func (ad *Ad) LogValue() slog.Value {
slog.String("id", ad.Id), slog.String("id", ad.Id),
slog.Int("imagecount", len(ad.Images)), slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)), slog.Int("bodysize", len(ad.Text)),
slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
slog.String("condition", ad.Condition),
) )
} }
@@ -58,7 +61,7 @@ func (ad *Ad) LogValue() slog.Value {
// //
// Note: we return true for "ad is incomplete" and false for "ad is complete"! // Note: we return true for "ad is incomplete" and false for "ad is complete"!
func (ad *Ad) Incomplete() bool { func (ad *Ad) Incomplete() bool {
if ad.Category == "" || ad.Condition == "" || ad.Created == "" || ad.Text == "" { if ad.Category == "" || ad.Created == "" || ad.Text == "" {
return true return true
} }

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -33,7 +33,7 @@ import (
) )
const ( const (
VERSION string = "0.1.0" VERSION string = "0.1.1"
Baseuri string = "https://www.kleinanzeigen.de" Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html" Listuri string = "/s-bestandsliste.html"
Defaultdir string = "." Defaultdir string = "."
@@ -43,6 +43,7 @@ const (
"Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n" "Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nCreated: {{.Created}}\r\n\r\n{{.Text}}\r\n"
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
DefaultAdNameTemplate string = "{{.Slug}}-{{.Id}}"
) )
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
@@ -72,6 +73,7 @@ type Config struct {
User int `koanf:"user"` User int `koanf:"user"`
Outdir string `koanf:"outdir"` Outdir string `koanf:"outdir"`
Template string `koanf:"template"` Template string `koanf:"template"`
Adnametemplate string `koanf:"adnametemplate"`
Loglevel string `koanf:"loglevel"` Loglevel string `koanf:"loglevel"`
Limit int `koanf:"limit"` Limit int `koanf:"limit"`
Adlinks []string Adlinks []string
@@ -99,10 +101,11 @@ func InitConfig(w io.Writer) (*Config, error) {
// Load default values using the confmap provider. // Load default values using the confmap provider.
if err := k.Load(confmap.Provider(map[string]interface{}{ if err := k.Load(confmap.Provider(map[string]interface{}{
"template": template, "template": template,
"outdir": ".", "outdir": ".",
"loglevel": "notice", "loglevel": "notice",
"userid": 0, "userid": 0,
"adnametemplate": DefaultAdNameTemplate,
}, "."), nil); err != nil { }, "."), nil); err != nil {
return nil, err return nil, err
} }

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2023-12-19" "1" "User Commands" .TH KLEINGEBAECK 1 "2024-01-12" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -142,7 +142,7 @@
kleingebaeck \- kleinanzeigen.de backup tool kleingebaeck \- kleinanzeigen.de backup tool
.SH "SYNOPSYS" .SH "SYNOPSYS"
.IX Header "SYNOPSYS" .IX Header "SYNOPSYS"
.Vb 10 .Vb 11
\& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...] \& Usage: kleingebaeck [\-dvVhmoc] [<ad\-listing\-url>,...]
\& Options: \& Options:
\& \-\-user \-u <uid> Backup ads from user with uid <uid>. \& \-\-user \-u <uid> Backup ads from user with uid <uid>.
@@ -153,6 +153,7 @@ kleingebaeck \- kleinanzeigen.de backup tool
\& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck). \& \-\-config \-c <file> Use config file <file> (default: ~/.kleingebaeck).
\& \-\-manual \-m Show manual. \& \-\-manual \-m Show manual.
\& \-\-help \-h Show usage. \& \-\-help \-h Show usage.
\& \-\-version \-V Show program version.
.Ve .Ve
.SH "DESCRIPTION" .SH "DESCRIPTION"
.IX Header "DESCRIPTION" .IX Header "DESCRIPTION"
@@ -235,7 +236,20 @@ Also there's currently no parallelization implemented. This will
change in the future. change in the future.
.SH "LICENSE" .SH "LICENSE"
.IX Header "LICENSE" .IX Header "LICENSE"
Licensed under the \s-1GNU GENERAL PUBLIC LICENSE\s0 version 3. Copyright 2023\-2024 Thomas von Dein
.PP
This program is free software: you can redistribute it and/or modify
it under the terms of the \s-1GNU\s0 General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
.PP
This program is distributed in the hope that it will be useful,
but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE.\s0 See the
\&\s-1GNU\s0 General Public License for more details.
.PP
You should have received a copy of the \s-1GNU\s0 General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.SH "Author" .SH "Author"
.IX Header "Author" .IX Header "Author"
T.v.Dein <tom \s-1AT\s0 vondein \s-1DOT\s0 org> T.v.Dein <tom \s-1AT\s0 vondein \s-1DOT\s0 org>

View File

@@ -15,6 +15,7 @@ SYNOPSYS
--config -c <file> Use config file <file> (default: ~/.kleingebaeck). --config -c <file> Use config file <file> (default: ~/.kleingebaeck).
--manual -m Show manual. --manual -m Show manual.
--help -h Show usage. --help -h Show usage.
--version -V Show program version.
DESCRIPTION DESCRIPTION
This tool can be used to backup ads on the german ad page This tool can be used to backup ads on the german ad page
@@ -89,7 +90,20 @@ LIMITATIONS
in the future. in the future.
LICENSE LICENSE
Licensed under the GNU GENERAL PUBLIC LICENSE version 3. Copyright 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
Author Author
T.v.Dein <tom AT vondein DOT org> T.v.Dein <tom AT vondein DOT org>

View File

@@ -96,7 +96,20 @@ change in the future.
=head1 LICENSE =head1 LICENSE
Licensed under the GNU GENERAL PUBLIC LICENSE version 3. Copyright 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=head1 Author =head1 Author

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -60,6 +60,16 @@ const ADTPL string = `DOCTYPE html>
</head> </head>
<body> <body>
<div class="l-container-row">
<div id="vap-brdcrmb" class="breadcrump">
<a class="breadcrump-link" itemprop="url" href="/" title="Kleinanzeigen ">
<span itemprop="title">Kleinanzeigen </span>
</a>
<a class="breadcrump-link" itemprop="url" href="/egal">
<span itemprop="title">{{ .Category }}</span></a>
</div>
</div>
{{ range $image := .Images }} {{ range $image := .Images }}
<div class="galleryimage-element" data-ix="3"> <div class="galleryimage-element" data-ix="3">
<img src="{{ $image }}"/> <img src="{{ $image }}"/>
@@ -79,10 +89,6 @@ const ADTPL string = `DOCTYPE html>
<div class="splitlinebox l-container-row" id="viewad-details"> <div class="splitlinebox l-container-row" id="viewad-details">
<ul class="addetailslist"> <ul class="addetailslist">
<li class="addetailslist--detail">
Art<span class="addetailslist--detail--value" >
{{ .Category }}</span>
</li>
<li class="addetailslist--detail"> <li class="addetailslist--detail">
Zustand<span class="addetailslist--detail--value" > Zustand<span class="addetailslist--detail--value" >
{{ .Condition }}</span> {{ .Condition }}</span>
@@ -438,9 +444,17 @@ func SetIntercept(ads []Adsource) {
} }
func VerifyAd(ad AdConfig) error { func VerifyAd(ad AdConfig) error {
body := ad.Title + ad.Price + ad.Id + ad.Category + ad.Condition + ad.Created body := ad.Title + ad.Price + ad.Id + "Kleinanzeigen => " + ad.Category + ad.Condition + ad.Created
file := fmt.Sprintf("t/out/%s/Adlisting.txt", ad.Slug) // prepare ad dir name using DefaultAdNameTemplate
c := Config{Adnametemplate: DefaultAdNameTemplate}
adstruct := Ad{Slug: ad.Slug, Id: ad.Id}
addir, err := AdDirName(&c, &adstruct)
if err != nil {
return err
}
file := fmt.Sprintf("t/out/%s/Adlisting.txt", addir)
content, err := os.ReadFile(file) content, err := os.ReadFile(file)
if err != nil { if err != nil {
return err return err

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -135,36 +135,37 @@ func Scrape(c *Config, uri string) error {
if err != nil { if err != nil {
return err return err
} }
if len(ad.Meta) == 2 {
ad.Category = ad.Meta[0] if len(ad.CategoryTree) > 0 {
ad.Condition = ad.Meta[1] ad.Category = strings.Join(ad.CategoryTree, " => ")
} }
if ad.Incomplete() { if ad.Incomplete() {
slog.Debug("got ad", "ad", ad)
return errors.New("could not extract ad data from page, got empty struct") return errors.New("could not extract ad data from page, got empty struct")
} }
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
// write listing // write listing
err = WriteAd(c.Outdir, ad, c.Template) addir, err := WriteAd(c, ad)
if err != nil { if err != nil {
return err return err
} }
c.IncrAds() c.IncrAds()
return ScrapeImages(c, ad) return ScrapeImages(c, ad, addir)
} }
func ScrapeImages(c *Config, ad *Ad) error { func ScrapeImages(c *Config, ad *Ad, addir string) error {
// fetch images // fetch images
img := 1 img := 1
g := new(errgroup.Group) g := new(errgroup.Group)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
imguri := imguri imguri := imguri
file := filepath.Join(c.Outdir, ad.Slug, fmt.Sprintf("%d.jpg", img)) file := filepath.Join(c.Outdir, addir, fmt.Sprintf("%d.jpg", img))
g.Go(func() error { g.Go(func() error {
err := Getimage(imguri, file) err := Getimage(imguri, file)
if err != nil { if err != nil {

View File

@@ -1,5 +1,5 @@
/* /*
Copyright © 2023 Thomas von Dein Copyright © 2023-2024 Thomas von Dein
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"bytes"
"io" "io"
"log/slog" "log/slog"
"os" "os"
@@ -27,19 +28,40 @@ import (
tpl "text/template" tpl "text/template"
) )
func WriteAd(dir string, ad *Ad, template string) error { func AdDirName(c *Config, ad *Ad) (string, error) {
// prepare output dir tmpl, err := tpl.New("adname").Parse(c.Adnametemplate)
dir = filepath.Join(dir, ad.Slug)
err := Mkdir(dir)
if err != nil { if err != nil {
return err return "", err
}
buf := bytes.Buffer{}
err = tmpl.Execute(&buf, ad)
if err != nil {
return "", err
}
return buf.String(), nil
}
func WriteAd(c *Config, ad *Ad) (string, error) {
// prepare ad dir name
addir, err := AdDirName(c, ad)
if err != nil {
return "", err
}
// prepare output dir
dir := filepath.Join(c.Outdir, addir)
err = Mkdir(dir)
if err != nil {
return "", err
} }
// write ad file // write ad file
listingfile := filepath.Join(dir, "Adlisting.txt") listingfile := filepath.Join(dir, "Adlisting.txt")
f, err := os.Create(listingfile) f, err := os.Create(listingfile)
if err != nil { if err != nil {
return err return "", err
} }
defer f.Close() defer f.Close()
@@ -49,19 +71,19 @@ func WriteAd(dir string, ad *Ad, template string) error {
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n") ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
} }
tmpl, err := tpl.New("adlisting").Parse(template) tmpl, err := tpl.New("adlisting").Parse(c.Template)
if err != nil { if err != nil {
return err return "", err
} }
err = tmpl.Execute(f, ad) err = tmpl.Execute(f, ad)
if err != nil { if err != nil {
return err return "", err
} }
slog.Info("wrote ad listing", "listingfile", listingfile) slog.Info("wrote ad listing", "listingfile", listingfile)
return nil return addir, nil
} }
func WriteImage(filename string, reader io.ReadCloser) error { func WriteImage(filename string, reader io.ReadCloser) error {