From e904ed6687ba062a59c2500e4727f29e291843de Mon Sep 17 00:00:00 2001 From: Thomas von Dein Date: Sat, 16 Dec 2023 20:32:10 +0100 Subject: [PATCH] added custom template support, added more ad data, use concurrency --- README.md | 4 +++ config.go | 10 +++--- example.hcl | 5 +++ go.mod | 2 -- kleingebaeck.1 | 13 +++++-- kleingebaeck.go | 8 +++++ kleingebaeck.pod | 7 ++++ main.go | 34 +++++++++++------- scrape.go | 91 +++++++++++++++++++++++++++++++----------------- 9 files changed, 121 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index a6acad0..c1d990e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ [![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE) [![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck) +![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck) +[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest) + This tool can be used to backup ads on the german ad page https://kleinanzeigen.de @@ -48,6 +51,7 @@ Format is simple: user = 1010101 verbose = true outdir = "test" +template = "" ``` ## Usage diff --git a/config.go b/config.go index 140c1d7..1f0641d 100644 --- a/config.go +++ b/config.go @@ -17,14 +17,16 @@ along with this program. If not, see . 
package main import ( - "github.com/hashicorp/hcl/v2/hclsimple" "os" + + "github.com/hashicorp/hcl/v2/hclsimple" ) type Config struct { - Verbose bool `hcl:"verbose"` - User int `hcl:"user"` - Outdir string `hcl:"outdir"` + Verbose bool `hcl:"verbose"` + User int `hcl:"user"` + Outdir string `hcl:"outdir"` + Template string `hcl:"template"` } func ParseConfigfile(file string) (*Config, error) { diff --git a/example.hcl b/example.hcl index 9845efa..7c65e73 100644 --- a/example.hcl +++ b/example.hcl @@ -13,3 +13,8 @@ verbose = true # directory where to store downloaded ads. kleingebaeck will try to # create it. must be a quoted string. outdir = "test" + +# template. leave empty to use the default one, which is: +# Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n +# take care to include exactly 7 times '%s'! +template = "" diff --git a/go.mod b/go.mod index 8e408e3..53b3578 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,6 @@ module kleingebaeck go 1.21 -toolchain go1.21.1 - require ( astuart.co/goq v1.0.0 // indirect github.com/PuerkitoBio/goquery v1.5.0 // indirect diff --git a/kleingebaeck.1 b/kleingebaeck.1 index 8847c93..9e46bce 100644 --- a/kleingebaeck.1 +++ b/kleingebaeck.1 @@ -133,7 +133,7 @@ .\" ======================================================================== .\" .IX Title "KLEINGEBAECK 1" -.TH KLEINGEBAECK 1 "2023-12-15" "1" "User Commands" +.TH KLEINGEBAECK 1 "2023-12-16" "1" "User Commands" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l @@ -169,11 +169,20 @@ You can create a config file to save typing. By default .PP Format is simple: .PP -.Vb 3 +.Vb 4 \& user = 1010101 \& verbose = true \& outdir = "test" +\& template = "" .Ve +.PP +Be careful if you want to change the template. 
The default one looks like this: +.PP +.Vb 1 +\& Title: %s\enPrice: %s\enId: %s\enCategory: %s\enCondition: %s\enCreated: %s\enBody:\en\en%s\en +.Ve +.PP +If you change it, include 7 times the '%s' format tag. .SH "SETUP" .IX Header "SETUP" To setup the tool, you need to lookup your userid on diff --git a/kleingebaeck.go b/kleingebaeck.go index a79a0c1..27691e2 100644 --- a/kleingebaeck.go +++ b/kleingebaeck.go @@ -34,6 +34,14 @@ CONFIGURATION user = 1010101 verbose = true outdir = "test" + template = "" + + Be careful if you want to change the template. The default one looks + like this: + + Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n + + If you change it, include 7 times the '%s' format tag. SETUP To setup the tool, you need to lookup your userid on kleinanzeigen.de. diff --git a/kleingebaeck.pod b/kleingebaeck.pod index 5357682..b104706 100644 --- a/kleingebaeck.pod +++ b/kleingebaeck.pod @@ -34,6 +34,13 @@ Format is simple: user = 1010101 verbose = true outdir = "test" + template = "" + +Be careful if you want to change the template. The default one looks like this: + + Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n + +If you change it, include 7 times the '%s' format tag. =head1 SETUP diff --git a/main.go b/main.go index d4baedf..3d98bca 100644 --- a/main.go +++ b/main.go @@ -20,19 +20,21 @@ package main import ( "errors" "fmt" - "github.com/lmittmann/tint" - flag "github.com/spf13/pflag" "log/slog" "os" "runtime/debug" + + "github.com/lmittmann/tint" + flag "github.com/spf13/pflag" ) -const VERSION string = "0.0.2" +const VERSION string = "0.0.3" const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" const Baseuri string = "https://www.kleinanzeigen.de" const Listuri string = "/s-bestandsliste.html" const Defaultdir string = "." 
+const DefaultTemplate string = "Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n" const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. Usage: kleingebaeck [-dvVhmoc] [,...] @@ -102,6 +104,14 @@ func Main() int { return 0 } + if showmanual { + err := man() + if err != nil { + return Die(err) + } + return 0 + } + conf, err := ParseConfigfile(configfile) if err != nil { return Die(err) @@ -132,14 +142,6 @@ func Main() int { slog.Debug("config", "conf", conf) - if showmanual { - err := man() - if err != nil { - return Die(err) - } - return 0 - } - if len(dir) == 0 { if len(conf.Outdir) > 0 { dir = conf.Outdir @@ -154,10 +156,16 @@ func Main() int { return Die(err) } + // which template to use + template := DefaultTemplate + if len(conf.Template) > 0 { + template = conf.Template + } + // directly backup ad listing[s] if len(flag.Args()) >= 1 { for _, uri := range flag.Args() { - err := Scrape(uri, dir) + err := Scrape(uri, dir, template) if err != nil { return Die(err) } @@ -172,7 +180,7 @@ func Main() int { } if uid > 0 { - err := Start(fmt.Sprintf("%d", uid), dir) + err := Start(fmt.Sprintf("%d", uid), dir, template) if err != nil { return Die(err) } diff --git a/scrape.go b/scrape.go index 43e9655..cca98c6 100644 --- a/scrape.go +++ b/scrape.go @@ -22,10 +22,10 @@ import ( "fmt" "io" "log/slog" + "net/http" "os" "strings" - - "net/http" + "sync" "astuart.co/goq" ) @@ -34,6 +34,29 @@ type Index struct { Links []string `goquery:".text-module-begin a,[href]"` } +type Ad struct { + Title string `goquery:"h1"` + Slug string + Id string + Condition string + Category string + Price string `goquery:"h2#viewad-price"` + Created string `goquery:"#viewad-extra-info,text"` + Text string `goquery:"p#viewad-description-text,html"` + Images []string `goquery:".galleryimage-element img,[src]"` + Meta []string `goquery:".addetailslist--detail--value,text"` +} + +func (ad *Ad) LogValue() slog.Value { + return 
slog.GroupValue( + slog.String("title", ad.Title), + slog.String("price", ad.Price), + slog.String("id", ad.Id), + slog.Int("imagecount", len(ad.Images)), + slog.Int("bodysize", len(ad.Text)), + ) +} + // fetch some web page content func Get(uri string, client *http.Client) (io.ReadCloser, error) { req, err := http.NewRequest("GET", uri, nil) @@ -56,7 +79,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) { // extract links from all ad listing pages (that is: use pagination) // and scrape every page -func Start(uid string, dir string) error { +func Start(uid string, dir string, template string) error { client := &http.Client{} adlinks := []string{} @@ -96,7 +119,7 @@ func Start(uid string, dir string) error { } for _, adlink := range adlinks { - err := Scrape(Baseuri+adlink, dir) + err := Scrape(Baseuri+adlink, dir, template) if err != nil { return err } @@ -105,27 +128,8 @@ func Start(uid string, dir string) error { return nil } -type Ad struct { - Title string `goquery:"h1"` - Slug string - Id string - Text string `goquery:"p#viewad-description-text,html"` - Images []string `goquery:".galleryimage-element img,[src]"` - Price string `goquery:"h2#viewad-price"` -} - -func (ad *Ad) LogValue() slog.Value { - return slog.GroupValue( - slog.String("title", ad.Title), - slog.String("price", ad.Price), - slog.String("id", ad.Id), - slog.Int("imagecount", len(ad.Images)), - slog.Int("bodysize", len(ad.Text)), - ) -} - // scrape an ad. 
uri is the full uri of the ad, dir is the basedir -func Scrape(uri string, dir string) error { +func Scrape(uri string, dir string, template string) error { client := &http.Client{} ad := &Ad{} @@ -150,6 +154,10 @@ func Scrape(uri string, dir string) error { if err != nil { return err } + if len(ad.Meta) == 2 { + ad.Category = ad.Meta[0] + ad.Condition = ad.Meta[1] + } slog.Debug("extracted ad listing", "ad", ad) // prepare output dir @@ -167,26 +175,45 @@ func Scrape(uri string, dir string) error { } ad.Text = strings.ReplaceAll(ad.Text, "
", "\n") - _, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n", - ad.Title, ad.Price, ad.Id, ad.Text) + _, err = fmt.Fprintf(f, template, + ad.Title, ad.Price, ad.Id, ad.Category, ad.Condition, ad.Created, ad.Text) if err != nil { return err } slog.Info("wrote ad listing", "listingfile", listingfile) + return ScrapeImages(dir, ad) +} + +func ScrapeImages(dir string, ad *Ad) error { // fetch images img := 1 + var wg sync.WaitGroup + wg.Add(len(ad.Images)) + failure := make(chan string) + for _, imguri := range ad.Images { file := fmt.Sprintf("%s/%d.jpg", dir, img) - err := Getimage(imguri, file) - if err != nil { - return err - } - slog.Info("wrote ad image", "image", file) - + go func() { + defer wg.Done() + err := Getimage(imguri, file) + if err != nil { + failure <- err.Error() + return + } + slog.Info("wrote ad image", "image", file) + }() img++ } + close(failure) + wg.Wait() + goterr := <-failure + + if goterr != "" { + return errors.New(goterr) + } + return nil }