added custom template support, added more ad data, use concurrency

This commit is contained in:
2023-12-16 20:32:10 +01:00
parent df6baadc85
commit e904ed6687
9 changed files with 121 additions and 53 deletions

View File

@@ -4,6 +4,9 @@
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE) [![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck) [![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
This tool can be used to backup ads on the german ad page https://kleinanzeigen.de This tool can be used to backup ads on the german ad page https://kleinanzeigen.de
@@ -48,6 +51,7 @@ Format is simple:
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
``` ```
## Usage ## Usage

View File

@@ -17,14 +17,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"github.com/hashicorp/hcl/v2/hclsimple"
"os" "os"
"github.com/hashicorp/hcl/v2/hclsimple"
) )
type Config struct { type Config struct {
Verbose bool `hcl:"verbose"` Verbose bool `hcl:"verbose"`
User int `hcl:"user"` User int `hcl:"user"`
Outdir string `hcl:"outdir"` Outdir string `hcl:"outdir"`
Template string `hcl:"template"`
} }
func ParseConfigfile(file string) (*Config, error) { func ParseConfigfile(file string) (*Config, error) {

View File

@@ -13,3 +13,8 @@ verbose = true
# directory where to store downloaded ads. kleingebaeck will try to # directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string. # create it. must be a quoted string.
outdir = "test" outdir = "test"
# template. leave empty to use the default one, which is:
# Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
# take care to include exactly 7 times '%s'!
template = ""

2
go.mod
View File

@@ -2,8 +2,6 @@ module kleingebaeck
go 1.21 go 1.21
toolchain go1.21.1
require ( require (
astuart.co/goq v1.0.0 // indirect astuart.co/goq v1.0.0 // indirect
github.com/PuerkitoBio/goquery v1.5.0 // indirect github.com/PuerkitoBio/goquery v1.5.0 // indirect

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2023-12-15" "1" "User Commands" .TH KLEINGEBAECK 1 "2023-12-16" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -169,11 +169,20 @@ You can create a config file to save typing. By default
.PP .PP
Format is simple: Format is simple:
.PP .PP
.Vb 3 .Vb 4
\& user = 1010101 \& user = 1010101
\& verbose = true \& verbose = true
\& outdir = "test" \& outdir = "test"
\& template = ""
.Ve .Ve
.PP
Be careful if you want to change the template. The default one looks like this:
.PP
.Vb 1
\& Title: %s\enPrice: %s\enId: %s\enCategory: %s\enCondition: %s\enCreated: %s\enBody:\en\en%s\en
.Ve
.PP
If you change it, make sure it contains the '%s' format tag exactly 7 times.
.SH "SETUP" .SH "SETUP"
.IX Header "SETUP" .IX Header "SETUP"
To setup the tool, you need to lookup your userid on To setup the tool, you need to lookup your userid on

View File

@@ -34,6 +34,14 @@ CONFIGURATION
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks
like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, make sure it contains the '%s' format tag exactly 7 times.
SETUP SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de. To setup the tool, you need to lookup your userid on kleinanzeigen.de.

View File

@@ -34,6 +34,13 @@ Format is simple:
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, make sure it contains the '%s' format tag exactly 7 times.
=head1 SETUP =head1 SETUP

34
main.go
View File

@@ -20,19 +20,21 @@ package main
import ( import (
"errors" "errors"
"fmt" "fmt"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
"log/slog" "log/slog"
"os" "os"
"runtime/debug" "runtime/debug"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
) )
const VERSION string = "0.0.2" const VERSION string = "0.0.3"
const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
const Baseuri string = "https://www.kleinanzeigen.de" const Baseuri string = "https://www.kleinanzeigen.de"
const Listuri string = "/s-bestandsliste.html" const Listuri string = "/s-bestandsliste.html"
const Defaultdir string = "." const Defaultdir string = "."
const DefaultTemplate string = "Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n"
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
@@ -102,6 +104,14 @@ func Main() int {
return 0 return 0
} }
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
conf, err := ParseConfigfile(configfile) conf, err := ParseConfigfile(configfile)
if err != nil { if err != nil {
return Die(err) return Die(err)
@@ -132,14 +142,6 @@ func Main() int {
slog.Debug("config", "conf", conf) slog.Debug("config", "conf", conf)
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
if len(dir) == 0 { if len(dir) == 0 {
if len(conf.Outdir) > 0 { if len(conf.Outdir) > 0 {
dir = conf.Outdir dir = conf.Outdir
@@ -154,10 +156,16 @@ func Main() int {
return Die(err) return Die(err)
} }
// which template to use
template := DefaultTemplate
if len(conf.Template) > 0 {
template = conf.Template
}
// directly backup ad listing[s] // directly backup ad listing[s]
if len(flag.Args()) >= 1 { if len(flag.Args()) >= 1 {
for _, uri := range flag.Args() { for _, uri := range flag.Args() {
err := Scrape(uri, dir) err := Scrape(uri, dir, template)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
@@ -172,7 +180,7 @@ func Main() int {
} }
if uid > 0 { if uid > 0 {
err := Start(fmt.Sprintf("%d", uid), dir) err := Start(fmt.Sprintf("%d", uid), dir, template)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }

View File

@@ -22,10 +22,10 @@ import (
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
"net/http"
"os" "os"
"strings" "strings"
"sync"
"net/http"
"astuart.co/goq" "astuart.co/goq"
) )
@@ -34,6 +34,29 @@ type Index struct {
Links []string `goquery:".text-module-begin a,[href]"` Links []string `goquery:".text-module-begin a,[href]"`
} }
// Ad holds the data collected for a single ad page. Fields carrying a
// goquery struct tag name the CSS selector (and, after the comma, the
// value source) they are taken from; presumably they are filled by the
// goq decoder imported by this file -- the decoding call itself is not
// part of this view, so confirm against Scrape.
type Ad struct {
// headline of the ad, selected from the h1 element
Title string `goquery:"h1"`
// no selector tag: not filled from the page markup, set elsewhere
Slug string
// no selector tag: not filled from the page markup, set elsewhere
Id string
// copied from Meta[1] by Scrape when Meta has exactly two entries
Condition string
// copied from Meta[0] by Scrape when Meta has exactly two entries
Category string
// selected from h2#viewad-price
Price string `goquery:"h2#viewad-price"`
// text content of #viewad-extra-info
Created string `goquery:"#viewad-extra-info,text"`
// inner HTML of p#viewad-description-text; Scrape replaces "<br/>"
// with newlines before writing it out
Text string `goquery:"p#viewad-description-text,html"`
// src attribute of every img below .galleryimage-element
Images []string `goquery:".galleryimage-element img,[src]"`
// text of every .addetailslist--detail--value element; source for
// Category and Condition
Meta []string `goquery:".addetailslist--detail--value,text"`
}
// LogValue renders the ad as a grouped slog value, so that logging an
// *Ad emits a compact summary (title, price, id, number of images and
// size of the body text) instead of the complete struct.
func (ad *Ad) LogValue() slog.Value {
	summary := []slog.Attr{
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
	}

	return slog.GroupValue(summary...)
}
// fetch some web page content // fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) { func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil) req, err := http.NewRequest("GET", uri, nil)
@@ -56,7 +79,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func Start(uid string, dir string) error { func Start(uid string, dir string, template string) error {
client := &http.Client{} client := &http.Client{}
adlinks := []string{} adlinks := []string{}
@@ -96,7 +119,7 @@ func Start(uid string, dir string) error {
} }
for _, adlink := range adlinks { for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir) err := Scrape(Baseuri+adlink, dir, template)
if err != nil { if err != nil {
return err return err
} }
@@ -105,27 +128,8 @@ func Start(uid string, dir string) error {
return nil return nil
} }
type Ad struct {
Title string `goquery:"h1"`
Slug string
Id string
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Price string `goquery:"h2#viewad-price"`
}
func (ad *Ad) LogValue() slog.Value {
return slog.GroupValue(
slog.String("title", ad.Title),
slog.String("price", ad.Price),
slog.String("id", ad.Id),
slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)),
)
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string) error { func Scrape(uri string, dir string, template string) error {
client := &http.Client{} client := &http.Client{}
ad := &Ad{} ad := &Ad{}
@@ -150,6 +154,10 @@ func Scrape(uri string, dir string) error {
if err != nil { if err != nil {
return err return err
} }
if len(ad.Meta) == 2 {
ad.Category = ad.Meta[0]
ad.Condition = ad.Meta[1]
}
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
// prepare output dir // prepare output dir
@@ -167,26 +175,45 @@ func Scrape(uri string, dir string) error {
} }
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n") ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n", _, err = fmt.Fprintf(f, template,
ad.Title, ad.Price, ad.Id, ad.Text) ad.Title, ad.Price, ad.Id, ad.Category, ad.Condition, ad.Created, ad.Text)
if err != nil { if err != nil {
return err return err
} }
slog.Info("wrote ad listing", "listingfile", listingfile) slog.Info("wrote ad listing", "listingfile", listingfile)
return ScrapeImages(dir, ad)
}
func ScrapeImages(dir string, ad *Ad) error {
// fetch images // fetch images
img := 1 img := 1
var wg sync.WaitGroup
wg.Add(len(ad.Images))
failure := make(chan string)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
file := fmt.Sprintf("%s/%d.jpg", dir, img) file := fmt.Sprintf("%s/%d.jpg", dir, img)
go func() {
defer wg.Done()
err := Getimage(imguri, file) err := Getimage(imguri, file)
if err != nil { if err != nil {
return err failure <- err.Error()
return
} }
slog.Info("wrote ad image", "image", file) slog.Info("wrote ad image", "image", file)
}()
img++ img++
} }
close(failure)
wg.Wait()
goterr := <-failure
if goterr != "" {
return errors.New(goterr)
}
return nil return nil
} }