added custom template support, added more ad data, use concurrency

This commit is contained in:
2023-12-16 20:32:10 +01:00
parent df6baadc85
commit e904ed6687
9 changed files with 121 additions and 53 deletions

View File

@@ -4,6 +4,9 @@
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
This tool can be used to back up ads from the German classifieds site https://kleinanzeigen.de
@@ -48,6 +51,7 @@ Format is simple:
user = 1010101
verbose = true
outdir = "test"
template = ""
```
## Usage

View File

@@ -17,14 +17,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main
import (
"github.com/hashicorp/hcl/v2/hclsimple"
"os"
"github.com/hashicorp/hcl/v2/hclsimple"
)
// Config holds the settings of the configuration file. Every field is
// bound to the config key named in its hcl struct tag.
type Config struct {
// Verbose is the value of the "verbose" key.
// NOTE(review): presumably enables more detailed logging - the code
// evaluating it is not visible here.
Verbose bool `hcl:"verbose"`
// User is the value of the "user" key, the numeric kleinanzeigen.de
// user id whose ads are to be backed up.
User int `hcl:"user"`
// Outdir is the value of the "outdir" key, the directory downloaded
// ads are stored in. Main falls back to it when no directory was
// otherwise specified.
Outdir string `hcl:"outdir"`
// Template is the value of the "template" key, a printf style format
// for the ad listing file. When empty, Main uses DefaultTemplate.
Template string `hcl:"template"`
}
func ParseConfigfile(file string) (*Config, error) {

View File

@@ -13,3 +13,8 @@ verbose = true
# directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string.
outdir = "test"
# template. leave empty to use the default one, which is:
# Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
# make sure your template contains the '%s' placeholder exactly 7 times!
template = ""

2
go.mod
View File

@@ -2,8 +2,6 @@ module kleingebaeck
go 1.21
toolchain go1.21.1
require (
astuart.co/goq v1.0.0 // indirect
github.com/PuerkitoBio/goquery v1.5.0 // indirect

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2023-12-15" "1" "User Commands"
.TH KLEINGEBAECK 1 "2023-12-16" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -169,11 +169,20 @@ You can create a config file to save typing. By default
.PP
Format is simple:
.PP
.Vb 3
.Vb 4
\& user = 1010101
\& verbose = true
\& outdir = "test"
\& template = ""
.Ve
.PP
Be careful if you want to change the template. The default one looks like this:
.PP
.Vb 1
\& Title: %s\enPrice: %s\enId: %s\enCategory: %s\enCondition: %s\enCreated: %s\enBody:\en\en%s\en
.Ve
.PP
If you change it, make sure it contains the '%s' format tag exactly 7 times.
.SH "SETUP"
.IX Header "SETUP"
To setup the tool, you need to lookup your userid on

View File

@@ -34,6 +34,14 @@ CONFIGURATION
user = 1010101
verbose = true
outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks
like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, make sure it contains the '%s' format tag exactly 7 times.
SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de.

View File

@@ -34,6 +34,13 @@ Format is simple:
user = 1010101
verbose = true
outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, make sure it contains the '%s' format tag exactly 7 times.
=head1 SETUP

34
main.go
View File

@@ -20,19 +20,21 @@ package main
import (
"errors"
"fmt"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
"log/slog"
"os"
"runtime/debug"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
)
const VERSION string = "0.0.2"
const VERSION string = "0.0.3"
// Useragent is a desktop Chrome (Windows) user agent string.
// NOTE(review): presumably sent with outgoing HTTP requests - the code
// using it is not visible here.
const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
// Baseuri is the root URL of the site. Start prepends it to the
// relative ad links before handing them to Scrape.
const Baseuri string = "https://www.kleinanzeigen.de"
// Listuri is the path of the ad listing page.
// NOTE(review): presumably appended to Baseuri to list a user's ads -
// confirm against Start.
const Listuri string = "/s-bestandsliste.html"
// Defaultdir is the current directory.
// NOTE(review): presumably the output directory used when none is
// configured - the fallback code is not fully visible here.
const Defaultdir string = "."
// DefaultTemplate is the format of the ad listing file used when the
// config file does not set a template. It contains 7 '%s' verbs which
// Scrape fills, in this order, with title, price, id, category,
// condition, creation date and body text of the ad.
const DefaultTemplate string = "Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n"
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
@@ -102,6 +104,14 @@ func Main() int {
return 0
}
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
conf, err := ParseConfigfile(configfile)
if err != nil {
return Die(err)
@@ -132,14 +142,6 @@ func Main() int {
slog.Debug("config", "conf", conf)
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
if len(dir) == 0 {
if len(conf.Outdir) > 0 {
dir = conf.Outdir
@@ -154,10 +156,16 @@ func Main() int {
return Die(err)
}
// which template to use
template := DefaultTemplate
if len(conf.Template) > 0 {
template = conf.Template
}
// directly backup ad listing[s]
if len(flag.Args()) >= 1 {
for _, uri := range flag.Args() {
err := Scrape(uri, dir)
err := Scrape(uri, dir, template)
if err != nil {
return Die(err)
}
@@ -172,7 +180,7 @@ func Main() int {
}
if uid > 0 {
err := Start(fmt.Sprintf("%d", uid), dir)
err := Start(fmt.Sprintf("%d", uid), dir, template)
if err != nil {
return Die(err)
}

View File

@@ -22,10 +22,10 @@ import (
"fmt"
"io"
"log/slog"
"net/http"
"os"
"strings"
"net/http"
"sync"
"astuart.co/goq"
)
@@ -34,6 +34,29 @@ type Index struct {
Links []string `goquery:".text-module-begin a,[href]"`
}
// Ad holds the data of one scraped ad page. Fields carrying a goquery
// struct tag are bound to the selector given in that tag; the other
// fields are assigned by code.
type Ad struct {
	// Title is bound to the page's h1 element.
	Title string `goquery:"h1"`
	// Slug and Id carry no selector.
	// NOTE(review): presumably derived from the ad uri - the code
	// assigning them is not visible here.
	Slug string
	Id   string
	// Condition and Category are copied from Meta by Scrape when Meta
	// has exactly two entries.
	Condition string
	Category  string
	// Price is bound to the h2#viewad-price element.
	Price string `goquery:"h2#viewad-price"`
	// Created is bound to the text of #viewad-extra-info.
	Created string `goquery:"#viewad-extra-info,text"`
	// Text is bound to the html of the description paragraph. Scrape
	// replaces its <br/> tags with newlines before writing it.
	Text string `goquery:"p#viewad-description-text,html"`
	// Images is bound to the src attributes of the gallery images.
	Images []string `goquery:".galleryimage-element img,[src]"`
	// Meta is bound to the text of the ad detail list values.
	Meta []string `goquery:".addetailslist--detail--value,text"`
}

// LogValue implements slog.LogValuer. An ad is logged as a group made
// of title, price and id plus the number of images and the size of the
// body text instead of the complete struct.
func (ad *Ad) LogValue() slog.Value {
	summary := []slog.Attr{
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
	}

	return slog.GroupValue(summary...)
}
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
@@ -56,7 +79,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func Start(uid string, dir string) error {
func Start(uid string, dir string, template string) error {
client := &http.Client{}
adlinks := []string{}
@@ -96,7 +119,7 @@ func Start(uid string, dir string) error {
}
for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir)
err := Scrape(Baseuri+adlink, dir, template)
if err != nil {
return err
}
@@ -105,27 +128,8 @@ func Start(uid string, dir string) error {
return nil
}
// Ad holds the data of one scraped ad page. Fields carrying a goquery
// struct tag are bound to the selector given in that tag.
type Ad struct {
	// Title is bound to the page's h1 element.
	Title string `goquery:"h1"`
	// Slug and Id carry no selector.
	// NOTE(review): presumably derived from the ad uri - the code
	// assigning them is not visible here.
	Slug string
	Id   string
	// Text is bound to the html of the description paragraph.
	Text string `goquery:"p#viewad-description-text,html"`
	// Images is bound to the src attributes of the gallery images.
	Images []string `goquery:".galleryimage-element img,[src]"`
	// Price is bound to the h2#viewad-price element.
	Price string `goquery:"h2#viewad-price"`
}

// LogValue implements slog.LogValuer: an ad shows up in the log as a
// group of title, price, id, image count and body size.
func (ad *Ad) LogValue() slog.Value {
	imagecount := len(ad.Images)
	bodysize := len(ad.Text)

	group := make([]slog.Attr, 0, 5)
	group = append(group,
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", imagecount),
		slog.Int("bodysize", bodysize),
	)

	return slog.GroupValue(group...)
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string) error {
func Scrape(uri string, dir string, template string) error {
client := &http.Client{}
ad := &Ad{}
@@ -150,6 +154,10 @@ func Scrape(uri string, dir string) error {
if err != nil {
return err
}
if len(ad.Meta) == 2 {
ad.Category = ad.Meta[0]
ad.Condition = ad.Meta[1]
}
slog.Debug("extracted ad listing", "ad", ad)
// prepare output dir
@@ -167,26 +175,45 @@ func Scrape(uri string, dir string) error {
}
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n",
ad.Title, ad.Price, ad.Id, ad.Text)
_, err = fmt.Fprintf(f, template,
ad.Title, ad.Price, ad.Id, ad.Category, ad.Condition, ad.Created, ad.Text)
if err != nil {
return err
}
slog.Info("wrote ad listing", "listingfile", listingfile)
return ScrapeImages(dir, ad)
}
// ScrapeImages downloads all images listed in ad.Images into dir, one
// goroutine per image. The files are named 1.jpg, 2.jpg, ... following
// the order of ad.Images.
//
// It blocks until every download has finished and returns the download
// errors combined with errors.Join, or nil if all images were fetched.
func ScrapeImages(dir string, ad *Ad) error {
	// Buffered with one slot per download: a failing worker can always
	// deliver its error without blocking, although nothing receives
	// before all workers are done.
	failures := make(chan error, len(ad.Images))

	var wg sync.WaitGroup
	wg.Add(len(ad.Images))

	for idx, imguri := range ad.Images {
		file := fmt.Sprintf("%s/%d.jpg", dir, idx+1)

		// Pass uri and file as arguments. go.mod declares go 1.21, where
		// the range variable is shared by all iterations, so a closure
		// capturing it could fetch the wrong image.
		go func(imguri, file string) {
			defer wg.Done()

			if err := Getimage(imguri, file); err != nil {
				failures <- err
				return
			}

			slog.Info("wrote ad image", "image", file)
		}(imguri, file)
	}

	// Close only after all senders are finished: a send on a closed
	// channel panics.
	wg.Wait()
	close(failures)

	var errs []error
	for err := range failures {
		errs = append(errs, err)
	}

	// errors.Join returns nil when there is nothing to report.
	return errors.Join(errs...)
}