Compare commits

..

6 Commits

Author SHA1 Message Date
T.v.Dein
d7b13e8a9a Merge pull request #1 from TLINDEN/dev
added custom template support, added more ad data, use concurrency
2023-12-16 20:35:18 +01:00
e904ed6687 added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00
df6baadc85 better sample config 2023-12-16 00:01:29 +01:00
314315a1c6 fix pod entities => markdown 2023-12-15 18:29:42 +01:00
2e83e68f20 fix logo 2023-12-15 18:02:34 +01:00
b5e51b43c9 add logo 2023-12-15 18:00:41 +01:00
11 changed files with 141 additions and 59 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
.github/assets/kleingebaecklogo.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

View File

@@ -1,13 +1,18 @@
## Kleingebäck - kleinanzeigen.de Backup ## Kleingebäck - kleinanzeigen.de Backup
![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png)
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE) [![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck) [![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
This tool can be used to back up ads on the German ad page https://kleinanzeigen.de This tool can be used to back up ads on the German ad page https://kleinanzeigen.de
It downloads all (or only the specified ones) ads of one user into a It downloads all (or only the specified ones) ads of one user into a
directory, each ad into its own subdirectory. The backup will contain directory, each ad into its own subdirectory. The backup will contain
a textfile B<Adlisting.txt> which contains the ad contents as the a textfile `Adlisting.txt` which contains the ad contents as the
title, body, price etc. All images will be downloaded as well. title, body, price etc. All images will be downloaded as well.
The tool doesn't need authentication and doesn't have any The tool doesn't need authentication and doesn't have any
@@ -37,8 +42,8 @@ otherwise backup all ads of the given user.
## Configfile ## Configfile
You can create a config file to save typing. By default You can create a config file to save typing. By default
C<~/.kleingebaeck.hcl> is being used but you can specify one with `~/.kleingebaeck.hcl` is being used but you can specify one with
C<-c> as well. `-c` as well.
Format is simple: Format is simple:
@@ -46,6 +51,7 @@ Format is simple:
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
``` ```
## Usage ## Usage
@@ -56,7 +62,7 @@ in:
https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
The B<XXXXX> part is your userid. The `XXXXX` part is your userid.
Put it into the configfile as outlined above. Also specify an output Put it into the configfile as outlined above. Also specify an output
directory. Then just execute `kleingebaeck`. directory. Then just execute `kleingebaeck`.
@@ -78,7 +84,7 @@ In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github: submit a patch, please open an issue on github:
https://github.com/TLINDEN/kleingebaeck/issues. https://github.com/TLINDEN/kleingebaeck/issues.
Please repeat the failing command with debugging enabled C<-d> and Please repeat the failing command with debugging enabled `-d` and
include the output in the issue. include the output in the issue.
## Copyright and License ## Copyright and License

View File

@@ -17,14 +17,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main package main
import ( import (
"github.com/hashicorp/hcl/v2/hclsimple"
"os" "os"
"github.com/hashicorp/hcl/v2/hclsimple"
) )
// Config holds the settings read from the HCL configuration file; each
// hcl tag names the attribute its field is populated from.
// Template is an optional output format string: when it is empty, Main
// falls back to DefaultTemplate.
type Config struct { type Config struct {
Verbose bool `hcl:"verbose"` Verbose bool `hcl:"verbose"`
User int `hcl:"user"` User int `hcl:"user"`
Outdir string `hcl:"outdir"` Outdir string `hcl:"outdir"`
Template string `hcl:"template"`
} }
func ParseConfigfile(file string) (*Config, error) { func ParseConfigfile(file string) (*Config, error) {

View File

@@ -1,3 +1,20 @@
user = 89056200 #
# kleingebaeck sample configuration file.
# put this to ~/.kleingebaeck.hcl.
#
# Comments start with the '#' character.
# kleinanzeigen.de user-id. must be an unquoted number
user = 00000000
# enable verbose output (same as -v), may be true or false.
verbose = true verbose = true
# directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string.
outdir = "test" outdir = "test"
# template. leave empty to use the default one, which is:
# Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
# make sure to include '%s' exactly 7 times!
template = ""

2
go.mod
View File

@@ -2,8 +2,6 @@ module kleingebaeck
go 1.21 go 1.21
toolchain go1.21.1
require ( require (
astuart.co/goq v1.0.0 // indirect astuart.co/goq v1.0.0 // indirect
github.com/PuerkitoBio/goquery v1.5.0 // indirect github.com/PuerkitoBio/goquery v1.5.0 // indirect

View File

@@ -133,7 +133,7 @@
.\" ======================================================================== .\" ========================================================================
.\" .\"
.IX Title "KLEINGEBAECK 1" .IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2023-12-15" "1" "User Commands" .TH KLEINGEBAECK 1 "2023-12-16" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents. .\" way too many mistakes in technical documents.
.if n .ad l .if n .ad l
@@ -169,11 +169,20 @@ You can create a config file to save typing. By default
.PP .PP
Format is simple: Format is simple:
.PP .PP
.Vb 3 .Vb 4
\& user = 1010101 \& user = 1010101
\& verbose = true \& verbose = true
\& outdir = "test" \& outdir = "test"
\& template = ""
.Ve .Ve
.PP
Be careful if you want to change the template. The default one looks like this:
.PP
.Vb 1
\& Title: %s\enPrice: %s\enId: %s\enCategory: %s\enCondition: %s\enCreated: %s\enBody:\en\en%s\en
.Ve
.PP
If you change it, include the '%s' format tag exactly 7 times.
.SH "SETUP" .SH "SETUP"
.IX Header "SETUP" .IX Header "SETUP"
To setup the tool, you need to lookup your userid on To setup the tool, you need to lookup your userid on

View File

@@ -34,6 +34,14 @@ CONFIGURATION
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks
like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, include the '%s' format tag exactly 7 times.
SETUP SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de. To setup the tool, you need to lookup your userid on kleinanzeigen.de.

View File

@@ -34,6 +34,13 @@ Format is simple:
user = 1010101 user = 1010101
verbose = true verbose = true
outdir = "test" outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks like this:
Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n
If you change it, include the '%s' format tag exactly 7 times.
=head1 SETUP =head1 SETUP

34
main.go
View File

@@ -20,19 +20,21 @@ package main
import ( import (
"errors" "errors"
"fmt" "fmt"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
"log/slog" "log/slog"
"os" "os"
"runtime/debug" "runtime/debug"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
) )
const VERSION string = "0.0.2" const VERSION string = "0.0.3"
const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
const Baseuri string = "https://www.kleinanzeigen.de" const Baseuri string = "https://www.kleinanzeigen.de"
const Listuri string = "/s-bestandsliste.html" const Listuri string = "/s-bestandsliste.html"
const Defaultdir string = "." const Defaultdir string = "."
const DefaultTemplate string = "Title: %s\nPrice: %s\nId: %s\nCategory: %s\nCondition: %s\nCreated: %s\nBody:\n\n%s\n"
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...] Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
@@ -102,6 +104,14 @@ func Main() int {
return 0 return 0
} }
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
conf, err := ParseConfigfile(configfile) conf, err := ParseConfigfile(configfile)
if err != nil { if err != nil {
return Die(err) return Die(err)
@@ -132,14 +142,6 @@ func Main() int {
slog.Debug("config", "conf", conf) slog.Debug("config", "conf", conf)
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
if len(dir) == 0 { if len(dir) == 0 {
if len(conf.Outdir) > 0 { if len(conf.Outdir) > 0 {
dir = conf.Outdir dir = conf.Outdir
@@ -154,10 +156,16 @@ func Main() int {
return Die(err) return Die(err)
} }
// which template to use
template := DefaultTemplate
if len(conf.Template) > 0 {
template = conf.Template
}
// directly backup ad listing[s] // directly backup ad listing[s]
if len(flag.Args()) >= 1 { if len(flag.Args()) >= 1 {
for _, uri := range flag.Args() { for _, uri := range flag.Args() {
err := Scrape(uri, dir) err := Scrape(uri, dir, template)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }
@@ -172,7 +180,7 @@ func Main() int {
} }
if uid > 0 { if uid > 0 {
err := Start(fmt.Sprintf("%d", uid), dir) err := Start(fmt.Sprintf("%d", uid), dir, template)
if err != nil { if err != nil {
return Die(err) return Die(err)
} }

View File

@@ -22,10 +22,10 @@ import (
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
"net/http"
"os" "os"
"strings" "strings"
"sync"
"net/http"
"astuart.co/goq" "astuart.co/goq"
) )
@@ -34,6 +34,29 @@ type Index struct {
Links []string `goquery:".text-module-begin a,[href]"` Links []string `goquery:".text-module-begin a,[href]"`
} }
// Ad describes one scraped ad listing.
//
// Fields with a goquery tag carry the selector they are associated with.
// Category and Condition have no selector of their own; Scrape copies
// them from Meta when Meta holds exactly two entries.
type Ad struct {
	Title     string `goquery:"h1"`
	Slug      string
	Id        string
	Condition string
	Category  string
	Price     string   `goquery:"h2#viewad-price"`
	Created   string   `goquery:"#viewad-extra-info,text"`
	Text      string   `goquery:"p#viewad-description-text,html"`
	Images    []string `goquery:".galleryimage-element img,[src]"`
	Meta      []string `goquery:".addetailslist--detail--value,text"`
}

// LogValue returns a compact slog group for the ad: title, price and id,
// plus the number of images and the byte length of the body text, so the
// full body is never dumped into the log.
func (ad *Ad) LogValue() slog.Value {
	summary := []slog.Attr{
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
	}

	return slog.GroupValue(summary...)
}
// fetch some web page content // fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) { func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil) req, err := http.NewRequest("GET", uri, nil)
@@ -56,7 +79,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination) // extract links from all ad listing pages (that is: use pagination)
// and scrape every page // and scrape every page
func Start(uid string, dir string) error { func Start(uid string, dir string, template string) error {
client := &http.Client{} client := &http.Client{}
adlinks := []string{} adlinks := []string{}
@@ -96,7 +119,7 @@ func Start(uid string, dir string) error {
} }
for _, adlink := range adlinks { for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir) err := Scrape(Baseuri+adlink, dir, template)
if err != nil { if err != nil {
return err return err
} }
@@ -105,27 +128,8 @@ func Start(uid string, dir string) error {
return nil return nil
} }
// Ad describes one scraped ad listing. Fields with a goquery tag carry
// the selector they are associated with; Slug and Id have none.
type Ad struct {
	Title  string `goquery:"h1"`
	Slug   string
	Id     string
	Text   string   `goquery:"p#viewad-description-text,html"`
	Images []string `goquery:".galleryimage-element img,[src]"`
	Price  string   `goquery:"h2#viewad-price"`
}

// LogValue returns a compact slog group for the ad: title, price and id,
// plus the image count and the byte length of the body text.
func (ad *Ad) LogValue() slog.Value {
	imagecount := slog.Int("imagecount", len(ad.Images))
	bodysize := slog.Int("bodysize", len(ad.Text))

	return slog.GroupValue(
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		imagecount,
		bodysize,
	)
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir // scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string) error { func Scrape(uri string, dir string, template string) error {
client := &http.Client{} client := &http.Client{}
ad := &Ad{} ad := &Ad{}
@@ -150,6 +154,10 @@ func Scrape(uri string, dir string) error {
if err != nil { if err != nil {
return err return err
} }
if len(ad.Meta) == 2 {
ad.Category = ad.Meta[0]
ad.Condition = ad.Meta[1]
}
slog.Debug("extracted ad listing", "ad", ad) slog.Debug("extracted ad listing", "ad", ad)
// prepare output dir // prepare output dir
@@ -167,26 +175,45 @@ func Scrape(uri string, dir string) error {
} }
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n") ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n", _, err = fmt.Fprintf(f, template,
ad.Title, ad.Price, ad.Id, ad.Text) ad.Title, ad.Price, ad.Id, ad.Category, ad.Condition, ad.Created, ad.Text)
if err != nil { if err != nil {
return err return err
} }
slog.Info("wrote ad listing", "listingfile", listingfile) slog.Info("wrote ad listing", "listingfile", listingfile)
return ScrapeImages(dir, ad)
}
func ScrapeImages(dir string, ad *Ad) error {
// fetch images // fetch images
img := 1 img := 1
var wg sync.WaitGroup
wg.Add(len(ad.Images))
failure := make(chan string)
for _, imguri := range ad.Images { for _, imguri := range ad.Images {
file := fmt.Sprintf("%s/%d.jpg", dir, img) file := fmt.Sprintf("%s/%d.jpg", dir, img)
err := Getimage(imguri, file) go func() {
if err != nil { defer wg.Done()
return err err := Getimage(imguri, file)
} if err != nil {
slog.Info("wrote ad image", "image", file) failure <- err.Error()
return
}
slog.Info("wrote ad image", "image", file)
}()
img++ img++
} }
close(failure)
wg.Wait()
goterr := <-failure
if goterr != "" {
return errors.New(goterr)
}
return nil return nil
} }