Compare commits

...

13 Commits

Author SHA1 Message Date
T.v.Dein
def063afe9 Merge pull request #6 from TLINDEN/dev 2023-12-18 09:23:55 +01:00
f1908f02cb bump version 2023-12-18 09:23:18 +01:00
4a528ad9d1 fix #5: add exe extension to built windows binaries 2023-12-18 09:22:08 +01:00
5c1161f227 fix #4, use filepath.Join to create portable path's 2023-12-18 09:21:26 +01:00
bd9d8fdb2c fix version finding 2023-12-17 17:53:01 +01:00
T.v.Dein
1ee886c504 Merge pull request #2 from TLINDEN/dev
re-orgainzied code a little, using go templates instead format string
2023-12-17 17:49:27 +01:00
f932d7be83 re-orgainzied code a little, using go templates instead format string 2023-12-17 17:32:05 +01:00
T.v.Dein
d7b13e8a9a Merge pull request #1 from TLINDEN/dev
added custom template support, added more ad data, use concurrency
2023-12-16 20:35:18 +01:00
e904ed6687 added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00
df6baadc85 better sample config 2023-12-16 00:01:29 +01:00
314315a1c6 fix pod entities => markdown 2023-12-15 18:29:42 +01:00
2e83e68f20 fix logo 2023-12-15 18:02:34 +01:00
b5e51b43c9 add logo 2023-12-15 18:00:41 +01:00
14 changed files with 255 additions and 90 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
.github/assets/kleingebaecklogo.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

View File

@@ -17,7 +17,7 @@
#
# no need to modify anything below
tool = kleingebaeck
VERSION = $(shell grep VERSION main.go | head -1 | cut -d '"' -f2)
VERSION = $(shell grep VERSION config.go | head -1 | cut -d '"' -f2)
archs = darwin freebsd linux windows
PREFIX = /usr/local
UID = root

View File

@@ -1,13 +1,18 @@
## Kleingebäck - kleinanzeigen.de Backup
![Kleingebaeck Logo](https://github.com/TLINDEN/kleingebaeck/blob/main/.github/assets/kleingebaecklogo-small.png)
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/kleingebaeck/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/kleingebaeck)](https://goreportcard.com/report/github.com/tlinden/kleingebaeck)
![GitHub License](https://img.shields.io/github/license/tlinden/kleingebaeck)
[![GitHub release](https://img.shields.io/github/v/release/tlinden/kleingebaeck?color=%2300a719)](https://github.com/TLINDEN/kleingebaeck/releases/latest)
This tool can be used to backup ads on the German ad page https://kleinanzeigen.de
It downloads all (or only the specified ones) ads of one user into a
directory, each ad into its own subdirectory. The backup will contain
a textfile B<Adlisting.txt> which contains the ad contents as the
a textfile `Adlisting.txt` which contains the ad contents as the
title, body, price etc. All images will be downloaded as well.
The tool doesn't need authentication and doesn't have any
@@ -37,8 +42,8 @@ otherwise backup all ads of the given user.
## Configfile
You can create a config file to save typing. By default
C<~/.kleingebaeck.hcl> is being used but you can specify one with
C<-c> as well.
`~/.kleingebaeck.hcl` is being used but you can specify one with
`-c` as well.
Format is simple:
@@ -46,6 +51,7 @@ Format is simple:
user = 1010101
verbose = true
outdir = "test"
template = ""
```
## Usage
@@ -56,11 +62,32 @@ in:
https://www.kleinanzeigen.de/s-bestandsliste.html?userId=XXXXXX
The B<XXXXX> part is your userid.
The `XXXXX` part is your userid.
Put it into the configfile as outlined above. Also specify an output
directory. Then just execute `kleingebaeck`.
Inside the output directory you'll find a new subdirectory for each
ad. Every directory contains a file `Adlisting.txt`, which will look
somewhat like this:
```default
Title: A book I sell
Price: 99 € VB
Id: 1919191919
Category: Sachbücher
Condition: Sehr Gut
Created: 10.12.2023
This is the description text.
Pay with paypal.
```
You can change the formatting using the `template` config
variable. The supplied sample config contains the default template.
All images will be stored in the same directory.
## Kleingebäck?
@@ -78,7 +105,7 @@ In order to report a bug, unexpected behavior, feature requests or to
submit a patch, please open an issue on github:
https://github.com/TLINDEN/kleingebaeck/issues.
Please repeat the failing command with debugging enabled C<-d> and
Please repeat the failing command with debugging enabled `-d` and
include the output in the issue.
## Copyright und License

View File

@@ -17,14 +17,27 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main
import (
"github.com/hashicorp/hcl/v2/hclsimple"
"os"
"github.com/hashicorp/hcl/v2/hclsimple"
)
// Program-wide constants shared by the command line front end and the scraper.
const (
// Release number of the tool. The Makefile extracts it by grepping this
// file for the constant's name and taking the first match, so the line
// below must remain the first occurrence of that name in the file.
VERSION string = "0.0.5"
// Baseuri is the scheme and host of the site; relative ad links are
// appended to it before scraping.
Baseuri string = "https://www.kleinanzeigen.de"
// Listuri is the path of the ad listing page of a user.
// NOTE(review): presumably combined with Baseuri and a userId query
// parameter when fetching the listing - confirm in Start().
Listuri string = "/s-bestandsliste.html"
// Defaultdir is the current directory.
// NOTE(review): presumably the fallback output directory when neither
// the command line nor the config file names one - confirm in Main().
Defaultdir string = "."
// DefaultTemplate is the text/template source used to render
// Adlisting.txt when the config file does not provide its own template.
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n"
// Useragent is a desktop browser identification string.
// NOTE(review): presumably sent as the User-Agent header of outgoing
// requests - confirm in Get().
Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
// Config holds the settings read from the HCL config file. The hcl tags
// name the corresponding keys in that file.
type Config struct {
// Verbose enables verbose output.
Verbose bool `hcl:"verbose"`
// User is the numeric kleinanzeigen.de user id whose ads are backed up.
User int `hcl:"user"`
// Outdir is the directory where downloaded ads are stored.
Outdir string `hcl:"outdir"`
// Template is an optional text/template source for the ad listing
// file; when empty the built-in default template is used.
Template string `hcl:"template"`
}
func ParseConfigfile(file string) (*Config, error) {

View File

@@ -1,3 +1,19 @@
user = 89056200
#
# kleingebaeck sample configuration file.
# put this to ~/.kleingebaeck.hcl.
#
# Comments start with the '#' character.
# kleinanzeigen.de user-id. must be an unquoted number
user = 00000000
# enable verbose output (same as -v), may be true or false.
verbose = true
# directory where to store downloaded ads. kleingebaeck will try to
# create it. must be a quoted string.
outdir = "test"
# template. leave empty to use the default one, which is:
# "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n"
template = ""

2
go.mod
View File

@@ -2,8 +2,6 @@ module kleingebaeck
go 1.21
toolchain go1.21.1
require (
astuart.co/goq v1.0.0 // indirect
github.com/PuerkitoBio/goquery v1.5.0 // indirect

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2023-12-15" "1" "User Commands"
.TH KLEINGEBAECK 1 "2023-12-17" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -169,11 +169,21 @@ You can create a config file to save typing. By default
.PP
Format is simple:
.PP
.Vb 3
.Vb 4
\& user = 1010101
\& verbose = true
\& outdir = "test"
\& template = ""
.Ve
.PP
Be careful if you want to change the template. The default one looks like this:
.PP
.Vb 1
\& Title: {{.Title}}\enPrice: {{.Price}}\enId: {{.Id}}\enCategory: {{.Category}}\enCondition: {{.Condition}}\enCreated: {{.Created}}\en\en{{.Text}}\en
.Ve
.PP
You can leave out certain fields and use any formatting you like. Refer
to <https://pkg.go.dev/text/template> for details on how to write a template.
.SH "SETUP"
.IX Header "SETUP"
To setup the tool, you need to lookup your userid on

View File

@@ -34,6 +34,16 @@ CONFIGURATION
user = 1010101
verbose = true
outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks
like this:
Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n
You can leave out certain fields and use any formatting you like. Refer
to <https://pkg.go.dev/text/template> for details on how to write a
template.
SETUP
To setup the tool, you need to lookup your userid on kleinanzeigen.de.

View File

@@ -34,6 +34,14 @@ Format is simple:
user = 1010101
verbose = true
outdir = "test"
template = ""
Be careful if you want to change the template. The default one looks like this:
Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.Id}}\nCategory: {{.Category}}\nCondition: {{.Condition}}\nCreated: {{.Created}}\n\n{{.Text}}\n
You can leave out certain fields and use any formatting you like. Refer
to L<https://pkg.go.dev/text/template> for details on how to write a template.
=head1 SETUP

38
main.go
View File

@@ -20,19 +20,13 @@ package main
import (
"errors"
"fmt"
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
"log/slog"
"os"
"runtime/debug"
)
const VERSION string = "0.0.2"
const Useragent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
const Baseuri string = "https://www.kleinanzeigen.de"
const Listuri string = "/s-bestandsliste.html"
const Defaultdir string = "."
"github.com/lmittmann/tint"
flag "github.com/spf13/pflag"
)
const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool.
Usage: kleingebaeck [-dvVhmoc] [<ad-listing-url>,...]
@@ -102,6 +96,14 @@ func Main() int {
return 0
}
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
conf, err := ParseConfigfile(configfile)
if err != nil {
return Die(err)
@@ -132,14 +134,6 @@ func Main() int {
slog.Debug("config", "conf", conf)
if showmanual {
err := man()
if err != nil {
return Die(err)
}
return 0
}
if len(dir) == 0 {
if len(conf.Outdir) > 0 {
dir = conf.Outdir
@@ -154,10 +148,16 @@ func Main() int {
return Die(err)
}
// which template to use
template := DefaultTemplate
if len(conf.Template) > 0 {
template = conf.Template
}
// directly backup ad listing[s]
if len(flag.Args()) >= 1 {
for _, uri := range flag.Args() {
err := Scrape(uri, dir)
err := Scrape(uri, dir, template)
if err != nil {
return Die(err)
}
@@ -172,7 +172,7 @@ func Main() int {
}
if uid > 0 {
err := Start(fmt.Sprintf("%d", uid), dir)
err := Start(fmt.Sprintf("%d", uid), dir, template)
if err != nil {
return Die(err)
}

View File

@@ -40,6 +40,11 @@ for D in $DIST; do
os=${D/\/*/}
arch=${D/*\//}
binfile="releases/${tool}-${os}-${arch}-${version}"
if test "$os" = "windows"; then
binfile="${binfile}.exe"
fi
tardir="${tool}-${os}-${arch}-${version}"
tarfile="releases/${tool}-${os}-${arch}-${version}.tar.gz"
set -x

109
scrape.go
View File

@@ -22,10 +22,10 @@ import (
"fmt"
"io"
"log/slog"
"os"
"strings"
"net/http"
"path/filepath"
"strings"
"sync"
"astuart.co/goq"
)
@@ -34,6 +34,29 @@ type Index struct {
Links []string `goquery:".text-module-begin a,[href]"`
}
// Ad is the data extracted from a single ad page. Fields carrying a
// goquery tag are filled by the HTML unmarshaller from the given selector;
// the untagged fields are set by the scraping code itself.
type Ad struct {
	// Title is the headline of the ad.
	Title string `goquery:"h1"`
	// Slug and Id identify the ad; Slug is also used as the name of the
	// ad's backup directory.
	Slug string
	Id   string
	// Condition and Category are derived from Meta after unmarshalling.
	Condition string
	Category  string
	// Price as displayed on the page.
	Price string `goquery:"h2#viewad-price"`
	// Created is the content of the extra-info element of the ad.
	Created string `goquery:"#viewad-extra-info,text"`
	// Text is the description body of the ad, still containing HTML.
	Text string `goquery:"p#viewad-description-text,html"`
	// Images are the source URIs of the gallery images.
	Images []string `goquery:".galleryimage-element img,[src]"`
	// Meta holds the raw values of the ad detail list.
	Meta []string `goquery:".addetailslist--detail--value,text"`
}

// LogValue implements slog.LogValuer. It condenses an ad into a small
// group of attributes so that logging it does not dump the whole
// description text and every image URI.
func (ad *Ad) LogValue() slog.Value {
	summary := []slog.Attr{
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.Id),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
	}

	return slog.GroupValue(summary...)
}
// fetch some web page content
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req, err := http.NewRequest("GET", uri, nil)
@@ -56,7 +79,7 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func Start(uid string, dir string) error {
func Start(uid string, dir string, template string) error {
client := &http.Client{}
adlinks := []string{}
@@ -96,7 +119,7 @@ func Start(uid string, dir string) error {
}
for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir)
err := Scrape(Baseuri+adlink, dir, template)
if err != nil {
return err
}
@@ -105,27 +128,8 @@ func Start(uid string, dir string) error {
return nil
}
type Ad struct {
Title string `goquery:"h1"`
Slug string
Id string
Text string `goquery:"p#viewad-description-text,html"`
Images []string `goquery:".galleryimage-element img,[src]"`
Price string `goquery:"h2#viewad-price"`
}
func (ad *Ad) LogValue() slog.Value {
return slog.GroupValue(
slog.String("title", ad.Title),
slog.String("price", ad.Price),
slog.String("id", ad.Id),
slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)),
)
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string) error {
func Scrape(uri string, dir string, template string) error {
client := &http.Client{}
ad := &Ad{}
@@ -150,43 +154,50 @@ func Scrape(uri string, dir string) error {
if err != nil {
return err
}
if len(ad.Meta) == 2 {
ad.Category = ad.Meta[0]
ad.Condition = ad.Meta[1]
}
slog.Debug("extracted ad listing", "ad", ad)
// prepare output dir
dir = dir + "/" + ad.Slug
err = Mkdir(dir)
// write listing
err = WriteAd(dir, ad, template)
if err != nil {
return err
}
// write ad file
listingfile := strings.Join([]string{dir, "Adlisting.txt"}, "/")
f, err := os.Create(listingfile)
if err != nil {
return err
}
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n",
ad.Title, ad.Price, ad.Id, ad.Text)
if err != nil {
return err
}
slog.Info("wrote ad listing", "listingfile", listingfile)
return ScrapeImages(dir, ad)
}
// ScrapeImages downloads all images of ad concurrently and stores them as
// 1.jpg, 2.jpg, ... inside the ad's directory below dir (dir is the base
// output directory, the ad's slug is appended here).
//
// Every download runs in its own goroutine. Each goroutine records its
// result in its own slot of a pre-sized slice, so no channel or lock is
// needed and nothing can block or panic when several downloads fail. The
// function waits for all downloads to finish and returns the combined
// failures, or nil if every image was written.
func ScrapeImages(dir string, ad *Ad) error {
	var wg sync.WaitGroup

	// one slot per image; slots of successful downloads stay nil
	failures := make([]error, len(ad.Images))

	for idx, imguri := range ad.Images {
		file := filepath.Join(dir, ad.Slug, fmt.Sprintf("%d.jpg", idx+1))

		wg.Add(1)

		// hand the per-iteration values over as arguments, loop
		// variables are shared between iterations before go 1.22
		go func(slot int, imguri, file string) {
			defer wg.Done()

			if err := Getimage(imguri, file); err != nil {
				failures[slot] = err
				return
			}

			slog.Info("wrote ad image", "image", file)
		}(idx, imguri, file)
	}

	wg.Wait()

	// errors.Join drops nil entries and returns nil if nothing failed
	return errors.Join(failures...)
}
@@ -203,13 +214,7 @@ func Getimage(uri, fileName string) error {
return errors.New("received non 200 response code")
}
file, err := os.Create(fileName)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, response.Body)
err = WriteImage(fileName, response.Body)
if err != nil {
return err
}

73
store.go Normal file
View File

@@ -0,0 +1,73 @@
/*
Copyright © 2023 Thomas von Dein
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main
import (
"io"
"log/slog"
"os"
"path/filepath"
"strings"
tpl "text/template"
)
// WriteAd renders ad through the given text/template source and stores
// the result as Adlisting.txt in the ad's own directory below dir. The
// directory is created if required.
//
// As a side effect the "<br/>" tags in ad.Text are replaced by newlines
// before rendering.
//
// It returns an error if the template cannot be parsed or executed, or if
// the directory or the listing file cannot be created or written.
func WriteAd(dir string, ad *Ad, template string) error {
	// parse first: a broken template must not leave an empty or
	// truncated listing file behind
	tmpl, err := tpl.New("adlisting").Parse(template)
	if err != nil {
		return err
	}

	// prepare output dir
	dir = filepath.Join(dir, ad.Slug)
	if err := Mkdir(dir); err != nil {
		return err
	}

	// write ad file
	listingfile := filepath.Join(dir, "Adlisting.txt")
	f, err := os.Create(listingfile)
	if err != nil {
		return err
	}

	ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")

	if err := tmpl.Execute(f, ad); err != nil {
		// the execute error is the one worth reporting
		f.Close()
		return err
	}

	// a failing close means the listing may be incomplete on disk
	if err := f.Close(); err != nil {
		return err
	}

	slog.Info("wrote ad listing", "listingfile", listingfile)

	return nil
}
// WriteImage copies everything readable from reader into a newly created
// (or truncated) file named filename.
//
// reader is only read, not closed; closing it remains the caller's job.
// An error is returned if the file cannot be created, if copying fails or
// if the file cannot be closed cleanly, since a failed close can mean the
// data never reached the disk.
func WriteImage(filename string, reader io.ReadCloser) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}

	if _, err := io.Copy(file, reader); err != nil {
		// the copy error is the one worth reporting
		file.Close()
		return err
	}

	return file.Close()
}