enhancements:

- English README (the German version will be published on the homepage)
- better commandline options
- enhanced logging capabilities and error handling
- config file support
- support for backing up one or more individual ads
- add ID to the ad listing
- added manual page
- fixed config file reading
- fixed typo
This commit is contained in:
2023-12-15 14:50:40 +01:00
parent c2f378be05
commit 1b55d887bc
12 changed files with 621 additions and 65 deletions

View File

@@ -21,6 +21,7 @@ import (
"errors"
"fmt"
"io"
"log/slog"
"os"
"strings"
@@ -42,13 +43,14 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
req.Header.Set("User-Agent", Useragent)
// fmt.Println(uri)
res, err := client.Do(req)
if err != nil {
return nil, err
}
slog.Debug("response", "code", res.StatusCode, "status",
res.Status, "size", res.ContentLength)
return res.Body, nil
}
@@ -56,14 +58,17 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// and scrape every page
func Start(uid string, dir string) error {
client := &http.Client{}
ads := []string{}
adlinks := []string{}
baseuri := Baseuri + Listuri + "?userId=" + uid
page := 1
uri := baseuri
slog.Info("fetching ad pages", "user", uid)
for {
var index Index
slog.Debug("fetching page", "uri", uri)
body, err := Get(uri, client)
if err != nil {
return err
@@ -79,17 +84,19 @@ func Start(uid string, dir string) error {
break
}
slog.Debug("extracted ad links", "count", len(index.Links))
for _, href := range index.Links {
ads = append(ads, href)
fmt.Println(href)
adlinks = append(adlinks, href)
slog.Debug("ad link", "href", href)
}
page++
uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)
}
for _, ad := range ads {
err := Scrape(ad, dir)
for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir)
if err != nil {
return err
}
@@ -99,40 +106,75 @@ func Start(uid string, dir string) error {
}
// Ad holds the data of a single ad. The fields carrying a `goquery` tag
// are populated by the goq decoder from the ad's HTML page; Slug and Id
// carry no tag and are set by Scrape from the path segments of the ad
// URI.
type Ad struct {
	// Title is taken from the page's h1 element.
	Title string `goquery:"h1"`
	// Slug is the URI path segment Scrape uses as the name of the
	// ad's output directory.
	Slug string
	// Id is the ad identifier taken from the URI; it is written to
	// the listing file and included in log output.
	Id string
	// Text is the description, decoded as HTML from
	// p#viewad-description-text (presumably the inner HTML, which is
	// why Scrape replaces <br/> with newlines - confirm against goq).
	Text string `goquery:"p#viewad-description-text,html"`
	// Images collects the src attribute of every gallery image.
	Images []string `goquery:".galleryimage-element img,[src]"`
	// Price is taken from h2#viewad-price.
	Price string `goquery:"h2#viewad-price"`
}
func Scrape(link string, dir string) error {
client := &http.Client{}
uri := Baseuri + link
slurp := strings.Split(uri, "/")[1]
// LogValue implements slog.LogValuer. Rather than dumping the whole ad,
// it reports a compact group of attributes: the title, price and id, the
// number of images and the length of the body text.
func (ad *Ad) LogValue() slog.Value {
	summary := []slog.Attr{
		{Key: "title", Value: slog.StringValue(ad.Title)},
		{Key: "price", Value: slog.StringValue(ad.Price)},
		{Key: "id", Value: slog.StringValue(ad.Id)},
		{Key: "imagecount", Value: slog.IntValue(len(ad.Images))},
		{Key: "bodysize", Value: slog.IntValue(len(ad.Text))},
	}

	return slog.GroupValue(summary...)
}
var ad Ad
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string) error {
client := &http.Client{}
ad := &Ad{}
// extract slug and id from uri
uriparts := strings.Split(uri, "/")
if len(uriparts) < 6 {
return errors.New("invalid uri")
}
ad.Slug = uriparts[4]
ad.Id = uriparts[5]
// get the ad
slog.Debug("fetching ad page", "uri", uri)
body, err := Get(uri, client)
if err != nil {
return err
}
defer body.Close()
// extract ad contents with goquery/goq
err = goq.NewDecoder(body).Decode(&ad)
if err != nil {
return err
}
slog.Debug("extracted ad listing", "ad", ad)
f, err := os.Create(strings.Join([]string{dir, slurp, "Anzeige.txt"}, "/"))
// prepare output dir
dir = dir + "/" + ad.Slug
err = Mkdir(dir)
if err != nil {
return err
}
// write ad file
listingfile := strings.Join([]string{dir, "Adlisting.txt"}, "/")
f, err := os.Create(listingfile)
if err != nil {
return err
}
ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\n\n%s", ad.Title, ad.Price, ad.Text)
_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n",
ad.Title, ad.Price, ad.Id, ad.Text)
if err != nil {
return err
}
slog.Info("wrote ad listing", "listingfile", listingfile)
// fetch images
img := 1
for _, imguri := range ad.Images {
file := fmt.Sprintf("%s/%d.jpg", dir, img)
@@ -140,6 +182,7 @@ func Scrape(link string, dir string) error {
if err != nil {
return err
}
slog.Info("wrote ad image", "image", file)
img++
}
@@ -149,6 +192,7 @@ func Scrape(link string, dir string) error {
// fetch an image
func Getimage(uri, fileName string) error {
slog.Debug("fetching ad image", "uri", uri)
response, err := http.Get(uri)
if err != nil {
return err