Mirror of https://codeberg.org/scip/kleingebaeck.git, synced 2025-12-16 12:01:00 +01:00
enhancements:
- English README (the German version will move to the homepage)
- better command-line options
- enhanced logging capabilities and error handling
- config file support
- support for backing up one or more individual ads
- add id to adlisting
- added manual page
- fixed config file reading
- fixed typo
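
The logging items above lean on Go's standard log/slog package, which the diff below uses throughout. As a point of reference only (this setup is not part of the diff, and the DEBUG switch and handler choice are assumptions, not the project's actual configuration), a minimal sketch of how slog debug logging is typically wired up:

	package main

	import (
		"log/slog"
		"os"
	)

	func main() {
		// default to Info; drop to Debug when requested (hypothetical switch)
		level := slog.LevelInfo
		if os.Getenv("DEBUG") != "" {
			level = slog.LevelDebug
		}

		// a plain text handler on stderr; the project may configure this differently
		slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr,
			&slog.HandlerOptions{Level: level})))

		slog.Info("fetching ad pages", "user", "12345")
		slog.Debug("fetching page", "uri", "https://example.com?userId=12345")
	}
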
scrape.go | 74
@@ -21,6 +21,7 @@ import (
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"strings"
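
(The new "errors" import backs the errors.New("invalid uri") check added to Scrape further down.)
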
@@ -42,13 +43,14 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
 	req.Header.Set("User-Agent", Useragent)
 
-	// fmt.Println(uri)
-
 	res, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
 
+	slog.Debug("response", "code", res.StatusCode, "status",
+		res.Status, "size", res.ContentLength)
+
 	return res.Body, nil
 }
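
With slog's default text handler, the new response debug line would render roughly like this (status and size values invented for illustration):

	level=DEBUG msg=response code=200 status="200 OK" size=14231
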
@@ -56,14 +58,17 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
 // and scrape every page
 func Start(uid string, dir string) error {
 	client := &http.Client{}
-	ads := []string{}
+	adlinks := []string{}
 
 	baseuri := Baseuri + Listuri + "?userId=" + uid
 	page := 1
 	uri := baseuri
 
+	slog.Info("fetching ad pages", "user", uid)
+
 	for {
 		var index Index
+		slog.Debug("fetching page", "uri", uri)
		body, err := Get(uri, client)
 		if err != nil {
 			return err
@@ -79,17 +84,19 @@ func Start(uid string, dir string) error {
 			break
 		}
 
+		slog.Debug("extracted ad links", "count", len(index.Links))
+
 		for _, href := range index.Links {
-			ads = append(ads, href)
-			fmt.Println(href)
+			adlinks = append(adlinks, href)
+			slog.Debug("ad link", "href", href)
 		}
 
 		page++
 		uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)
 	}
 
-	for _, ad := range ads {
-		err := Scrape(ad, dir)
+	for _, adlink := range adlinks {
+		err := Scrape(Baseuri+adlink, dir)
 		if err != nil {
 			return err
 		}
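
The rename from ads to adlinks carries a behavioral fix as well: the hrefs collected from the index pages are relative, so Scrape is now handed the absolute address (Baseuri+adlink) instead of the bare link.
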
@@ -99,40 +106,75 @@ func Start(uid string, dir string) error {
 }
 
 type Ad struct {
-	Title string `goquery:"h1"`
+	Title  string   `goquery:"h1"`
+	Slug   string
+	Id     string
 	Text   string   `goquery:"p#viewad-description-text,html"`
 	Images []string `goquery:".galleryimage-element img,[src]"`
 	Price  string   `goquery:"h2#viewad-price"`
 }
 
-func Scrape(link string, dir string) error {
-	client := &http.Client{}
-	uri := Baseuri + link
-	slurp := strings.Split(uri, "/")[1]
-	var ad Ad
+func (ad *Ad) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.String("title", ad.Title),
+		slog.String("price", ad.Price),
+		slog.String("id", ad.Id),
+		slog.Int("imagecount", len(ad.Images)),
+		slog.Int("bodysize", len(ad.Text)),
+	)
+}
+
+// scrape an ad. uri is the full uri of the ad, dir is the basedir
+func Scrape(uri string, dir string) error {
+	client := &http.Client{}
+	ad := &Ad{}
+
+	// extract slug and id from uri
+	uriparts := strings.Split(uri, "/")
+	if len(uriparts) < 6 {
+		return errors.New("invalid uri")
+	}
+	ad.Slug = uriparts[4]
+	ad.Id = uriparts[5]
+
+	// get the ad
+	slog.Debug("fetching ad page", "uri", uri)
 	body, err := Get(uri, client)
 	if err != nil {
 		return err
 	}
 	defer body.Close()
 
+	// extract ad contents with goquery/goq
 	err = goq.NewDecoder(body).Decode(&ad)
 	if err != nil {
 		return err
 	}
+	slog.Debug("extracted ad listing", "ad", ad)
 
-	f, err := os.Create(strings.Join([]string{dir, slurp, "Anzeige.txt"}, "/"))
+	// prepare output dir
+	dir = dir + "/" + ad.Slug
+	err = Mkdir(dir)
+	if err != nil {
+		return err
+	}
+
+	// write ad file
+	listingfile := strings.Join([]string{dir, "Adlisting.txt"}, "/")
+	f, err := os.Create(listingfile)
 	if err != nil {
 		return err
 	}
 
 	ad.Text = strings.ReplaceAll(ad.Text, "<br/>", "\n")
-	_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\n\n%s", ad.Title, ad.Price, ad.Text)
+	_, err = fmt.Fprintf(f, "Title: %s\nPrice: %s\nId: %s\nBody:\n\n%s\n",
+		ad.Title, ad.Price, ad.Id, ad.Text)
 	if err != nil {
 		return err
 	}
+	slog.Info("wrote ad listing", "listingfile", listingfile)
 
 	// fetch images
 	img := 1
 	for _, imguri := range ad.Images {
 		file := fmt.Sprintf("%s/%d.jpg", dir, img)
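
Two of the additions above deserve a note. The LogValue method makes *Ad a slog.LogValuer, so the slog.Debug("extracted ad listing", "ad", ad) call renders the ad as a compact group of fields instead of dumping the raw struct. And the slug/id extraction hinges on the position of path segments after splitting the full URI on "/", which is why the len(uriparts) < 6 guard precedes the indexing. A quick sketch, with an invented URL of the assumed shape <base>/s-anzeige/<slug>/<id>:

	package main

	import (
		"fmt"
		"strings"
	)

	func main() {
		// invented example URL; assumed shape: <base>/s-anzeige/<slug>/<id>
		uri := "https://www.kleinanzeigen.de/s-anzeige/some-item/1234567890"
		parts := strings.Split(uri, "/")
		// parts: "https:" "" "www.kleinanzeigen.de" "s-anzeige" "some-item" "1234567890"
		fmt.Println(parts[4], parts[5]) // prints: some-item 1234567890
	}
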
@@ -140,6 +182,7 @@ func Scrape(link string, dir string) error {
 		if err != nil {
 			return err
 		}
+		slog.Info("wrote ad image", "image", file)
 
 		img++
 	}
@@ -149,6 +192,7 @@ func Scrape(link string, dir string) error {
 
 // fetch an image
 func Getimage(uri, fileName string) error {
+	slog.Debug("fetching ad image", "uri", uri)
 	response, err := http.Get(uri)
 	if err != nil {
 		return err
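
The hunk is cut off here. For orientation only, a generic sketch of how such an image fetch typically completes, streaming the response body into the target file; this is an assumption for illustration, not the project's actual Getimage body:

	package main

	import (
		"io"
		"net/http"
		"os"
	)

	// downloadImage is a hypothetical stand-in showing the common pattern:
	// GET the uri and stream the body straight into the named file.
	func downloadImage(uri, fileName string) error {
		response, err := http.Get(uri)
		if err != nil {
			return err
		}
		defer response.Body.Close()

		file, err := os.Create(fileName)
		if err != nil {
			return err
		}
		defer file.Close()

		_, err = io.Copy(file, response.Body)
		return err
	}

	func main() {
		if err := downloadImage("https://example.com/1.jpg", "1.jpg"); err != nil {
			os.Exit(1)
		}
	}
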