mirror of
https://codeberg.org/scip/kleingebaeck.git
synced 2025-12-17 04:21:00 +01:00
put ad code into separate file, enhance error checking
This commit is contained in:
66
ad.go
Normal file
66
ad.go
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
/*
|
||||||
|
Copyright © 2023 Thomas von Dein
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log/slog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Index collects the link targets matched by its goquery tag.
//
// NOTE(review): presumably this is unmarshalled from an ad overview page and
// the "[href]" part of the tag selects the anchor's href attribute as the
// value -- confirm against the HTML unmarshalling library in use.
type Index struct {
	Links []string `goquery:".text-module-begin a,[href]"`
}
|
||||||
|
|
||||||
|
// Ad is a single scraped ad listing.
//
// Fields carrying a goquery tag name the CSS selector their value comes
// from. NOTE(review): the part after the comma ("text", "html", "[src]")
// presumably selects what is extracted from the matched element -- confirm
// against the HTML unmarshalling library in use. Untagged fields are
// assigned by the scraping code.
type Ad struct {
	// Title is taken from the page's h1 element.
	Title string `goquery:"h1"`

	// Slug and Id carry no selector.
	// NOTE(review): presumably derived from the ad URL by the caller -- confirm.
	Slug string
	Id   string

	// Condition and Category carry no selector; the scraper copies them
	// from Meta (Category from Meta[0], Condition from Meta[1]).
	Condition string
	Category  string

	// Price comes from h2#viewad-price. It is not part of the completeness
	// check in Incomplete.
	Price string `goquery:"h2#viewad-price"`

	// Created comes from #viewad-extra-info.
	Created string `goquery:"#viewad-extra-info,text"`

	// Text is the ad description from p#viewad-description-text.
	Text string `goquery:"p#viewad-description-text,html"`

	// Images holds one entry per img inside .galleryimage-element.
	Images []string `goquery:".galleryimage-element img,[src]"`

	// Meta holds the values matched by .addetailslist--detail--value.
	Meta []string `goquery:".addetailslist--detail--value,text"`
}
|
||||||
|
|
||||||
|
// Used by slog to pretty print an ad
|
||||||
|
func (ad *Ad) LogValue() slog.Value {
|
||||||
|
return slog.GroupValue(
|
||||||
|
slog.String("title", ad.Title),
|
||||||
|
slog.String("price", ad.Price),
|
||||||
|
slog.String("id", ad.Id),
|
||||||
|
slog.Int("imagecount", len(ad.Images)),
|
||||||
|
slog.Int("bodysize", len(ad.Text)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// check for completeness. I erected these fields to be mandatory
|
||||||
|
// (though I really don't know if they really are). I consider images
|
||||||
|
// and meta optional. So, if either of the checked fields here is
|
||||||
|
// empty we return an error. All the checked fields are extracted
|
||||||
|
// using goquery. However, I think price is optional since there are
|
||||||
|
// ads for gifts as well.
|
||||||
|
//
|
||||||
|
// Note: we return true for "ad is incomplete" and false for "ad is complete"!
|
||||||
|
func (ad *Ad) Incomplete() bool {
|
||||||
|
if ad.Category == "" || ad.Condition == "" || ad.Created == "" || ad.Text == "" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
36
scrape.go
36
scrape.go
@@ -30,33 +30,6 @@ import (
|
|||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Index struct {
|
|
||||||
Links []string `goquery:".text-module-begin a,[href]"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type Ad struct {
|
|
||||||
Title string `goquery:"h1"`
|
|
||||||
Slug string
|
|
||||||
Id string
|
|
||||||
Condition string
|
|
||||||
Category string
|
|
||||||
Price string `goquery:"h2#viewad-price"`
|
|
||||||
Created string `goquery:"#viewad-extra-info,text"`
|
|
||||||
Text string `goquery:"p#viewad-description-text,html"`
|
|
||||||
Images []string `goquery:".galleryimage-element img,[src]"`
|
|
||||||
Meta []string `goquery:".addetailslist--detail--value,text"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ad *Ad) LogValue() slog.Value {
|
|
||||||
return slog.GroupValue(
|
|
||||||
slog.String("title", ad.Title),
|
|
||||||
slog.String("price", ad.Price),
|
|
||||||
slog.String("id", ad.Id),
|
|
||||||
slog.Int("imagecount", len(ad.Images)),
|
|
||||||
slog.Int("bodysize", len(ad.Text)),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// fetch some web page content
|
// fetch some web page content
|
||||||
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
||||||
req, err := http.NewRequest("GET", uri, nil)
|
req, err := http.NewRequest("GET", uri, nil)
|
||||||
@@ -74,6 +47,10 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
|
|||||||
slog.Debug("response", "code", res.StatusCode, "status",
|
slog.Debug("response", "code", res.StatusCode, "status",
|
||||||
res.Status, "size", res.ContentLength)
|
res.Status, "size", res.ContentLength)
|
||||||
|
|
||||||
|
if res.StatusCode != 200 {
|
||||||
|
return nil, errors.New("could not get page via HTTP")
|
||||||
|
}
|
||||||
|
|
||||||
return res.Body, nil
|
return res.Body, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,6 +139,11 @@ func Scrape(c *Config, uri string) error {
|
|||||||
ad.Category = ad.Meta[0]
|
ad.Category = ad.Meta[0]
|
||||||
ad.Condition = ad.Meta[1]
|
ad.Condition = ad.Meta[1]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ad.Incomplete() {
|
||||||
|
return errors.New("could not extract ad data from page, got empty struct")
|
||||||
|
}
|
||||||
|
|
||||||
slog.Debug("extracted ad listing", "ad", ad)
|
slog.Debug("extracted ad listing", "ad", ad)
|
||||||
|
|
||||||
// write listing
|
// write listing
|
||||||
|
|||||||
Reference in New Issue
Block a user