Bugfixes (#13)

* several fixes:

- fix #9 + #10: switched to koanf module and dropped support for HCL
- fix #11: disable colors on Windows
- fix #12: fixed race condition in goroutine call inside for loop;
  images were being downloaded multiple times
- remove hcl support and use toml format (same thing, better parser)
- update documentation and example config on TOML format of config file
- use Config as arg instead of singular args
- use x/sync/errgroup instead of sync.WaitGroup inside image download loop

---------

Co-authored-by: Thomas von Dein <tom@vondein.org>
This commit is contained in:
T.v.Dein
2023-12-19 18:23:41 +01:00
committed by Thomas von Dein
parent ae5e3daea3
commit b8977df986
12 changed files with 394 additions and 222 deletions

View File

@@ -25,9 +25,9 @@ import (
"net/http"
"path/filepath"
"strings"
"sync"
"astuart.co/goq"
"golang.org/x/sync/errgroup"
)
type Index struct {
@@ -79,15 +79,15 @@ func Get(uri string, client *http.Client) (io.ReadCloser, error) {
// extract links from all ad listing pages (that is: use pagination)
// and scrape every page
func Start(uid string, dir string, template string) error {
func Start(conf *Config) error {
client := &http.Client{}
adlinks := []string{}
baseuri := Baseuri + Listuri + "?userId=" + uid
baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, conf.User)
page := 1
uri := baseuri
slog.Info("fetching ad pages", "user", uid)
slog.Info("fetching ad pages", "user", conf.User)
for {
var index Index
@@ -118,18 +118,22 @@ func Start(uid string, dir string, template string) error {
uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)
}
for _, adlink := range adlinks {
err := Scrape(Baseuri+adlink, dir, template)
for i, adlink := range adlinks {
err := Scrape(conf, Baseuri+adlink)
if err != nil {
return err
}
if conf.Limit > 0 && i == conf.Limit-1 {
break
}
}
return nil
}
// scrape an ad. uri is the full uri of the ad, dir is the basedir
func Scrape(uri string, dir string, template string) error {
func Scrape(c *Config, uri string) error {
client := &http.Client{}
ad := &Ad{}
@@ -161,43 +165,42 @@ func Scrape(uri string, dir string, template string) error {
slog.Debug("extracted ad listing", "ad", ad)
// write listing
err = WriteAd(dir, ad, template)
err = WriteAd(c.Outdir, ad, c.Template)
if err != nil {
return err
}
return ScrapeImages(dir, ad)
c.IncrAds()
return ScrapeImages(c, ad)
}
func ScrapeImages(dir string, ad *Ad) error {
func ScrapeImages(c *Config, ad *Ad) error {
// fetch images
img := 1
var wg sync.WaitGroup
wg.Add(len(ad.Images))
failure := make(chan string)
g := new(errgroup.Group)
for _, imguri := range ad.Images {
file := filepath.Join(dir, ad.Slug, fmt.Sprintf("%d.jpg", img))
go func() {
defer wg.Done()
imguri := imguri
file := filepath.Join(c.Outdir, ad.Slug, fmt.Sprintf("%d.jpg", img))
g.Go(func() error {
err := Getimage(imguri, file)
if err != nil {
failure <- err.Error()
return
return err
}
slog.Info("wrote ad image", "image", file)
}()
return nil
})
img++
}
close(failure)
wg.Wait()
goterr := <-failure
if goterr != "" {
return errors.New(goterr)
if err := g.Wait(); err != nil {
return err
}
c.IncrImgs(len(ad.Images))
return nil
}