scrape.go

/*
Copyright © 2023-2024 Thomas von Dein

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package main

import (
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strings"

	"astuart.co/goq"
	"golang.org/x/sync/errgroup"
)

// ScrapeUser collects the ad links from all listing pages of the
// configured user and then scrapes every linked ad via ScrapeAd.
//
// Listing pages are requested one after another (the first without a
// page number, the following ones with "&pageNum=N") until a page
// yields no links. If fetch.Config.Limit is greater than zero,
// scraping stops after that many ads. The first error encountered is
// returned unchanged and aborts the run.
func ScrapeUser(fetch *Fetcher) error {
	var adlinks []string

	baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)

	slog.Info("fetching ad pages", "user", fetch.Config.User)

	// pageLinks fetches a single listing page and returns the links
	// found on it. It lives in its own function so that the deferred
	// Close fires right after decoding; a defer placed directly in the
	// pagination loop would keep every page body open until ScrapeUser
	// returns, i.e. until all ads have been scraped.
	pageLinks := func(uri string) ([]string, error) {
		slog.Debug("fetching page", "uri", uri)

		body, err := fetch.Get(uri)
		if err != nil {
			return nil, err
		}
		defer body.Close()

		var index Index
		if err := goq.NewDecoder(body).Decode(&index); err != nil {
			return nil, err
		}

		return index.Links, nil
	}

	page := 1
	uri := baseuri

	for {
		links, err := pageLinks(uri)
		if err != nil {
			return err
		}

		// a page without links marks the end of the pagination
		if len(links) == 0 {
			break
		}

		slog.Debug("extracted ad links", "count", len(links))

		for _, href := range links {
			slog.Debug("ad link", "href", href)
		}

		adlinks = append(adlinks, links...)

		page++
		uri = fmt.Sprintf("%s&pageNum=%d", baseuri, page)
	}

	for i, adlink := range adlinks {
		if err := ScrapeAd(fetch, Baseuri+adlink); err != nil {
			return err
		}

		// honor the configured limit, 0 means unlimited
		if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {
			break
		}
	}

	return nil
}

// ScrapeAd fetches the ad page at uri, decodes it into an Ad, writes
// the listing with WriteAd and then hands the ad over to ScrapeImages.
//
// uri is the full uri of the ad. Split on "/", it must consist of at
// least six parts; the fifth part is stored as the ad's slug and the
// sixth as its id. An error is returned if the uri is too short, if
// fetching, decoding or writing fails, or if the decoded ad is
// reported as incomplete.
func ScrapeAd(fetch *Fetcher, uri string) error {
	// slug and id are not part of the page, they come from the uri
	segments := strings.Split(uri, "/")
	if len(segments) < 6 {
		return errors.New("invalid uri: " + uri)
	}

	ad := &Ad{}
	ad.Slug = segments[4]
	ad.Id = segments[5]

	// download the ad page
	slog.Debug("fetching ad page", "uri", uri)

	body, err := fetch.Get(uri)
	if err != nil {
		return err
	}
	defer body.Close()

	// let goquery/goq fill the remaining fields
	if err := goq.NewDecoder(body).Decode(&ad); err != nil {
		return err
	}

	// flatten the category tree into a single string
	if tree := ad.CategoryTree; len(tree) > 0 {
		ad.Category = strings.Join(tree, " => ")
	}

	if ad.Incomplete() {
		slog.Debug("got ad", "ad", ad)

		return errors.New("could not extract ad data from page, got empty struct")
	}

	slog.Debug("extracted ad listing", "ad", ad)

	// store the listing, addir is needed for the images afterwards
	addir, err := WriteAd(fetch.Config, ad)
	if err != nil {
		return err
	}

	fetch.Config.IncrAds()

	return ScrapeImages(fetch, ad, addir)
}

// ScrapeImages downloads every image listed in ad.Images and stores
// it below fetch.Config.Outdir/addir. The files are named "1.jpg",
// "2.jpg", ... following the order of ad.Images.
//
// Each image is handled in its own goroutine of an errgroup; the
// function waits for all of them and returns the error reported by
// the group, if any. Only when all images succeeded is
// fetch.Config.IncrImgs called with the number of images.
func ScrapeImages(fetch *Fetcher, ad *Ad, addir string) error {
	var group errgroup.Group

	for idx, src := range ad.Images {
		// per-iteration copy, the closure below must not share the loop variable
		src := src
		// file names are 1-based
		target := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", idx+1))

		group.Go(func() error {
			body, err := fetch.Getimage(src)
			if err != nil {
				return err
			}

			return WriteImage(target, body)
		})
	}

	if err := group.Wait(); err != nil {
		return err
	}

	fetch.Config.IncrImgs(len(ad.Images))

	return nil
}
initial commit 2023-12-14 19:00:04 +01:00			`/*`
added template for ad directory, by default include id now 2024-01-12 13:29:59 +01:00			`Copyright © 2023-2024 Thomas von Dein`
initial commit 2023-12-14 19:00:04 +01:00
			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`package main`

			`import (`
			`"errors"`
			`"fmt"`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`"log/slog"`
fix #4, use filepath.Join to create portable path's 2023-12-18 09:21:26 +01:00			`"path/filepath"`
initial commit 2023-12-14 19:00:04 +01:00			`"strings"`

			`"astuart.co/goq"`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`"golang.org/x/sync/errgroup"`
initial commit 2023-12-14 19:00:04 +01:00			`)`

			`// extract links from all ad listing pages (that is: use pagination)`
			`// and scrape every page`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`func ScrapeUser(fetch *Fetcher) error {`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`adlinks := []string{}`
initial commit 2023-12-14 19:00:04 +01:00
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User)`
initial commit 2023-12-14 19:00:04 +01:00			`page := 1`
			`uri := baseuri`

refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`slog.Info("fetching ad pages", "user", fetch.Config.User)`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00
initial commit 2023-12-14 19:00:04 +01:00			`for {`
			`var index Index`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`slog.Debug("fetching page", "uri", uri)`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`body, err := fetch.Get(uri)`
initial commit 2023-12-14 19:00:04 +01:00			`if err != nil {`
			`return err`
			`}`
			`defer body.Close()`

			`err = goq.NewDecoder(body).Decode(&index)`
			`if err != nil {`
			`return err`
			`}`

			`if len(index.Links) == 0 {`
			`break`
			`}`

enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`slog.Debug("extracted ad links", "count", len(index.Links))`

initial commit 2023-12-14 19:00:04 +01:00			`for _, href := range index.Links {`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`adlinks = append(adlinks, href)`
			`slog.Debug("ad link", "href", href)`
initial commit 2023-12-14 19:00:04 +01:00			`}`

			`page++`
			`uri = baseuri + "&pageNum=" + fmt.Sprintf("%d", page)`
			`}`

Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`for i, adlink := range adlinks {`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`err := ScrapeAd(fetch, Baseuri+adlink)`
initial commit 2023-12-14 19:00:04 +01:00			`if err != nil {`
			`return err`
			`}`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`if fetch.Config.Limit > 0 && i == fetch.Config.Limit-1 {`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`break`
			`}`
initial commit 2023-12-14 19:00:04 +01:00			`}`

			`return nil`
			`}`

enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`// scrape an ad. uri is the full uri of the ad, dir is the basedir`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`func ScrapeAd(fetch *Fetcher, uri string) error {`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`ad := &Ad{}`
initial commit 2023-12-14 19:00:04 +01:00
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`// extract slug and id from uri`
			`uriparts := strings.Split(uri, "/")`
			`if len(uriparts) < 6 {`
Test/add mock tests (#24) * add scrape unit test using httpmock lib 2023-12-29 13:47:18 +01:00			`return errors.New("invalid uri: " + uri)`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`}`
			`ad.Slug = uriparts[4]`
			`ad.Id = uriparts[5]`

			`// get the ad`
			`slog.Debug("fetching ad page", "uri", uri)`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`body, err := fetch.Get(uri)`
initial commit 2023-12-14 19:00:04 +01:00			`if err != nil {`
			`return err`
			`}`
			`defer body.Close()`

enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`// extract ad contents with goquery/goq`
initial commit 2023-12-14 19:00:04 +01:00			`err = goq.NewDecoder(body).Decode(&ad)`
			`if err != nil {`
			`return err`
			`}`
fixed changes on kleinanzeigen.de: - Meta did not contain condition and category together anymore, they removed the category. Therefore fetching (that is, validation) failed. - Now we extract the condition and category directly. - On top, category now includes the whole category tree. - unit tests had to be tweaked for this measure. 2024-01-12 14:11:02 +01:00
			`if len(ad.CategoryTree) > 0 {`
			`ad.Category = strings.Join(ad.CategoryTree, " => ")`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00			`}`
put ad code into separate file, enhance error checking 2024-01-01 16:24:07 +01:00
			`if ad.Incomplete() {`
fixed changes on kleinanzeigen.de: - Meta did not contain condition and category together anymore, they removed the category. Therefore fetching (that is, validation) failed. - Now we extract the condition and category directly. - On top, category now includes the whole category tree. - unit tests had to be tweaked for this measure. 2024-01-12 14:11:02 +01:00			`slog.Debug("got ad", "ad", ad)`
put ad code into separate file, enhance error checking 2024-01-01 16:24:07 +01:00			`return errors.New("could not extract ad data from page, got empty struct")`
			`}`

enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`slog.Debug("extracted ad listing", "ad", ad)`

re-orgainzied code a little, using go templates instead format string 2023-12-17 17:32:05 +01:00			`// write listing`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`addir, err := WriteAd(fetch.Config, ad)`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`if err != nil {`
			`return err`
			`}`
initial commit 2023-12-14 19:00:04 +01:00
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`fetch.Config.IncrAds()`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`return ScrapeImages(fetch, ad, addir)`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00			`}`

refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`func ScrapeImages(fetch Fetcher, ad Ad, addir string) error {`
enhancements: - english README (german version will be put to the homepage) - better commandline options - enhanced logging capabilities and error handling - config file support - support to backup one or more singular ads - add id to adlisting - added manual page - fixed config file reading - fixed typo 2023-12-15 14:50:40 +01:00			`// fetch images`
initial commit 2023-12-14 19:00:04 +01:00			`img := 1`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`g := new(errgroup.Group)`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00
initial commit 2023-12-14 19:00:04 +01:00			`for _, imguri := range ad.Images {`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`imguri := imguri`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`file := filepath.Join(fetch.Config.Outdir, addir, fmt.Sprintf("%d.jpg", img))`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`g.Go(func() error {`
refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`body, err := fetch.Getimage(imguri)`
			`if err != nil {`
			`return err`
			`}`

			`err = WriteImage(file, body)`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00			`if err != nil {`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`return err`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00			`}`
Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00
			`return nil`
			`})`
initial commit 2023-12-14 19:00:04 +01:00			`img++`
			`}`

Bugfixes (#13) * several fixes: - fix #9 + #10: switched to koanf module and dropped support for HCL - fix #11: disabling colors on windows - fix #12: fixed race condition in go routine call inside for loop, images had been downloaded multiple times - remove hcl support and use toml format (same thing, better parser) - update documentation and example config on TOML format of config file - use Config as arg instead of singular args - use x/errgroup instead of sync.Waitgroup inside image download loop --------- Co-authored-by: Thomas von Dein <tom@vondein.org> 2023-12-19 18:23:41 +01:00			`if err := g.Wait(); err != nil {`
			`return err`
added custom template support, added more ad data, use concurrency 2023-12-16 20:32:10 +01:00			`}`

refactored out http fetching code into Fetcher{}/fetch.go 2024-01-16 19:27:46 +01:00			`fetch.Config.IncrImgs(len(ad.Images))`
initial commit 2023-12-14 19:00:04 +01:00
			`return nil`
			`}`