2 Commits

Author SHA1 Message Date
23fd9ab3a9 fix format error 2025-10-22 13:45:21 +02:00
37f76b38ab fix version printing 2025-10-22 13:42:14 +02:00
7 changed files with 99 additions and 257 deletions

View File

@@ -1,7 +1,3 @@
[![Actions](https://github.com/tlinden/io-exporter/actions/workflows/ci.yaml/badge.svg)](https://github.com/tlinden/io-exporter/actions)
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](https://github.com/tlinden/io-exporter/blob/master/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/tlinden/io-exporter)](https://goreportcard.com/report/github.com/tlinden/io-exporter)
# io-exporter
Report if a given filesystem is operating properly
@@ -18,15 +14,10 @@ specified via commandline.
```default
io-exporter [options] <file>
Options:
-t --timeout <int> When should the operation timeout in seconds
-s --sleeptime <int> Time to sleep between checks (default: 5s)
-l --label <label=value> Add label to exported metric
-i --internals Also add labels about resource usage
-r --read Only execute the read test
-w --write Only execute the write test
-d --debug Enable debug log level
-h --help Show help
-v --version Show program version
-t --timeout <int> When should the operation timeout in seconds
-l --label <label=value> Add label to exported metric
-h --help Show help
-v --version Show program version
```
## Output
@@ -40,20 +31,14 @@ io-exporter -l foo=bar -l blah=blubb t/blah
You'll get metrics like these:
```default
# HELP io_exporter_io_latency how long does the operation take in seconds
# TYPE io_exporter_io_latency gauge
io_exporter_io_latency{file="/tmp/blah",maxwait="1",namespace="debug",pod="foo1"} 0.0001142815
# HELP io_exporter_io_operation whether io is working on the pvc, 1=ok, 0=fail
# TYPE io_exporter_io_operation gauge
io_exporter_io_operation{blah="blubb",exectime="1761148383705",file="t/blah",foo="bar",maxwait="1"} 1
# HELP io_exporter_io_read_latency how long does the read operation take in seconds
# TYPE io_exporter_io_read_latency gauge
io_exporter_io_read_latency{blah="blubb",exectime="1761148383705",file="t/blah",foo="bar",maxwait="1"} 0.0040411716
# HELP io_exporter_io_write_latency how long does the write operation take in seconds
# TYPE io_exporter_io_write_latency gauge
io_exporter_io_write_latency{blah="blubb",exectime="1761148383705",file="t/blah",foo="bar",maxwait="1"} 0
io_exporter_io_operation{file="/tmp/blah",maxwait="1",namespace="debug",pod="foo1"} 1
```
You may also restrict the exporter to test only the read (`-r` flag) or
only the write (`-w` flag) operation.
## Installation
There are no released binaries yet.

1
blah Normal file
View File

@@ -0,0 +1 @@


View File

@@ -1,11 +1,6 @@
package cmd
import (
"bytes"
"errors"
"github.com/ncw/directio"
)
import "github.com/ncw/directio"
// aligned allocs used for testing
type Alloc struct {
@@ -30,12 +25,3 @@ func NewAlloc() *Alloc {
readBlock: directio.AlignedBlock(directio.BlockSize),
}
}
// Compare checks whether the block that was read back is byte-for-byte
// equal to the block that was written. On a mismatch the error is handed
// to report and report's result is returned.
func (alloc *Alloc) Compare() bool {
	if bytes.Equal(alloc.writeBlock, alloc.readBlock) {
		return true
	}

	return report(errors.New("read not the same as written"), nil)
}

View File

@@ -15,7 +15,7 @@ import (
)
const (
Version = `v0.0.7`
Version = `v0.0.4`
SLEEP = 5
Usage = `io-exporter [options] <file>
Options:
@@ -23,15 +23,9 @@ Options:
-s --sleeptime <int> Time to sleep between checks (default: 5s)
-l --label <label=value> Add label to exported metric
-i --internals Also add labels about resource usage
-r --read Only execute the read test
-w --write Only execute the write test
-d --debug Enable debug log level
-h --help Show help
-v --version Show program version`
O_R = iota
O_W
O_RW
)
// config via commandline flags
@@ -40,8 +34,6 @@ type Config struct {
Showhelp bool `koanf:"help"` // -h
Internals bool `koanf:"internals"` // -i
Debug bool `koanf:"debug"` // -d
ReadMode bool `koanf:"read"` // -r
WriteMode bool `koanf:"write"` // -w
Label []string `koanf:"label"` // -v
Timeout int `koanf:"timeout"` // -t
Port int `koanf:"port"` // -p
@@ -68,8 +60,6 @@ func InitConfig(output io.Writer) (*Config, error) {
flagset.BoolP("help", "h", false, "show help")
flagset.BoolP("debug", "d", false, "enable debug logs")
flagset.BoolP("internals", "i", false, "add internal metrics")
flagset.BoolP("read", "r", false, "only execute the read test")
flagset.BoolP("write", "w", false, "only execute the write test")
flagset.StringArrayP("label", "l", nil, "additional labels")
flagset.IntP("timeout", "t", 1, "timeout for file operation in seconds")
flagset.IntP("port", "p", 9187, "prometheus metrics port to listen to")
@@ -113,10 +103,5 @@ func InitConfig(output io.Writer) (*Config, error) {
conf.Labels = append(conf.Labels, Label{Name: parts[0], Value: parts[1]})
}
if !conf.ReadMode && !conf.WriteMode {
conf.ReadMode = true
conf.WriteMode = true
}
return conf, nil
}

View File

@@ -1,182 +1,114 @@
package cmd
import (
"bytes"
"context"
"errors"
"io"
"log/slog"
"os"
"sync"
"time"
"github.com/ncw/directio"
)
// Exporter is our primary container for the io checks.
type Exporter struct {
	conf    *Config  // settings read by the checks (e.g. ReadMode, WriteMode, Sleeptime, Timeout)
	alloc   *Alloc   // read and write blocks used by the checks
	metrics *Metrics // receives the read and write results via Set
}
func die(err error, fd *os.File) bool {
slog.Debug("failed to check io", "error", err)
// Result is the outcome of a single io measurement.
type Result struct {
	result  bool    // whether the measured operation succeeded
	elapsed float64 // elapsed time as computed by measure; 0 if the operation failed
}
// NewExporter bundles the given configuration, blocks and metrics
// wrapper into a fresh Exporter.
func NewExporter(conf *Config, alloc *Alloc, metrics *Metrics) *Exporter {
	exp := new(Exporter)
	exp.conf = conf
	exp.alloc = alloc
	exp.metrics = metrics

	return exp
}
// starts the primary goroutine, which will run the io checks forever
func (exp *Exporter) RunIOchecks() *sync.WaitGroup {
var wg sync.WaitGroup
wg.Add(1)
go func() {
for {
var res_r, res_w Result
exp.alloc.Clean()
if exp.conf.WriteMode {
res_w = exp.measure(O_W)
slog.Debug("elapsed write time", "elapsed", res_w.elapsed, "result", res_w.result)
}
if exp.conf.ReadMode {
res_r = exp.measure(O_R)
slog.Debug("elapsed read time", "elapsed", res_r.elapsed, "result", res_r.result)
}
if (exp.conf.WriteMode && exp.conf.ReadMode) && (res_r.result && res_w.result) {
if !exp.alloc.Compare() {
res_r.result = false
}
}
exp.metrics.Set(res_r, res_w)
time.Sleep(time.Duration(exp.conf.Sleeptime) * time.Second)
if fd != nil {
if err := fd.Close(); err != nil {
slog.Debug("failed to close filehandle", "error", err)
}
}()
return &wg
}
// call an io measurement and collect time needed
func (exp *Exporter) measure(mode int) Result {
start := time.Now()
result := exp.runExporter(mode)
// ns => s
now := time.Now()
elapsed := float64(now.Sub(start).Nanoseconds()) / 10000000000
// makes no sense to measure latency if operation failed
if !result {
elapsed = 0
}
return Result{elapsed: elapsed, result: result}
return false
}
// Calls the runcheck function matching the mode, bounded by a context timeout
func (exp *Exporter) runExporter(mode int) bool {
// Calls runcheck() with timeout
func runExporter(file string, alloc *Alloc, timeout time.Duration) bool {
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Duration(exp.conf.Timeout)*time.Second)
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
run := make(chan struct{}, 1)
var res bool
go func() {
switch mode {
case O_R:
res = exp.runcheck_r()
case O_W:
res = exp.runcheck_w()
}
res = runcheck(file, alloc)
run <- struct{}{}
}()
for {
select {
case <-ctx.Done():
return report(ctx.Err(), nil)
return die(ctx.Err(), nil)
case <-run:
return res
}
}
}
// Checks read io on the configured file:
//
//   - opens it for reading
//   - reads one block into the read buffer
//   - closes the file again
//
// Returns false if anything failed during that sequence,
// true otherwise.
func (exp *Exporter) runcheck_r() bool {
	in, err := directio.OpenFile(exp.conf.File, os.O_RDONLY, 0640)
	if err != nil {
		// stop here: without a usable handle the read below could only
		// fail again and log a second, misleading error
		return report(err, nil)
	}

	n, err := io.ReadFull(in, exp.alloc.readBlock)
	if err != nil {
		return report(err, in)
	}

	// the read must cover as many bytes as the write block holds
	if n != len(exp.alloc.writeBlock) {
		return report(errors.New("failed to read block"), in)
	}

	// report is called with nil here because the handle was just closed
	if err := in.Close(); err != nil {
		return report(err, nil)
	}

	return true
}
// Checks file io on the specified path:
//
// - open the file (create if it doesn't exist)
// - truncate it if it already exists
// - write some data to it
// - closes the file
// - re-opens it for reading
// - reads the block
// - compares if written block is equal to read block
// - closes file again
//
// Returns false if anything failed during that sequence,
// true otherwise.
func (exp *Exporter) runcheck_w() bool {
func runcheck(file string, alloc *Alloc) bool {
alloc.Clean()
// write
fd, err := directio.OpenFile(exp.conf.File, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0640)
fd, err := directio.OpenFile(file, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0640)
if err != nil {
report(err, nil)
die(err, nil)
}
for i := 0; i < len(exp.alloc.writeBlock); i++ {
exp.alloc.writeBlock[i] = 'A'
for i := 0; i < len(alloc.writeBlock); i++ {
alloc.writeBlock[i] = 'A'
}
n, err := fd.Write(exp.alloc.writeBlock)
n, err := fd.Write(alloc.writeBlock)
if err != nil {
return report(err, fd)
return die(err, fd)
}
if n != len(exp.alloc.writeBlock) {
return report(errors.New("failed to write block"), fd)
if n != len(alloc.writeBlock) {
return die(errors.New("failed to write block"), fd)
}
if err := fd.Close(); err != nil {
return report(err, nil)
return die(err, nil)
}
// read
in, err := directio.OpenFile(file, os.O_RDONLY, 0640)
if err != nil {
die(err, nil)
}
n, err = io.ReadFull(in, alloc.readBlock)
if err != nil {
return die(err, in)
}
if n != len(alloc.writeBlock) {
return die(errors.New("failed to read block"), fd)
}
if err := in.Close(); err != nil {
return die(err, nil)
}
// compare
if !bytes.Equal(alloc.writeBlock, alloc.readBlock) {
return die(errors.New("read not the same as written"), nil)
}
return true

View File

@@ -2,7 +2,6 @@ package cmd
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
@@ -15,17 +14,15 @@ type Label struct {
// simple prometheus wrapper
type Metrics struct {
run *prometheus.GaugeVec
latency_r *prometheus.GaugeVec
latency_w *prometheus.GaugeVec
registry *prometheus.Registry
values []string
mode int
run *prometheus.GaugeVec
latency *prometheus.GaugeVec
registry *prometheus.Registry
values []string
}
func NewMetrics(conf *Config) *Metrics {
labels := []string{"file", "maxwait", "exectime"}
LabelLen := 3
labels := []string{"file", "maxwait"}
LabelLen := 2
for _, label := range conf.Labels {
labels = append(labels, label.Name)
@@ -39,17 +36,10 @@ func NewMetrics(conf *Config) *Metrics {
},
labels,
),
latency_r: prometheus.NewGaugeVec(
latency: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "io_exporter_io_read_latency",
Help: "how long does the read operation take in seconds",
},
labels,
),
latency_w: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "io_exporter_io_write_latency",
Help: "how long does the write operation take in seconds",
Name: "io_exporter_io_latency",
Help: "how long does the operation take in seconds",
},
labels,
),
@@ -63,8 +53,7 @@ func NewMetrics(conf *Config) *Metrics {
if conf.Internals {
metrics.registry.MustRegister(
metrics.run,
metrics.latency_r,
metrics.latency_w,
metrics.latency,
// we might need to take care of the exporter in terms of
// resources, so also report those internals
@@ -76,50 +65,28 @@ func NewMetrics(conf *Config) *Metrics {
),
)
} else {
metrics.registry.MustRegister(metrics.run, metrics.latency_r, metrics.latency_w)
metrics.registry.MustRegister(metrics.run, metrics.latency)
}
// static labels
metrics.values[0] = conf.File
metrics.values[1] = fmt.Sprintf("%d", conf.Timeout)
metrics.values[2] = fmt.Sprintf("%d", time.Now().UnixMilli())
// custom labels via -l label=value
for idx, label := range conf.Labels {
metrics.values[idx+LabelLen] = label.Value
}
switch {
case conf.ReadMode && conf.WriteMode:
metrics.mode = O_RW
case conf.ReadMode:
metrics.mode = O_R
case conf.WriteMode:
metrics.mode = O_W
}
return metrics
}
func (metrics *Metrics) Set(result_r, result_w Result) {
func (metrics *Metrics) Set(result bool, elapsed float64) {
var res float64
switch metrics.mode {
case O_RW:
if result_r.result && result_w.result {
res = 1
}
case O_R:
if result_r.result {
res = 1
}
case O_W:
if result_w.result {
res = 1
}
if result {
res = 1
}
metrics.run.WithLabelValues(metrics.values...).Set(res)
metrics.latency_r.WithLabelValues(metrics.values...).Set(result_r.elapsed)
metrics.latency_w.WithLabelValues(metrics.values...).Set(result_w.elapsed)
metrics.latency.WithLabelValues(metrics.values...).Set(elapsed)
}

View File

@@ -7,13 +7,11 @@ import (
"net/http"
"os"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Main program. starts 2 goroutines: our exporter and the http server
// for the prometheus metrics. The exporter reports measurement
// results to prometheus metrics directly
func Run() {
conf, err := InitConfig(os.Stdout)
if err != nil {
@@ -25,48 +23,36 @@ func Run() {
os.Exit(0)
}
setLogger(os.Stdout, conf.Debug)
metrics := NewMetrics(conf)
alloc := NewAlloc()
exporter := NewExporter(conf, alloc, metrics)
wg := exporter.RunIOchecks()
setLogger(os.Stdout, conf.Debug)
go func() {
for {
start := time.Now()
result := runExporter(conf.File, alloc, time.Duration(conf.Timeout)*time.Second)
// ns => s
now := time.Now()
elapsed := float64(now.Sub(start).Nanoseconds()) / 10000000000
slog.Debug("elapsed time", "elapsed", elapsed, "result", result)
metrics.Set(result, elapsed)
time.Sleep(time.Duration(conf.Sleeptime) * time.Second)
}
}()
http.Handle("/metrics", promhttp.HandlerFor(
metrics.registry,
promhttp.HandlerOpts{},
))
slog.Info(" ╭──")
slog.Info(" │ io-exporter starting up", "version", Version)
slog.Info(" │ serving metrics", "host", "localhost", "port", conf.Port)
slog.Info(" │ test setup", "file", conf.File, "labels", strings.Join(conf.Label, ","))
slog.Info(" │ measuring", "read", conf.ReadMode, "write", conf.WriteMode, "timeout(s)", conf.Timeout)
slog.Info(" │ debugging", "enabled", conf.Debug)
slog.Info(" ╰──")
slog.Info("start testing and serving metrics on localhost", "port", conf.Port)
slog.Info("test setup", "file", conf.File, "labels", strings.Join(conf.Label, ","))
if err := http.ListenAndServe(fmt.Sprintf(":%d", conf.Port), nil); err != nil {
log.Fatal(err)
}
wg.Wait()
}
// report logs an io error and, if fd is not nil, closes it.
//
// A "context deadline exceeded" error is logged as "operation timed out".
// A failure to close fd is logged at debug level with the close error
// itself. report always returns false, so callers can write
// `return report(err, fd)`.
//
// err must not be nil.
func report(err error, fd *os.File) bool {
	failure := err.Error()
	if failure == "context deadline exceeded" {
		failure = "operation timed out"
	}

	slog.Error("io error", "error", failure)

	if fd != nil {
		if closeErr := fd.Close(); closeErr != nil {
			// log why the close failed, not the original io failure again
			slog.Debug("failed to close filehandle", "error", closeErr)
		}
	}

	return false
}