add some handy builtin character classes as split separators (#84)

This commit is contained in:
T.v.Dein
2025-10-09 23:03:57 +02:00
committed by GitHub
parent 253ef8262e
commit ec0b210167
7 changed files with 249 additions and 13 deletions

View File

@@ -27,13 +27,26 @@ import (
"github.com/hashicorp/hcl/v2/hclsimple"
)
const DefaultSeparator string = `(\s\s+|\t)`
const Version string = "v1.5.8"
const MAXPARTS = 2
const (
Version = "v1.5.9"
MAXPARTS = 2
)
var DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
var (
DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
VERSION string // maintained by -x
var VERSION string // maintained by -x
SeparatorTemplates = map[string]string{
":tab:": `\s*\t\s*`, // tab but eats spaces around
":spaces:": `\s{2,}`, // 2 or more spaces
":pipe:": `\s*\|\s*`, // one pipe eating spaces around
":default:": `(\s\s+|\t)`, // 2 or more spaces or tab
":nonword:": `\W`, // word boundary
":nondigit:": `\D`, // same for numbers
":special:": `[\*\+\-_\(\)\[\]\{\}?\\/<>=&$§"':,\^]+`, // match any special char
":nonprint:": `[[:^print:]]+`, // non printables
}
)
// public config, set via config file or using defaults
type Settings struct {
@@ -356,6 +369,13 @@ func (conf *Config) ApplyDefaults() {
if conf.OutputMode == Yaml || conf.OutputMode == CSV {
conf.Numbering = false
}
if conf.Separator[0] == ':' && conf.Separator[len(conf.Separator)-1] == ':' {
separator, ok := SeparatorTemplates[conf.Separator]
if ok {
conf.Separator = separator
}
}
}
func (conf *Config) PreparePattern(patterns []*Pattern) error {

View File

@@ -123,7 +123,7 @@ func Execute() {
"Use alternating background colors")
rootCmd.PersistentFlags().StringVarP(&ShowCompletion, "completion", "", "",
"Display completion code")
rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.DefaultSeparator,
rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.SeparatorTemplates[":default:"],
"Custom field separator")
rootCmd.PersistentFlags().StringVarP(&conf.Columns, "columns", "c", "",
"Only show the speficied columns (separated by ,)")

View File

@@ -14,7 +14,7 @@ SYNOPSIS
-n, --numbering Enable header numbering
-N, --no-color Disable pattern highlighting
-H, --no-headers Disable headers display
-s, --separator <string> Custom field separator
-s, --separator <string> Custom field separator (maybe char, string or :class:)
-k, --sort-by <int|name> Sort by column (default: 1)
-z, --fuzzy Use fuzzy search [experimental]
-F, --filter <field[!]=reg> Filter given field with regex, can be used multiple times
@@ -141,6 +141,57 @@ DESCRIPTION
Finally the -d option enables debugging output which is mostly useful
for the developer.
SEPARATOR
The option -s can be a single character, in which case the CSV parser
will be invoked. You can also specify a string as separator. The string
will be interpreted as literal string unless it is a valid go regular
expression. For example:
-s '\t{2,}\'
is being used as a regexp and will match two or more consecutive tabs.
-s 'foo'
on the other hand is no regular expression and will be used literally.
To make live easier, there are a couple of predefined regular
expressions, which you can specify as classes:
* :tab:
Matches a tab and eats spaces around it.
* :spaces:
Matches 2 or more spaces.
* :pipe:
Matches a pipe character and eats spaces around it.
* :default:
Matches 2 or more spaces or tab. This is the default separator if
none is specified.
* :nonword:
Matches a non-word character.
* :nondigit:
Matches a non-digit character.
* :special:
Matches one or more special chars like brackets, dollar sign,
slashes etc.
* :nonprint:
Matches one or more non-printable characters.
PATTERNS AND FILTERING
You can reduce the rows being displayed by using one or more regular
expression patterns. The regexp language being used is the one of
@@ -458,7 +509,7 @@ Operational Flags:
-n, --numbering Enable header numbering
-N, --no-color Disable pattern highlighting
-H, --no-headers Disable headers display
-s, --separator <string> Custom field separator
-s, --separator <string> Custom field separator (maybe char, string or :class:)
-k, --sort-by <int|name> Sort by column (default: 1)
-z, --fuzzy Use fuzzy search [experimental]
-F, --filter <field[!]=reg> Filter given field with regex, can be used multiple times

View File

@@ -34,7 +34,7 @@ var input = []struct {
}{
{
name: "tabular-data",
separator: cfg.DefaultSeparator,
separator: cfg.SeparatorTemplates[":default:"],
text: `
ONE TWO THREE
asd igig cxxxncnc
@@ -148,7 +148,7 @@ asd igig
19191 EDD 1 X`
readFd := strings.NewReader(strings.TrimSpace(table))
conf := cfg.Config{Separator: cfg.DefaultSeparator}
conf := cfg.Config{Separator: cfg.SeparatorTemplates[":default:"]}
gotdata, err := wrapValidateParser(conf, readFd)
assert.NoError(t, err)
@@ -314,6 +314,58 @@ func TestParserJSONInput(t *testing.T) {
}
}
func TestParserSeparators(t *testing.T) {
list := []string{"alpha", "beta", "delta"}
tests := []struct {
input string
sep string
}{
{
input: `🎲`,
sep: ":nonprint:",
},
{
input: `|`,
sep: ":pipe:",
},
{
input: ` `,
sep: ":spaces:",
},
{
input: " \t ",
sep: ":tab:",
},
{
input: `-`,
sep: ":nonword:",
},
{
input: `//$`,
sep: ":special:",
},
}
for _, testdata := range tests {
testname := fmt.Sprintf("parse-%s", testdata.sep)
t.Run(testname, func(t *testing.T) {
header := strings.Join(list, testdata.input)
row := header
content := header + "\n" + row
readFd := strings.NewReader(strings.TrimSpace(content))
conf := cfg.Config{Separator: testdata.sep}
conf.ApplyDefaults()
gotdata, err := wrapValidateParser(conf, readFd)
assert.NoError(t, err)
assert.EqualValues(t, [][]string{list}, gotdata.entries)
})
}
}
func wrapValidateParser(conf cfg.Config, input io.Reader) (Tabdata, error) {
data, err := Parse(conf, input)

View File

@@ -292,6 +292,7 @@ func TestPrinter(t *testing.T) {
conf.UseSortByColumn = []int{testdata.column}
}
conf.Separator = cfg.SeparatorTemplates[":default:"]
conf.ApplyDefaults()
// the test checks the len!

View File

@@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "TABLIZER 1"
.TH TABLIZER 1 "2025-10-01" "1" "User Commands"
.TH TABLIZER 1 "2025-10-09" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -152,7 +152,7 @@ tablizer \- Manipulate tabular output of other programs
\& \-n, \-\-numbering Enable header numbering
\& \-N, \-\-no\-color Disable pattern highlighting
\& \-H, \-\-no\-headers Disable headers display
\& \-s, \-\-separator <string> Custom field separator
\& \-s, \-\-separator <string> Custom field separator (maybe char, string or :class:)
\& \-k, \-\-sort\-by <int|name> Sort by column (default: 1)
\& \-z, \-\-fuzzy Use fuzzy search [experimental]
\& \-F, \-\-filter <field[!]=reg> Filter given field with regex, can be used multiple times
@@ -293,6 +293,62 @@ Sorts timestamps.
.PP
Finally the \fB\-d\fR option enables debugging output which is mostly
useful for the developer.
.SS "\s-1SEPARATOR\s0"
.IX Subsection "SEPARATOR"
The option \fB\-s\fR can be a single character, in which case the \s-1CSV\s0
parser will be invoked. You can also specify a string as
separator. The string will be interpreted as literal string unless it
is a valid go regular expression. For example:
.PP
.Vb 1
\& \-s \*(Aq\et{2,}\e\*(Aq
.Ve
.PP
is being used as a regexp and will match two or more consecutive tabs.
.PP
.Vb 1
\& \-s \*(Aqfoo\*(Aq
.Ve
.PP
on the other hand is no regular expression and will be used literally.
.PP
To make live easier, there are a couple of predefined regular
expressions, which you can specify as classes:
.Sp
.RS 4
* :tab:
.Sp
Matches a tab and eats spaces around it.
.Sp
* :spaces:
.Sp
Matches 2 or more spaces.
.Sp
* :pipe:
.Sp
Matches a pipe character and eats spaces around it.
.Sp
* :default:
.Sp
Matches 2 or more spaces or tab. This is the default separator if none
is specified.
.Sp
* :nonword:
.Sp
Matches a non-word character.
.Sp
* :nondigit:
.Sp
Matches a non-digit character.
.Sp
* :special:
.Sp
Matches one or more special chars like brackets, dollar sign, slashes etc.
.Sp
* :nonprint:
.Sp
Matches one or more non-printable characters.
.RE
.SS "\s-1PATTERNS AND FILTERING\s0"
.IX Subsection "PATTERNS AND FILTERING"
You can reduce the rows being displayed by using one or more regular

View File

@@ -13,7 +13,7 @@ tablizer - Manipulate tabular output of other programs
-n, --numbering Enable header numbering
-N, --no-color Disable pattern highlighting
-H, --no-headers Disable headers display
-s, --separator <string> Custom field separator
-s, --separator <string> Custom field separator (maybe char, string or :class:)
-k, --sort-by <int|name> Sort by column (default: 1)
-z, --fuzzy Use fuzzy search [experimental]
-F, --filter <field[!]=reg> Filter given field with regex, can be used multiple times
@@ -153,6 +153,62 @@ Sorts timestamps.
Finally the B<-d> option enables debugging output which is mostly
useful for the developer.
=head2 SEPARATOR
The option B<-s> can be a single character, in which case the CSV
parser will be invoked. You can also specify a string as
separator. The string will be interpreted as literal string unless it
is a valid go regular expression. For example:
-s '\t{2,}\'
is being used as a regexp and will match two or more consecutive tabs.
-s 'foo'
on the other hand is no regular expression and will be used literally.
To make live easier, there are a couple of predefined regular
expressions, which you can specify as classes:
=over
* :tab:
Matches a tab and eats spaces around it.
* :spaces:
Matches 2 or more spaces.
* :pipe:
Matches a pipe character and eats spaces around it.
* :default:
Matches 2 or more spaces or tab. This is the default separator if none
is specified.
* :nonword:
Matches a non-word character.
* :nondigit:
Matches a non-digit character.
* :special:
Matches one or more special chars like brackets, dollar sign, slashes etc.
* :nonprint:
Matches one or more non-printable characters.
=back
=head2 PATTERNS AND FILTERING
You can reduce the rows being displayed by using one or more regular