From ec0b210167e6e8d4a6ccbc64bd62b12ff8621fcb Mon Sep 17 00:00:00 2001 From: "T.v.Dein" Date: Thu, 9 Oct 2025 23:03:57 +0200 Subject: [PATCH] add some handy builtin character classes as split separators (#84) --- cfg/config.go | 30 +++++++++++++++++++---- cmd/root.go | 2 +- cmd/tablizer.go | 55 +++++++++++++++++++++++++++++++++++++++-- lib/parser_test.go | 56 ++++++++++++++++++++++++++++++++++++++++-- lib/printer_test.go | 1 + tablizer.1 | 60 +++++++++++++++++++++++++++++++++++++++++++-- tablizer.pod | 58 ++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 249 insertions(+), 13 deletions(-) diff --git a/cfg/config.go b/cfg/config.go index ca7da0b..610d32c 100644 --- a/cfg/config.go +++ b/cfg/config.go @@ -27,13 +27,26 @@ import ( "github.com/hashicorp/hcl/v2/hclsimple" ) -const DefaultSeparator string = `(\s\s+|\t)` -const Version string = "v1.5.8" -const MAXPARTS = 2 +const ( + Version = "v1.5.9" + MAXPARTS = 2 +) -var DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config" +var ( + DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config" + VERSION string // maintained by -x -var VERSION string // maintained by -x + SeparatorTemplates = map[string]string{ + ":tab:": `\s*\t\s*`, // tab but eats spaces around + ":spaces:": `\s{2,}`, // 2 or more spaces + ":pipe:": `\s*\|\s*`, // one pipe eating spaces around + ":default:": `(\s\s+|\t)`, // 2 or more spaces or tab + ":nonword:": `\W`, // word boundary + ":nondigit:": `\D`, // same for numbers + ":special:": `[\*\+\-_\(\)\[\]\{\}?\\/<>=&$§"':,\^]+`, // match any special char + ":nonprint:": `[[:^print:]]+`, // non printables + } +) // public config, set via config file or using defaults type Settings struct { @@ -356,6 +369,13 @@ func (conf *Config) ApplyDefaults() { if conf.OutputMode == Yaml || conf.OutputMode == CSV { conf.Numbering = false } + + if conf.Separator[0] == ':' && conf.Separator[len(conf.Separator)-1] == ':' { + separator, ok := SeparatorTemplates[conf.Separator] + if ok { 
+ conf.Separator = separator + } + } } func (conf *Config) PreparePattern(patterns []*Pattern) error { diff --git a/cmd/root.go b/cmd/root.go index 066abea..24d12ef 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -123,7 +123,7 @@ func Execute() { "Use alternating background colors") rootCmd.PersistentFlags().StringVarP(&ShowCompletion, "completion", "", "", "Display completion code") - rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.DefaultSeparator, + rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.SeparatorTemplates[":default:"], "Custom field separator") rootCmd.PersistentFlags().StringVarP(&conf.Columns, "columns", "c", "", "Only show the speficied columns (separated by ,)") diff --git a/cmd/tablizer.go b/cmd/tablizer.go index 5b68746..0a84dc5 100644 --- a/cmd/tablizer.go +++ b/cmd/tablizer.go @@ -14,7 +14,7 @@ SYNOPSIS -n, --numbering Enable header numbering -N, --no-color Disable pattern highlighting -H, --no-headers Disable headers display - -s, --separator Custom field separator + -s, --separator Custom field separator (maybe char, string or :class:) -k, --sort-by Sort by column (default: 1) -z, --fuzzy Use fuzzy search [experimental] -F, --filter Filter given field with regex, can be used multiple times @@ -141,6 +141,57 @@ DESCRIPTION Finally the -d option enables debugging output which is mostly useful for the developer. + SEPARATOR + The option -s can be a single character, in which case the CSV parser + will be invoked. You can also specify a string as separator. The string + will be interpreted as literal string unless it is a valid go regular + expression. For example: + + -s '\t{2,}\' + + is being used as a regexp and will match two or more consecutive tabs. + + -s 'foo' + + on the other hand is no regular expression and will be used literally. 
+ + To make live easier, there are a couple of predefined regular + expressions, which you can specify as classes: + + * :tab: + + Matches a tab and eats spaces around it. + + * :spaces: + + Matches 2 or more spaces. + + * :pipe: + + Matches a pipe character and eats spaces around it. + + * :default: + + Matches 2 or more spaces or tab. This is the default separator if + none is specified. + + * :nonword: + + Matches a non-word character. + + * :nondigit: + + Matches a non-digit character. + + * :special: + + Matches one or more special chars like brackets, dollar sign, + slashes etc. + + * :nonprint: + + Matches one or more non-printable characters. + PATTERNS AND FILTERING You can reduce the rows being displayed by using one or more regular expression patterns. The regexp language being used is the one of @@ -458,7 +509,7 @@ Operational Flags: -n, --numbering Enable header numbering -N, --no-color Disable pattern highlighting -H, --no-headers Disable headers display - -s, --separator Custom field separator + -s, --separator Custom field separator (maybe char, string or :class:) -k, --sort-by Sort by column (default: 1) -z, --fuzzy Use fuzzy search [experimental] -F, --filter Filter given field with regex, can be used multiple times diff --git a/lib/parser_test.go b/lib/parser_test.go index 652680e..74fe685 100644 --- a/lib/parser_test.go +++ b/lib/parser_test.go @@ -34,7 +34,7 @@ var input = []struct { }{ { name: "tabular-data", - separator: cfg.DefaultSeparator, + separator: cfg.SeparatorTemplates[":default:"], text: ` ONE TWO THREE asd igig cxxxncnc @@ -148,7 +148,7 @@ asd igig 19191 EDD 1 X` readFd := strings.NewReader(strings.TrimSpace(table)) - conf := cfg.Config{Separator: cfg.DefaultSeparator} + conf := cfg.Config{Separator: cfg.SeparatorTemplates[":default:"]} gotdata, err := wrapValidateParser(conf, readFd) assert.NoError(t, err) @@ -314,6 +314,58 @@ func TestParserJSONInput(t *testing.T) { } } +func TestParserSeparators(t *testing.T) { + list := 
[]string{"alpha", "beta", "delta"} + + tests := []struct { + input string + sep string + }{ + { + input: `🎲`, + sep: ":nonprint:", + }, + { + input: `|`, + sep: ":pipe:", + }, + { + input: ` `, + sep: ":spaces:", + }, + { + input: " \t ", + sep: ":tab:", + }, + { + input: `-`, + sep: ":nonword:", + }, + { + input: `//$`, + sep: ":special:", + }, + } + + for _, testdata := range tests { + testname := fmt.Sprintf("parse-%s", testdata.sep) + t.Run(testname, func(t *testing.T) { + header := strings.Join(list, testdata.input) + row := header + content := header + "\n" + row + + readFd := strings.NewReader(strings.TrimSpace(content)) + conf := cfg.Config{Separator: testdata.sep} + conf.ApplyDefaults() + + gotdata, err := wrapValidateParser(conf, readFd) + + assert.NoError(t, err) + assert.EqualValues(t, [][]string{list}, gotdata.entries) + }) + } +} + func wrapValidateParser(conf cfg.Config, input io.Reader) (Tabdata, error) { data, err := Parse(conf, input) diff --git a/lib/printer_test.go b/lib/printer_test.go index 00b294c..7506a90 100644 --- a/lib/printer_test.go +++ b/lib/printer_test.go @@ -292,6 +292,7 @@ func TestPrinter(t *testing.T) { conf.UseSortByColumn = []int{testdata.column} } + conf.Separator = cfg.SeparatorTemplates[":default:"] conf.ApplyDefaults() // the test checks the len! diff --git a/tablizer.1 b/tablizer.1 index 7bc687d..6927aa5 100644 --- a/tablizer.1 +++ b/tablizer.1 @@ -133,7 +133,7 @@ .\" ======================================================================== .\" .IX Title "TABLIZER 1" -.TH TABLIZER 1 "2025-10-01" "1" "User Commands" +.TH TABLIZER 1 "2025-10-09" "1" "User Commands" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. 
.if n .ad l @@ -152,7 +152,7 @@ tablizer \- Manipulate tabular output of other programs \& \-n, \-\-numbering Enable header numbering \& \-N, \-\-no\-color Disable pattern highlighting \& \-H, \-\-no\-headers Disable headers display -\& \-s, \-\-separator Custom field separator +\& \-s, \-\-separator Custom field separator (may be char, string or :class:) \& \-k, \-\-sort\-by Sort by column (default: 1) \& \-z, \-\-fuzzy Use fuzzy search [experimental] \& \-F, \-\-filter Filter given field with regex, can be used multiple times @@ -293,6 +293,62 @@ Sorts timestamps. .PP Finally the \fB\-d\fR option enables debugging output which is mostly useful for the developer. +.SS "\s-1SEPARATOR\s0" +.IX Subsection "SEPARATOR" +The option \fB\-s\fR can be a single character, in which case the \s-1CSV\s0 +parser will be invoked. You can also specify a string as +separator. The string will be interpreted as literal string unless it +is a valid go regular expression. For example: +.PP +.Vb 1 +\& \-s \*(Aq\et{2,}\*(Aq +.Ve +.PP +is being used as a regexp and will match two or more consecutive tabs. +.PP +.Vb 1 +\& \-s \*(Aqfoo\*(Aq +.Ve +.PP +on the other hand is no regular expression and will be used literally. +.PP +To make life easier, there are a couple of predefined regular +expressions, which you can specify as classes: +.Sp +.RS 4 +* :tab: +.Sp +Matches a tab and eats spaces around it. +.Sp +* :spaces: +.Sp +Matches 2 or more spaces. +.Sp +* :pipe: +.Sp +Matches a pipe character and eats spaces around it. +.Sp +* :default: +.Sp +Matches 2 or more spaces or tab. This is the default separator if none +is specified. +.Sp +* :nonword: +.Sp +Matches a non-word character. +.Sp +* :nondigit: +.Sp +Matches a non-digit character. +.Sp +* :special: +.Sp +Matches one or more special chars like brackets, dollar sign, slashes etc. +.Sp +* :nonprint: +.Sp +Matches one or more non-printable characters. 
+.RE .SS "\s-1PATTERNS AND FILTERING\s0" .IX Subsection "PATTERNS AND FILTERING" You can reduce the rows being displayed by using one or more regular diff --git a/tablizer.pod b/tablizer.pod index c183705..e731a94 100644 --- a/tablizer.pod +++ b/tablizer.pod @@ -13,7 +13,7 @@ tablizer - Manipulate tabular output of other programs -n, --numbering Enable header numbering -N, --no-color Disable pattern highlighting -H, --no-headers Disable headers display - -s, --separator Custom field separator + -s, --separator Custom field separator (may be char, string or :class:) -k, --sort-by Sort by column (default: 1) -z, --fuzzy Use fuzzy search [experimental] -F, --filter Filter given field with regex, can be used multiple times @@ -153,6 +153,62 @@ Sorts timestamps. Finally the B<-d> option enables debugging output which is mostly useful for the developer. +=head2 SEPARATOR + +The option B<-s> can be a single character, in which case the CSV +parser will be invoked. You can also specify a string as +separator. The string will be interpreted as literal string unless it +is a valid go regular expression. For example: + + -s '\t{2,}' + +is being used as a regexp and will match two or more consecutive tabs. + + -s 'foo' + +on the other hand is no regular expression and will be used literally. + +To make life easier, there are a couple of predefined regular +expressions, which you can specify as classes: + +=over + +* :tab: + +Matches a tab and eats spaces around it. + +* :spaces: + +Matches 2 or more spaces. + +* :pipe: + +Matches a pipe character and eats spaces around it. + +* :default: + +Matches 2 or more spaces or tab. This is the default separator if none +is specified. + +* :nonword: + +Matches a non-word character. + +* :nondigit: + +Matches a non-digit character. + +* :special: + +Matches one or more special chars like brackets, dollar sign, slashes etc. + +* :nonprint: + +Matches one or more non-printable characters. 
+ + +=back + =head2 PATTERNS AND FILTERING You can reduce the rows being displayed by using one or more regular