add some handy builtin character classes as split separators (#84)

2026-02-04 10:20:59 +01:00 · 2025-10-09 23:03:57 +02:00
parent 253ef8262e
commit ec0b210167
7 changed files with 249 additions and 13 deletions
--- a/cfg/config.go
+++ b/cfg/config.go
@@ -27,13 +27,26 @@ import (
 	"github.com/hashicorp/hcl/v2/hclsimple"
 )
-const DefaultSeparator string = `(\s\s+|\t)`
+const (
-const Version string = "v1.5.8"
+	Version  = "v1.5.9"
-const MAXPARTS = 2
+	MAXPARTS = 2
 )
-var DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
+var (
 	DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
 	VERSION           string // maintained by -x
-var VERSION string // maintained by -x
+	SeparatorTemplates = map[string]string{
 		":tab:":      `\s*\t\s*`,                               // one tab, swallowing any whitespace around it
 		":spaces:":   `\s{2,}`,                                 // 2 or more whitespace characters
 		":pipe:":     `\s*\|\s*`,                               // one pipe, swallowing any whitespace around it
 		":default:":  `(\s\s+|\t)`,                             // 2 or more whitespace characters, or a single tab
 		":nonword:":  `\W`,                                     // a single non-word character (not a word boundary, that would be \b)
 		":nondigit:": `\D`,                                     // a single non-digit character
 		":special:":  `[\*\+\-_\(\)\[\]\{\}?\\/<>=&$§"':,\^]+`, // one or more of the listed special characters
 		":nonprint:": `[[:^print:]]+`,                          // one or more non-printable characters
 	}
 )
 // public config, set via config file or using defaults
 type Settings struct {
@@ -356,6 +369,13 @@ func (conf *Config) ApplyDefaults() {
 	if conf.OutputMode == Yaml || conf.OutputMode == CSV {
 		conf.Numbering = false
 	}
 	// Expand a builtin :class: name into its regular expression. The length
 	// check keeps an empty separator from panicking on the index accesses
 	// below; names without a matching template are left unchanged.
 	if len(conf.Separator) > 1 && conf.Separator[0] == ':' && conf.Separator[len(conf.Separator)-1] == ':' {
 		if separator, ok := SeparatorTemplates[conf.Separator]; ok {
 			conf.Separator = separator
 		}
 	}
 }
 func (conf *Config) PreparePattern(patterns []*Pattern) error {
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -123,7 +123,7 @@ func Execute() {
 		"Use alternating background colors")
 	rootCmd.PersistentFlags().StringVarP(&ShowCompletion, "completion", "", "",
 		"Display completion code")
-	rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.DefaultSeparator,
+	rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.SeparatorTemplates[":default:"],
 		"Custom field separator")
 	rootCmd.PersistentFlags().StringVarP(&conf.Columns, "columns", "c", "",
 		"Only show the speficied columns (separated by ,)")
--- a/cmd/tablizer.go
+++ b/cmd/tablizer.go
@@ -14,7 +14,7 @@ SYNOPSIS
          -n, --numbering                    Enable header numbering
          -N, --no-color                     Disable pattern highlighting
          -H, --no-headers                   Disable headers display
-          -s, --separator <string>           Custom field separator
+          -s, --separator <string>           Custom field separator (may be a char, string or :class:)
          -k, --sort-by <int|name>           Sort by column (default: 1)
          -z, --fuzzy                        Use fuzzy search [experimental]
          -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -141,6 +141,57 @@ DESCRIPTION
    Finally the -d option enables debugging output which is mostly useful
    for the developer.
  SEPARATOR
    The option -s can be a single character, in which case the CSV parser
    will be invoked. You can also specify a string as separator. The string
    will be interpreted as literal string unless it is a valid go regular
    expression. For example:
        -s '\t{2,}'
    is being used as a regexp and will match two or more consecutive tabs.
        -s 'foo'
    on the other hand is no regular expression and will be used literally.
    To make life easier, there are a couple of predefined regular
    expressions, which you can specify as classes:
        * :tab:
        Matches a tab and eats spaces around it.
        * :spaces:
        Matches 2 or more spaces.
        * :pipe:
        Matches a pipe character and eats spaces around it.
        * :default:
        Matches 2 or more spaces or tab. This is the default separator if
        none is specified.
        * :nonword:
        Matches a non-word character.
        * :nondigit:
        Matches a non-digit character.
        * :special:
        Matches one or more special chars like brackets, dollar sign,
        slashes etc.
        * :nonprint:
        Matches one or more non-printable characters.
  PATTERNS AND FILTERING
    You can reduce the rows being displayed by using one or more regular
    expression patterns. The regexp language being used is the one of
@@ -458,7 +509,7 @@ Operational Flags:
  -n, --numbering                    Enable header numbering
  -N, --no-color                     Disable pattern highlighting
  -H, --no-headers                   Disable headers display
-  -s, --separator <string>           Custom field separator
+  -s, --separator <string>           Custom field separator (may be a char, string or :class:)
  -k, --sort-by <int|name>           Sort by column (default: 1)
  -z, --fuzzy                        Use fuzzy search [experimental]
  -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
--- a/lib/parser_test.go
+++ b/lib/parser_test.go
@@ -34,7 +34,7 @@ var input = []struct {
 }{
 	{
 		name:      "tabular-data",
-		separator: cfg.DefaultSeparator,
+		separator: cfg.SeparatorTemplates[":default:"],
 		text: `
 ONE    TWO    THREE  
 asd    igig   cxxxncnc  
@@ -148,7 +148,7 @@ asd    igig
 19191  EDD 1  X`
 	readFd := strings.NewReader(strings.TrimSpace(table))
-	conf := cfg.Config{Separator: cfg.DefaultSeparator}
+	conf := cfg.Config{Separator: cfg.SeparatorTemplates[":default:"]}
 	gotdata, err := wrapValidateParser(conf, readFd)
 	assert.NoError(t, err)
@@ -314,6 +314,58 @@ func TestParserJSONInput(t *testing.T) {
 	}
 }
 // TestParserSeparators checks the builtin :class: separator names: each
 // case joins the same fields with a sample delimiter, lets ApplyDefaults
 // expand the class name and expects the parser to return the fields again.
 func TestParserSeparators(t *testing.T) {
 	fields := []string{"alpha", "beta", "delta"}
 	cases := []struct {
 		input string
 		sep   string
 	}{
 		{input: `🎲`, sep: ":nonprint:"},
 		{input: `|`, sep: ":pipe:"},
 		{input: `   `, sep: ":spaces:"},
 		{input: "   \t  ", sep: ":tab:"},
 		{input: `-`, sep: ":nonword:"},
 		{input: `//$`, sep: ":special:"},
 	}
 	for _, tc := range cases {
 		t.Run(fmt.Sprintf("parse-%s", tc.sep), func(t *testing.T) {
 			// The header line and the only data row are the same joined line.
 			line := strings.Join(fields, tc.input)
 			table := strings.TrimSpace(line + "\n" + line)
 			conf := cfg.Config{Separator: tc.sep}
 			conf.ApplyDefaults()
 			gotdata, err := wrapValidateParser(conf, strings.NewReader(table))
 			assert.NoError(t, err)
 			assert.EqualValues(t, [][]string{fields}, gotdata.entries)
 		})
 	}
 }
 func wrapValidateParser(conf cfg.Config, input io.Reader) (Tabdata, error) {
 	data, err := Parse(conf, input)
--- a/lib/printer_test.go
+++ b/lib/printer_test.go
@@ -292,6 +292,7 @@ func TestPrinter(t *testing.T) {
 				conf.UseSortByColumn = []int{testdata.column}
 			}
 			conf.Separator = cfg.SeparatorTemplates[":default:"]
 			conf.ApplyDefaults()
 			// the test checks the len!
--- a/tablizer.1
+++ b/tablizer.1
@@ -133,7 +133,7 @@
 .\" ========================================================================
 .\"
 .IX Title "TABLIZER 1"
-.TH TABLIZER 1 "2025-10-01" "1" "User Commands"
+.TH TABLIZER 1 "2025-10-09" "1" "User Commands"
 .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
 .\" way too many mistakes in technical documents.
 .if n .ad l
@@ -152,7 +152,7 @@ tablizer \- Manipulate tabular output of other programs
 \&      \-n, \-\-numbering                    Enable header numbering
 \&      \-N, \-\-no\-color                     Disable pattern highlighting
 \&      \-H, \-\-no\-headers                   Disable headers display
-\&      \-s, \-\-separator <string>           Custom field separator
+\&      \-s, \-\-separator <string>           Custom field separator (may be a char, string or :class:)
 \&      \-k, \-\-sort\-by <int|name>           Sort by column (default: 1)
 \&      \-z, \-\-fuzzy                        Use fuzzy search [experimental]
 \&      \-F, \-\-filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -293,6 +293,62 @@ Sorts timestamps.
 .PP
 Finally the  \fB\-d\fR option  enables debugging  output which  is mostly
 useful for the developer.
 .SS "\s-1SEPARATOR\s0"
 .IX Subsection "SEPARATOR"
 The option \fB\-s\fR can be a single character, in which case the \s-1CSV\s0
 parser will be invoked. You can also specify a string as
 separator. The string will be interpreted as literal string unless it
 is a valid go regular expression. For example:
 .PP
 .Vb 1
 \&    \-s \*(Aq\et{2,}\*(Aq
 .Ve
 .PP
 is being used as a regexp and will match two or more consecutive tabs.
 .PP
 .Vb 1
 \&    \-s \*(Aqfoo\*(Aq
 .Ve
 .PP
 on the other hand is no regular expression and will be used literally.
 .PP
 To make life easier, there are a couple of predefined regular
 expressions, which you can specify as classes:
 .Sp
 .RS 4
 * 		:tab:
 .Sp
 Matches a tab and eats spaces around it.
 .Sp
 *		:spaces:
 .Sp
 Matches 2 or more spaces.
 .Sp
 *		:pipe:
 .Sp
 Matches a pipe character and eats spaces around it.
 .Sp
 *		:default:
 .Sp
 Matches 2 or more spaces or tab. This is the default separator if none
 is specified.
 .Sp
 *		:nonword:
 .Sp
 Matches a non-word character.
 .Sp
 *		:nondigit:
 .Sp
 Matches a non-digit character.
 .Sp
 *		:special:
 .Sp
 Matches one or more special chars like brackets, dollar sign, slashes etc.
 .Sp
 *		:nonprint:
 .Sp
 Matches one or more non-printable characters.
 .RE
 .SS "\s-1PATTERNS AND FILTERING\s0"
 .IX Subsection "PATTERNS AND FILTERING"
 You can reduce  the rows being displayed by using  one or more regular
--- a/tablizer.pod
+++ b/tablizer.pod
@@ -13,7 +13,7 @@ tablizer - Manipulate tabular output of other programs
      -n, --numbering                    Enable header numbering
      -N, --no-color                     Disable pattern highlighting
      -H, --no-headers                   Disable headers display
-      -s, --separator <string>           Custom field separator
+      -s, --separator <string>           Custom field separator (may be a char, string or :class:)
      -k, --sort-by <int|name>           Sort by column (default: 1)
      -z, --fuzzy                        Use fuzzy search [experimental]
      -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -153,6 +153,62 @@ Sorts timestamps.
 Finally the  B<-d> option  enables debugging  output which  is mostly
 useful for the developer.
 =head2 SEPARATOR
 The option B<-s> can be a single character, in which case the CSV
 parser will be invoked. You can also specify a string as
 separator. The string will be interpreted as literal string unless it
 is a valid go regular expression. For example:
    -s '\t{2,}'
 is being used as a regexp and will match two or more consecutive tabs.
    -s 'foo'
 on the other hand is no regular expression and will be used literally.
 To make life easier, there are a couple of predefined regular
 expressions, which you can specify as classes:
 =over
 * 		:tab:      
 Matches a tab and eats spaces around it.
 *		:spaces:
 Matches 2 or more spaces.
 *		:pipe:
 Matches a pipe character and eats spaces around it.
 *		:default:
 Matches 2 or more spaces or tab. This is the default separator if none
 is specified.
 *		:nonword:
 Matches a non-word character.
 *		:nondigit:
 Matches a non-digit character.
 *		:special:
 Matches one or more special chars like brackets, dollar sign, slashes etc.
 *		:nonprint:
 Matches one or more non-printable characters.
 =back
 =head2 PATTERNS AND FILTERING
 You can reduce  the rows being displayed by using  one or more regular