From ec0b210167e6e8d4a6ccbc64bd62b12ff8621fcb Mon Sep 17 00:00:00 2001
From: "T.v.Dein" <git@daemon.de>
Date: Thu, 9 Oct 2025 23:03:57 +0200
Subject: [PATCH] add some handy builtin character classes as split separators
 (#84)

---
 cfg/config.go       | 30 +++++++++++++++++++----
 cmd/root.go         |  2 +-
 cmd/tablizer.go     | 55 +++++++++++++++++++++++++++++++++++++++--
 lib/parser_test.go  | 56 ++++++++++++++++++++++++++++++++++++++++--
 lib/printer_test.go |  1 +
 tablizer.1          | 60 +++++++++++++++++++++++++++++++++++++++++++--
 tablizer.pod        | 58 ++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 249 insertions(+), 13 deletions(-)
diff --git a/cfg/config.go b/cfg/config.go
index ca7da0b..610d32c 100644
--- a/cfg/config.go
+++ b/cfg/config.go
@@ -27,13 +27,26 @@ import (
 	"github.com/hashicorp/hcl/v2/hclsimple"
 )
 
-const DefaultSeparator string = `(\s\s+|\t)`
-const Version string = "v1.5.8"
-const MAXPARTS = 2
+const (
+	Version  = "v1.5.9"
+	MAXPARTS = 2
+)
 
-var DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
+var (
+	DefaultConfigfile = os.Getenv("HOME") + "/.config/tablizer/config"
+	VERSION           string // maintained by -x
 
-var VERSION string // maintained by -x
+	SeparatorTemplates = map[string]string{
+		":tab:":      `\s*\t\s*`,                               // one tab, eats whitespace around it
+		":spaces:":   `\s{2,}`,                                 // 2 or more whitespace chars
+		":pipe:":     `\s*\|\s*`,                               // one pipe, eats whitespace around it
+		":default:":  `(\s\s+|\t)`,                             // 2 or more whitespace chars or one tab
+		":nonword:":  `\W`,                                     // one non-word char (not a word boundary)
+		":nondigit:": `\D`,                                     // one non-digit char
+		":special:":  `[\*\+\-_\(\)\[\]\{\}?\\/<>=&$§"':,\^]+`, // one or more of the listed special chars
+		":nonprint:": `[[:^print:]]+`,                          // one or more chars outside printable ASCII
+	}
+)
 
 // public config, set via config file or using defaults
 type Settings struct {
@@ -356,6 +369,13 @@ func (conf *Config) ApplyDefaults() {
 	if conf.OutputMode == Yaml || conf.OutputMode == CSV {
 		conf.Numbering = false
 	}
+
+	// resolve builtin :class: names; the len guard avoids an index panic on an empty separator
+	if len(conf.Separator) > 1 && conf.Separator[0] == ':' && conf.Separator[len(conf.Separator)-1] == ':' {
+		if separator, ok := SeparatorTemplates[conf.Separator]; ok {
+			conf.Separator = separator
+		}
+	}
 }
 
 func (conf *Config) PreparePattern(patterns []*Pattern) error {
diff --git a/cmd/root.go b/cmd/root.go
index 066abea..24d12ef 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -123,7 +123,7 @@ func Execute() {
 		"Use alternating background colors")
 	rootCmd.PersistentFlags().StringVarP(&ShowCompletion, "completion", "", "",
 		"Display completion code")
-	rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.DefaultSeparator,
+	rootCmd.PersistentFlags().StringVarP(&conf.Separator, "separator", "s", cfg.SeparatorTemplates[":default:"],
 		"Custom field separator")
 	rootCmd.PersistentFlags().StringVarP(&conf.Columns, "columns", "c", "",
 		"Only show the speficied columns (separated by ,)")
diff --git a/cmd/tablizer.go b/cmd/tablizer.go
index 5b68746..0a84dc5 100644
--- a/cmd/tablizer.go
+++ b/cmd/tablizer.go
@@ -14,7 +14,7 @@ SYNOPSIS
           -n, --numbering                    Enable header numbering
           -N, --no-color                     Disable pattern highlighting
           -H, --no-headers                   Disable headers display
-          -s, --separator <string>           Custom field separator
+          -s, --separator <string>           Custom field separator (may be char, string or :class:)
           -k, --sort-by <int|name>           Sort by column (default: 1)
           -z, --fuzzy                        Use fuzzy search [experimental]
           -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -141,6 +141,57 @@ DESCRIPTION
     Finally the -d option enables debugging output which is mostly useful
     for the developer.
 
+  SEPARATOR
+    The option -s can be a single character, in which case the CSV parser
+    will be invoked. You can also specify a string as separator. The string
+    will be interpreted as literal string unless it is a valid go regular
+    expression. For example:
+
+        -s '\t{2,}'
+
+    is being used as a regexp and will match two or more consecutive tabs.
+
+        -s 'foo'
+
+    on the other hand is no regular expression and will be used literally.
+
+    To make life easier, there are a couple of predefined regular
+    expressions, which you can specify as classes:
+
+        * :tab:
+
+        Matches a tab and eats spaces around it.
+
+        * :spaces:
+
+        Matches 2 or more spaces.
+
+        * :pipe:
+
+        Matches a pipe character and eats spaces around it.
+
+        * :default:
+
+        Matches 2 or more spaces or tab. This is the default separator if
+        none is specified.
+
+        * :nonword:
+
+        Matches a non-word character.
+
+        * :nondigit:
+
+        Matches a non-digit character.
+
+        * :special:
+
+        Matches one or more special chars like brackets, dollar sign,
+        slashes etc.
+
+        * :nonprint:
+
+        Matches one or more non-printable characters.
+
   PATTERNS AND FILTERING
     You can reduce the rows being displayed by using one or more regular
     expression patterns. The regexp language being used is the one of
@@ -458,7 +509,7 @@ Operational Flags:
   -n, --numbering                    Enable header numbering
   -N, --no-color                     Disable pattern highlighting
   -H, --no-headers                   Disable headers display
-  -s, --separator <string>           Custom field separator
+  -s, --separator <string>           Custom field separator (may be char, string or :class:)
   -k, --sort-by <int|name>           Sort by column (default: 1)
   -z, --fuzzy                        Use fuzzy search [experimental]
   -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
diff --git a/lib/parser_test.go b/lib/parser_test.go
index 652680e..74fe685 100644
--- a/lib/parser_test.go
+++ b/lib/parser_test.go
@@ -34,7 +34,7 @@ var input = []struct {
 }{
 	{
 		name:      "tabular-data",
-		separator: cfg.DefaultSeparator,
+		separator: cfg.SeparatorTemplates[":default:"],
 		text: `
 ONE    TWO    THREE  
 asd    igig   cxxxncnc  
@@ -148,7 +148,7 @@ asd    igig
 19191  EDD 1  X`
 
 	readFd := strings.NewReader(strings.TrimSpace(table))
-	conf := cfg.Config{Separator: cfg.DefaultSeparator}
+	conf := cfg.Config{Separator: cfg.SeparatorTemplates[":default:"]}
 	gotdata, err := wrapValidateParser(conf, readFd)
 
 	assert.NoError(t, err)
@@ -314,6 +314,58 @@ func TestParserJSONInput(t *testing.T) {
 	}
 }
 
+// TestParserSeparators checks that each builtin :class: separator, once
+// resolved by ApplyDefaults, splits a joined line back into its fields.
+func TestParserSeparators(t *testing.T) {
+	fields := []string{"alpha", "beta", "delta"}
+
+	cases := []struct {
+		sep   string
+		input string
+	}{
+		{
+			sep:   ":nonprint:",
+			input: `🎲`,
+		},
+		{
+			sep:   ":pipe:",
+			input: `|`,
+		},
+		{
+			sep:   ":spaces:",
+			input: `   `,
+		},
+		{
+			sep:   ":tab:",
+			input: "   \t  ",
+		},
+		{
+			sep:   ":nonword:",
+			input: `-`,
+		},
+		{
+			sep:   ":special:",
+			input: `//$`,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run("parse-"+tc.sep, func(t *testing.T) {
+			// header and the single data row are the same joined line
+			line := strings.Join(fields, tc.input)
+			table := strings.TrimSpace(line + "\n" + line)
+
+			conf := cfg.Config{Separator: tc.sep}
+			conf.ApplyDefaults()
+
+			gotdata, err := wrapValidateParser(conf, strings.NewReader(table))
+
+			assert.NoError(t, err)
+			assert.EqualValues(t, [][]string{fields}, gotdata.entries)
+		})
+	}
+}
+
 func wrapValidateParser(conf cfg.Config, input io.Reader) (Tabdata, error) {
 	data, err := Parse(conf, input)
 
diff --git a/lib/printer_test.go b/lib/printer_test.go
index 00b294c..7506a90 100644
--- a/lib/printer_test.go
+++ b/lib/printer_test.go
@@ -292,6 +292,7 @@ func TestPrinter(t *testing.T) {
 				conf.UseSortByColumn = []int{testdata.column}
 			}
 
+			conf.Separator = cfg.SeparatorTemplates[":default:"]
 			conf.ApplyDefaults()
 
 			// the test checks the len!
diff --git a/tablizer.1 b/tablizer.1
index 7bc687d..6927aa5 100644
--- a/tablizer.1
+++ b/tablizer.1
@@ -133,7 +133,7 @@
 .\" ========================================================================
 .\"
 .IX Title "TABLIZER 1"
-.TH TABLIZER 1 "2025-10-01" "1" "User Commands"
+.TH TABLIZER 1 "2025-10-09" "1" "User Commands"
 .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
 .\" way too many mistakes in technical documents.
 .if n .ad l
@@ -152,7 +152,7 @@ tablizer \- Manipulate tabular output of other programs
 \&      \-n, \-\-numbering                    Enable header numbering
 \&      \-N, \-\-no\-color                     Disable pattern highlighting
 \&      \-H, \-\-no\-headers                   Disable headers display
-\&      \-s, \-\-separator <string>           Custom field separator
+\&      \-s, \-\-separator <string>           Custom field separator (may be char, string or :class:)
 \&      \-k, \-\-sort\-by <int|name>           Sort by column (default: 1)
 \&      \-z, \-\-fuzzy                        Use fuzzy search [experimental]
 \&      \-F, \-\-filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -293,6 +293,62 @@ Sorts timestamps.
 .PP
 Finally the  \fB\-d\fR option  enables debugging  output which  is mostly
 useful for the developer.
+.SS "\s-1SEPARATOR\s0"
+.IX Subsection "SEPARATOR"
+The option \fB\-s\fR can be a single character, in which case the \s-1CSV\s0
+parser will be invoked. You can also specify a string as
+separator. The string will be interpreted as literal string unless it
+is a valid go regular expression. For example:
+.PP
+.Vb 1
+\&    \-s \*(Aq\et{2,}\*(Aq
+.Ve
+.PP
+is being used as a regexp and will match two or more consecutive tabs.
+.PP
+.Vb 1
+\&    \-s \*(Aqfoo\*(Aq
+.Ve
+.PP
+on the other hand is no regular expression and will be used literally.
+.PP
+To make life easier, there are a couple of predefined regular
+expressions, which you can specify as classes:
+.Sp
+.RS 4
+* 		:tab:
+.Sp
+Matches a tab and eats spaces around it.
+.Sp
+*		:spaces:
+.Sp
+Matches 2 or more spaces.
+.Sp
+*		:pipe:
+.Sp
+Matches a pipe character and eats spaces around it.
+.Sp
+*		:default:
+.Sp
+Matches 2 or more spaces or tab. This is the default separator if none
+is specified.
+.Sp
+*		:nonword:
+.Sp
+Matches a non-word character.
+.Sp
+*		:nondigit:
+.Sp
+Matches a non-digit character.
+.Sp
+*		:special:
+.Sp
+Matches one or more special chars like brackets, dollar sign, slashes etc.
+.Sp
+*		:nonprint:
+.Sp
+Matches one or more non-printable characters.
+.RE
 .SS "\s-1PATTERNS AND FILTERING\s0"
 .IX Subsection "PATTERNS AND FILTERING"
 You can reduce  the rows being displayed by using  one or more regular
diff --git a/tablizer.pod b/tablizer.pod
index c183705..e731a94 100644
--- a/tablizer.pod
+++ b/tablizer.pod
@@ -13,7 +13,7 @@ tablizer - Manipulate tabular output of other programs
       -n, --numbering                    Enable header numbering
       -N, --no-color                     Disable pattern highlighting
       -H, --no-headers                   Disable headers display
-      -s, --separator <string>           Custom field separator
+      -s, --separator <string>           Custom field separator (may be char, string or :class:)
       -k, --sort-by <int|name>           Sort by column (default: 1)
       -z, --fuzzy                        Use fuzzy search [experimental]
       -F, --filter <field[!]=reg>        Filter given field with regex, can be used multiple times
@@ -153,6 +153,62 @@ Sorts timestamps.
 Finally the  B<-d> option  enables debugging  output which  is mostly
 useful for the developer.
 
+=head2 SEPARATOR
+
+The option B<-s> can be a single character, in which case the CSV
+parser will be invoked. You can also specify a string as
+separator. The string will be interpreted as literal string unless it
+is a valid go regular expression. For example:
+
+    -s '\t{2,}'
+
+is being used as a regexp and will match two or more consecutive tabs.
+
+    -s 'foo'
+
+on the other hand is no regular expression and will be used literally.
+
+To make life easier, there are a couple of predefined regular
+expressions, which you can specify as classes:
+
+=over
+
+*		:tab:
+
+Matches a tab and eats spaces around it.
+
+*		:spaces:
+
+Matches 2 or more spaces.
+
+*		:pipe:
+
+Matches a pipe character and eats spaces around it.
+
+*		:default:
+
+Matches 2 or more spaces or tab. This is the default separator if none
+is specified.
+
+*		:nonword:
+
+Matches a non-word character.
+
+*		:nondigit:
+
+Matches a non-digit character.
+
+*		:special:
+
+Matches one or more special chars like brackets, dollar sign, slashes etc.
+
+*		:nonprint:
+
+Matches one or more non-printable characters.
+
+
+=back
+
 =head2 PATTERNS AND FILTERING
 
 You can reduce  the rows being displayed by using  one or more regular