fix #1: use Regexp.Split() instead of splitting by header position boundaries,
since that splitting cuts UTF-8 characters, which causes distorted output.
This commit is contained in:
2022-10-05 12:55:33 +02:00
parent 2c410e1cb3
commit 26e50cf908
3 changed files with 20 additions and 69 deletions

View File

@@ -25,7 +25,7 @@ var (
ShowVersion bool ShowVersion bool
Columns string Columns string
UseColumns []int UseColumns []int
Separator string Separator string = `(\s\s+|\t)`
OutflagExtended bool OutflagExtended bool
OutflagMarkdown bool OutflagMarkdown bool
OutflagOrgtable bool OutflagOrgtable bool

View File

@@ -29,49 +29,37 @@ import (
// contains a whole parsed table // contains a whole parsed table
type Tabdata struct { type Tabdata struct {
maxwidthHeader int // longest header maxwidthHeader int // longest header
maxwidthPerCol []int // max width per column maxwidthPerCol []int // max width per column
columns int columns int // count
headerIndices []map[string]int // [ {beg=>0, end=>17}, ... ] headers []string // [ "ID", "NAME", ...]
headers []string // [ "ID", "NAME", ...]
entries [][]string entries [][]string
} }
/* /*
Parse tabular input. We split the header (first line) by 2 or more Parse tabular input.
spaces, remember the positions of the header fields. We then split
the data (everything after the first line) by those positions. That
way we can turn "tabular data" (with fields containing whitespaces)
into real tabular data. We re-tabulate our input if you will.
*/ */
func parseFile(input io.Reader, pattern string) (Tabdata, error) { func parseFile(input io.Reader, pattern string) (Tabdata, error) {
data := Tabdata{} data := Tabdata{}
var scanner *bufio.Scanner var scanner *bufio.Scanner
var spaces = `\s\s+|$`
if len(Separator) > 0 {
spaces = Separator
}
hadFirst := false hadFirst := false
spacefinder := regexp.MustCompile(spaces) separate := regexp.MustCompile(Separator)
beg := 0
patternR, err := regexp.Compile(pattern)
if err != nil {
return data, errors.Unwrap(fmt.Errorf("Regexp pattern %s is invalid: %w", pattern, err))
}
scanner = bufio.NewScanner(input) scanner = bufio.NewScanner(input)
for scanner.Scan() { for scanner.Scan() {
line := strings.TrimSpace(scanner.Text()) line := strings.TrimSpace(scanner.Text())
values := []string{} parts := separate.Split(line, -1)
patternR, err := regexp.Compile(pattern)
if err != nil {
return data, errors.Unwrap(fmt.Errorf("Regexp pattern %s is invalid: %w", pattern, err))
}
if !hadFirst { if !hadFirst {
// header processing // header processing
parts := spacefinder.FindAllStringIndex(line, -1)
data.columns = len(parts) data.columns = len(parts)
// if Debug { // if Debug {
// fmt.Println(parts) // fmt.Println(parts)
@@ -83,30 +71,14 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
// fmt.Printf("Part: <%s>\n", string(line[beg:part[0]])) // fmt.Printf("Part: <%s>\n", string(line[beg:part[0]]))
//} //}
// current field
head := string(line[beg:part[0]])
// register begin and end of field within line
indices := make(map[string]int)
indices["beg"] = beg
if part[0] == part[1] {
indices["end"] = 0
} else {
indices["end"] = part[1] - 1
}
// register widest header field // register widest header field
headerlen := len(head) headerlen := len(part)
if headerlen > data.maxwidthHeader { if headerlen > data.maxwidthHeader {
data.maxwidthHeader = headerlen data.maxwidthHeader = headerlen
} }
// register fields data // register fields data
data.headerIndices = append(data.headerIndices, indices) data.headers = append(data.headers, strings.TrimSpace(part))
data.headers = append(data.headers, head)
// end of current field == begin of next one
beg = part[1]
// done // done
hadFirst = true hadFirst = true
@@ -124,16 +96,9 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
} }
idx := 0 // we cannot use the header index, because we could exclude columns idx := 0 // we cannot use the header index, because we could exclude columns
for _, index := range data.headerIndices { values := []string{}
value := "" for _, part := range parts {
width := len(strings.TrimSpace(part))
if index["end"] == 0 {
value = string(line[index["beg"]:])
} else {
value = string(line[index["beg"]:index["end"]])
}
width := len(strings.TrimSpace(value))
if len(data.maxwidthPerCol)-1 < idx { if len(data.maxwidthPerCol)-1 < idx {
data.maxwidthPerCol = append(data.maxwidthPerCol, width) data.maxwidthPerCol = append(data.maxwidthPerCol, width)
@@ -146,7 +111,7 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
// if Debug { // if Debug {
// fmt.Printf("<%s> ", value) // fmt.Printf("<%s> ", value)
// } // }
values = append(values, strings.TrimSpace(value)) values = append(values, strings.TrimSpace(part))
idx++ idx++
} }

View File

@@ -33,20 +33,6 @@ func TestParser(t *testing.T) {
8, 8,
}, },
columns: 3, columns: 3,
headerIndices: []map[string]int{
map[string]int{
"beg": 0,
"end": 6,
},
map[string]int{
"end": 13,
"beg": 7,
},
map[string]int{
"beg": 14,
"end": 0,
},
},
headers: []string{ headers: []string{
"ONE", "ONE",
"TWO", "TWO",
@@ -78,7 +64,7 @@ asd igig cxxxncnc
} }
if !reflect.DeepEqual(data, gotdata) { if !reflect.DeepEqual(data, gotdata) {
t.Errorf("Parser returned invalid data\nExp: %+v\nGot: %+v\n", data, gotdata) t.Errorf("Parser returned invalid data, Regex: %s\nExp: %+v\nGot: %+v\n", Separator, data, gotdata)
} }
} }