mirror of
https://codeberg.org/scip/tablizer.git
synced 2025-12-17 04:30:56 +01:00
fix #1: split each line with the separator regexp's Split() instead of splitting by header position
boundaries, since position-based splitting cuts multi-byte UTF-8 characters, which causes distorted output.
This commit is contained in:
@@ -25,7 +25,7 @@ var (
|
|||||||
ShowVersion bool
|
ShowVersion bool
|
||||||
Columns string
|
Columns string
|
||||||
UseColumns []int
|
UseColumns []int
|
||||||
Separator string
|
Separator string = `(\s\s+|\t)`
|
||||||
OutflagExtended bool
|
OutflagExtended bool
|
||||||
OutflagMarkdown bool
|
OutflagMarkdown bool
|
||||||
OutflagOrgtable bool
|
OutflagOrgtable bool
|
||||||
|
|||||||
@@ -29,49 +29,37 @@ import (
|
|||||||
|
|
||||||
// contains a whole parsed table
|
// contains a whole parsed table
|
||||||
type Tabdata struct {
|
type Tabdata struct {
|
||||||
maxwidthHeader int // longest header
|
maxwidthHeader int // longest header
|
||||||
maxwidthPerCol []int // max width per column
|
maxwidthPerCol []int // max width per column
|
||||||
columns int
|
columns int // count
|
||||||
headerIndices []map[string]int // [ {beg=>0, end=>17}, ... ]
|
headers []string // [ "ID", "NAME", ...]
|
||||||
headers []string // [ "ID", "NAME", ...]
|
|
||||||
entries [][]string
|
entries [][]string
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Parse tabular input. We split the header (first line) by 2 or more
|
Parse tabular input.
|
||||||
spaces, remember the positions of the header fields. We then split
|
|
||||||
the data (everything after the first line) by those positions. That
|
|
||||||
way we can turn "tabular data" (with fields containing whitespaces)
|
|
||||||
into real tabular data. We re-tabulate our input if you will.
|
|
||||||
*/
|
*/
|
||||||
func parseFile(input io.Reader, pattern string) (Tabdata, error) {
|
func parseFile(input io.Reader, pattern string) (Tabdata, error) {
|
||||||
data := Tabdata{}
|
data := Tabdata{}
|
||||||
|
|
||||||
var scanner *bufio.Scanner
|
var scanner *bufio.Scanner
|
||||||
var spaces = `\s\s+|$`
|
|
||||||
|
|
||||||
if len(Separator) > 0 {
|
|
||||||
spaces = Separator
|
|
||||||
}
|
|
||||||
|
|
||||||
hadFirst := false
|
hadFirst := false
|
||||||
spacefinder := regexp.MustCompile(spaces)
|
separate := regexp.MustCompile(Separator)
|
||||||
beg := 0
|
|
||||||
|
patternR, err := regexp.Compile(pattern)
|
||||||
|
if err != nil {
|
||||||
|
return data, errors.Unwrap(fmt.Errorf("Regexp pattern %s is invalid: %w", pattern, err))
|
||||||
|
}
|
||||||
|
|
||||||
scanner = bufio.NewScanner(input)
|
scanner = bufio.NewScanner(input)
|
||||||
|
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := strings.TrimSpace(scanner.Text())
|
line := strings.TrimSpace(scanner.Text())
|
||||||
values := []string{}
|
parts := separate.Split(line, -1)
|
||||||
|
|
||||||
patternR, err := regexp.Compile(pattern)
|
|
||||||
if err != nil {
|
|
||||||
return data, errors.Unwrap(fmt.Errorf("Regexp pattern %s is invalid: %w", pattern, err))
|
|
||||||
}
|
|
||||||
|
|
||||||
if !hadFirst {
|
if !hadFirst {
|
||||||
// header processing
|
// header processing
|
||||||
parts := spacefinder.FindAllStringIndex(line, -1)
|
|
||||||
data.columns = len(parts)
|
data.columns = len(parts)
|
||||||
// if Debug {
|
// if Debug {
|
||||||
// fmt.Println(parts)
|
// fmt.Println(parts)
|
||||||
@@ -83,30 +71,14 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
|
|||||||
// fmt.Printf("Part: <%s>\n", string(line[beg:part[0]]))
|
// fmt.Printf("Part: <%s>\n", string(line[beg:part[0]]))
|
||||||
//}
|
//}
|
||||||
|
|
||||||
// current field
|
|
||||||
head := string(line[beg:part[0]])
|
|
||||||
|
|
||||||
// register begin and end of field within line
|
|
||||||
indices := make(map[string]int)
|
|
||||||
indices["beg"] = beg
|
|
||||||
if part[0] == part[1] {
|
|
||||||
indices["end"] = 0
|
|
||||||
} else {
|
|
||||||
indices["end"] = part[1] - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
// register widest header field
|
// register widest header field
|
||||||
headerlen := len(head)
|
headerlen := len(part)
|
||||||
if headerlen > data.maxwidthHeader {
|
if headerlen > data.maxwidthHeader {
|
||||||
data.maxwidthHeader = headerlen
|
data.maxwidthHeader = headerlen
|
||||||
}
|
}
|
||||||
|
|
||||||
// register fields data
|
// register fields data
|
||||||
data.headerIndices = append(data.headerIndices, indices)
|
data.headers = append(data.headers, strings.TrimSpace(part))
|
||||||
data.headers = append(data.headers, head)
|
|
||||||
|
|
||||||
// end of current field == begin of next one
|
|
||||||
beg = part[1]
|
|
||||||
|
|
||||||
// done
|
// done
|
||||||
hadFirst = true
|
hadFirst = true
|
||||||
@@ -124,16 +96,9 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
idx := 0 // we cannot use the header index, because we could exclude columns
|
idx := 0 // we cannot use the header index, because we could exclude columns
|
||||||
for _, index := range data.headerIndices {
|
values := []string{}
|
||||||
value := ""
|
for _, part := range parts {
|
||||||
|
width := len(strings.TrimSpace(part))
|
||||||
if index["end"] == 0 {
|
|
||||||
value = string(line[index["beg"]:])
|
|
||||||
} else {
|
|
||||||
value = string(line[index["beg"]:index["end"]])
|
|
||||||
}
|
|
||||||
|
|
||||||
width := len(strings.TrimSpace(value))
|
|
||||||
|
|
||||||
if len(data.maxwidthPerCol)-1 < idx {
|
if len(data.maxwidthPerCol)-1 < idx {
|
||||||
data.maxwidthPerCol = append(data.maxwidthPerCol, width)
|
data.maxwidthPerCol = append(data.maxwidthPerCol, width)
|
||||||
@@ -146,7 +111,7 @@ func parseFile(input io.Reader, pattern string) (Tabdata, error) {
|
|||||||
// if Debug {
|
// if Debug {
|
||||||
// fmt.Printf("<%s> ", value)
|
// fmt.Printf("<%s> ", value)
|
||||||
// }
|
// }
|
||||||
values = append(values, strings.TrimSpace(value))
|
values = append(values, strings.TrimSpace(part))
|
||||||
|
|
||||||
idx++
|
idx++
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -33,20 +33,6 @@ func TestParser(t *testing.T) {
|
|||||||
8,
|
8,
|
||||||
},
|
},
|
||||||
columns: 3,
|
columns: 3,
|
||||||
headerIndices: []map[string]int{
|
|
||||||
map[string]int{
|
|
||||||
"beg": 0,
|
|
||||||
"end": 6,
|
|
||||||
},
|
|
||||||
map[string]int{
|
|
||||||
"end": 13,
|
|
||||||
"beg": 7,
|
|
||||||
},
|
|
||||||
map[string]int{
|
|
||||||
"beg": 14,
|
|
||||||
"end": 0,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
headers: []string{
|
headers: []string{
|
||||||
"ONE",
|
"ONE",
|
||||||
"TWO",
|
"TWO",
|
||||||
@@ -78,7 +64,7 @@ asd igig cxxxncnc
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !reflect.DeepEqual(data, gotdata) {
|
if !reflect.DeepEqual(data, gotdata) {
|
||||||
t.Errorf("Parser returned invalid data\nExp: %+v\nGot: %+v\n", data, gotdata)
|
t.Errorf("Parser returned invalid data, Regex: %s\nExp: %+v\nGot: %+v\n", Separator, data, gotdata)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user