Skip to content

Commit

Permalink
Merge pull request #553 from Zemnmez/csvpretty-speedup
Browse files Browse the repository at this point in the history
Speed up csvpretty
  • Loading branch information
Zemnmez authored Jul 14, 2022
2 parents 68e4b8c + ecd33b6 commit e768a74
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 42 deletions.
223 changes: 182 additions & 41 deletions go/cmd/csvpretty/csvpretty.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package main
import (
"bytes"
"encoding/csv"
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
Expand All @@ -26,6 +28,46 @@ func main() {
panic(err)
}

type lineWhitespaceTrimmer struct {
out io.Writer
currentLine []byte
}

func (l lineWhitespaceTrimmer) trimmedCurrentLine() []byte {
return bytes.TrimSpace(l.currentLine)
}

func (l *lineWhitespaceTrimmer) Flush() (err error) {
_, err = l.out.Write(l.trimmedCurrentLine())
l.currentLine = nil

return
}

func (l *lineWhitespaceTrimmer) Close() (err error) {
l.Flush()
return nil
}

func (l *lineWhitespaceTrimmer) Write(b []byte) (n int, err error) {
l.currentLine = append(l.currentLine, b...)

for bytes.Contains(l.currentLine, []byte("\n")) {
splits := bytes.SplitN(l.currentLine, []byte("\n"), 2)
l.currentLine = splits[1]
_, err = l.out.Write(append(bytes.TrimSpace(splits[0]), []byte("\n")...))

if err != nil {
err = fmt.Errorf("lineWhitespaceTrimmer: %w", err)
return
}
}

n = len(b)

return
}

type byteReplacer struct {
out io.Writer
from byte
Expand All @@ -34,19 +76,24 @@ type byteReplacer struct {

func (br byteReplacer) Write(b []byte) (n int, err error) {
n, err = br.out.Write(bytes.Replace(b, []byte{br.from}, []byte(br.to), -1))
if err != nil {
err = fmt.Errorf("byteReplacer: %w", err)
}
return len(b), err
}

var input string
var output string
var overwrite bool
var debug bool
var validate bool

func init() {
flag.StringVar(&input, "input", "", "input file")
flag.StringVar(&output, "output", "", "output file")
flag.BoolVar(&overwrite, "w", false, "overwrite input with output")
flag.BoolVar(&debug, "debug", false, "print debug info")
flag.BoolVar(&validate, "validate", false, "Validate the number of fields is the same on every row.")
}

const holder = '\x01'
Expand All @@ -57,44 +104,63 @@ func (e errUsage) Error() string { return string(e) }

var missingInput errUsage = "missing input"

func do() (err error) {
flag.Parse()
type byteWriteCounter struct {
out io.Writer
ctr uint64
}

if input == "" {
return missingInput
}
func (b *byteWriteCounter) Write(bt []byte) (n int, err error) {
n, err = b.out.Write(bt)

if overwrite {
output = input
}
b.ctr += uint64(n)

var bt []byte
if bt, err = ioutil.ReadFile(input); err != nil {
return
if err != nil {
err = fmt.Errorf("byteWriteCounter: %w", err)
}

var records [][]string
if records, err = csv.NewReader(bytes.NewReader(bt)).ReadAll(); err != nil {
return
}
return
}

var buf bytes.Buffer
var tabFlags uint = 0
type PrettyCSV struct {
rd io.ReadCloser
debug bool
}

func (p PrettyCSV) Read(b []byte) (n int, err error) {
panic("this is secretly not a reader! Please use WriteTo()")
}

func (p PrettyCSV) Close() (err error) {
return p.rd.Close()
}

func (p PrettyCSV) WriteTo(w io.Writer) (n int64, err error) {
var padChr byte = ' '
if debug {

var tabFlags uint = 0
if p.debug {
tabFlags |= tabwriter.Debug
padChr = '-'
}
tabWriter := tabwriter.NewWriter(&buf, 0, 1, 3, padChr, tabFlags)

var tabReplacer = byteReplacer{
// This is a pipeline of
// csv reader -> csv writer -> lineReplacer -> tabReplacer -> tabWriter -> lineWhitespaceTrimmer -> byteWriteCounter -> out

ctr := &byteWriteCounter{out: w}

trimmer := &lineWhitespaceTrimmer{
out: ctr,
}

tabWriter := tabwriter.NewWriter(trimmer, 0, 1, 3, padChr, tabFlags)

tabReplacer := byteReplacer{
out: tabWriter,
from: holder,
to: ",\t",
}

// tabwriter wants a \t at the end of each row too.
var lineReplacer = byteReplacer{
lineReplacer := byteReplacer{
out: tabReplacer,
from: '\n',
to: "\t\n",
Expand All @@ -103,49 +169,124 @@ func do() (err error) {
csvWriter := csv.NewWriter(lineReplacer)
csvWriter.Comma = holder

for _, row := range records {
csvRd := csv.NewReader(p.rd)

for {
var row []string
row, err = csvRd.Read()

if errors.Is(err, csv.ErrFieldCount) && !validate {
err = nil
}

if err != nil {
if errors.Is(err, io.EOF) {
break
}

err = fmt.Errorf("Reading CSV row: %w", err)
return
}

// strip any parsed spaces
for i, f := range row {
// strip any parsed spaces (which i'd like to be ignored)
row[i] = strings.TrimSpace(f)
}

if err = csvWriter.Write(row); err != nil {
err = fmt.Errorf("Writing CSV row: %w", err)
return
}
}

csvWriter.Flush()
if _, err = tabWriter.Write([]byte("\t")); err != nil {

if err = csvWriter.Error(); err != nil {
err = fmt.Errorf("CSV flush: %w", err)
return
}

if _, err = tabWriter.Write([]byte{'\t'}); err != nil {
err = fmt.Errorf("Writing final tab: %w", err)
return
}

if err = tabWriter.Flush(); err != nil {
err = fmt.Errorf("Flushing tabWriter: %w", err)
return
}

// trim extra space around each line
var trimmed bytes.Buffer
for _, line := range bytes.Split(buf.Bytes(), []byte("\n")) {
if _, err = trimmed.Write(append(bytes.TrimSpace(line), []byte("\n")...)); err != nil {
return
}
if err = trimmer.Flush(); err != nil {
err = fmt.Errorf("Flushing line trimmer: %w", err)
return
}

var outfile io.Writer = os.Stdout
return int64(ctr.ctr), nil
}

if output != "" && output != "-" {
var f *os.File
if f, err = os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700); err != nil {
return
}
func do() (err error) {
flag.Parse()

if input == "" {
return missingInput
}

if overwrite {
output = input
}

inputFile, err := os.Open(input)

defer inputFile.Close()

defer f.Close()
outfile = f
if err != nil {
err = fmt.Errorf("Opening input file: %w", err)
return
}

if _, err = io.Copy(outfile, &trimmed); err != nil {
tempOut, err := ioutil.TempFile("", "csvpretty")
defer os.Remove(tempOut.Name())

if err != nil {
err = fmt.Errorf("Creating temporary file: %w", err)
return
}

return
_, err = io.Copy(tempOut, PrettyCSV{rd: inputFile, debug: debug})
if err != nil {
err = fmt.Errorf("Copying pretty CSV to out: %w", err)
return
}

if err = inputFile.Close(); err != nil {
err = fmt.Errorf("Closing input file: %w", err)
return
}

outputFile, err := os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700)
defer outputFile.Close()

if err != nil {
err = fmt.Errorf("Opening output file: %w", err)
return
}

// return to beginning of the file
_, err = tempOut.Seek(0, 0)

if err != nil {
err = fmt.Errorf("Seeking to beginning of temp file: %w", err)
return
}

_, err = io.Copy(outputFile, tempOut)

if err != nil {
err = fmt.Errorf("Copying temp file to output: %w", err)
return
}

outputFile.Close()

return nil
}
1 change: 0 additions & 1 deletion go/cmd/csvpretty/test_out.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ Name, Subspecies, Species
West African Giraffe, G. c. peralta, G. camelopardalis
Meerkat, , s.Suricata
Dog, C. l. familiaris, C. lupus

0 comments on commit e768a74

Please sign in to comment.