utfutil/utfutil.go at main · TomOnTime/utfutil · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
// Package utfutil provides methods that make it easy to read data in a UTF-encoding-agnostic way.
package utfutil

// These functions autodetect UTF BOM and return UTF-8. If no
// BOM is found, a hint is provided as to which encoding to assume.
// You can use them as replacements for os.Open() and ioutil.ReadFile()
// when the encoding of the file is unknown.

// utfutil.OpenFile() is a replacement for os.Open().
// utfutil.ReadFile() is a replacement for ioutil.ReadFile().
// utfutil.NewScanner() takes a filename and returns a Scanner.
// utfutil.NewReader() wraps an existing io.Reader to make it UTF-encoding agnostic.
// utfutil.BytesReader() takes a []byte and decodes it to UTF-8.

// When there is no BOM, it is impossible to guess correctly 100%
// of the time.  Therefore, the functions take a 2nd parameter of type
// "EncodingHint" where you specify the default encoding for BOM-less
// data.

// In the future we'd like to have a hint called AUTO that uses
// uchardet (or a Go rewrite) to guess.

// Inspiration: I wrote this after spending half a day trying
// to figure out how to use unicode.BOMOverride.
// Hopefully this will save other golang newbies from the same.
// (golang.org/x/text/encoding/unicode)

import (
	"bufio"
	"bytes"
	"io"
	"io/ioutil"
	"os"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// EncodingHint names the encoding to assume for data that carries no BOM.
type EncodingHint int

// Concrete encodings. The values are spelled out so that they stay fixed
// if the list is ever reordered.
const (
	// UTF8 selects UTF-8 as the assumed encoding.
	UTF8 EncodingHint = 0
	// UTF16LE selects little-endian UTF-16 as the assumed encoding.
	UTF16LE EncodingHint = 1
	// UTF16BE selects big-endian UTF-16 as the assumed encoding.
	UTF16BE EncodingHint = 2
)

// Aliases that describe where the data came from rather than how it is
// encoded. Each one is equal to one of the concrete hints above.
const (
	// WINDOWS is for files that came from an MS-Windows system (UTF16LE).
	WINDOWS EncodingHint = UTF16LE
	// POSIX is for files that came from Unix or Unix-like systems (UTF8).
	POSIX EncodingHint = UTF8
	// HTML5 is for files that came from the web (UTF8).
	HTML5 EncodingHint = UTF8
)

// UTFReadCloser is the read-and-close interface returned by this package's
// reader constructors. Its method set is the same as io.ReadCloser's.
type UTFReadCloser interface {
	// Close releases whatever backs the reader.
	Close() error
	// Read copies up to len(p) bytes into p and reports how many were copied.
	Read(p []byte) (int, error)
}

// readCloser is this package's UTFReadCloser implementation. It pairs the
// reader that callers consume with an optional file that Close releases.
type readCloser struct {
	file   *os.File  // closed by Close; nil when there is nothing to close
	reader io.Reader // source of the bytes handed out by Read
}

// Read satisfies io.Reader by delegating to the wrapped reader.
func (rc readCloser) Read(p []byte) (n int, err error) {
	n, err = rc.reader.Read(p)
	return n, err
}

// Close satisfies io.Closer. It closes the attached file when there is one
// and otherwise reports success without doing anything.
func (rc readCloser) Close() error {
	if rc.file == nil {
		return nil
	}
	return rc.file.Close()
}

// UTFScanCloser is the scanner counterpart of UTFReadCloser: the
// bufio.Scanner-style methods this package exposes, plus Close.
type UTFScanCloser interface {
	// Configuration.
	Buffer(buf []byte, max int)
	Split(split bufio.SplitFunc)

	// Iteration and access to the current token.
	Scan() bool
	Bytes() []byte
	Text() string
	Err() error

	// Cleanup.
	Close() error
}

// scanCloser is this package's UTFScanCloser implementation. It holds a
// bufio.Scanner together with a UTFReadCloser that Close releases.
type scanCloser struct {
	file    UTFReadCloser
	scanner *bufio.Scanner
}

// Buffer forwards to the wrapped bufio.Scanner's Buffer method.
func (s scanCloser) Buffer(buf []byte, max int) {
	s.scanner.Buffer(buf, max)
}

// Bytes forwards to the wrapped bufio.Scanner's Bytes method.
func (s scanCloser) Bytes() []byte {
	token := s.scanner.Bytes()
	return token
}

// Err forwards to the wrapped bufio.Scanner's Err method.
func (s scanCloser) Err() error {
	err := s.scanner.Err()
	return err
}

// Scan forwards to the wrapped bufio.Scanner's Scan method.
func (s scanCloser) Scan() bool {
	more := s.scanner.Scan()
	return more
}

// Split forwards to the wrapped bufio.Scanner's Split method.
func (s scanCloser) Split(split bufio.SplitFunc) {
	s.scanner.Split(split)
}

// Text forwards to the wrapped bufio.Scanner's Text method.
func (s scanCloser) Text() string {
	token := s.scanner.Text()
	return token
}

// Close closes the reader held in the file field and returns its error.
func (s scanCloser) Close() error {
	err := s.file.Close()
	return err
}

// About utfutil.HTML5:
// This technique is recommended by the W3C for use in HTML 5:
// "For compatibility with deployed content, the byte order
// mark (also known as BOM) is considered more authoritative
// than anything else." http://www.w3.org/TR/encoding/#specification-hooks

// OpenFile is the equivalent of os.Open(): it opens the named file for
// reading and returns it wrapped by NewReader with hint d. Closing the
// returned value closes the file. An error from os.Open is returned as-is.
func OpenFile(name string, d EncodingHint) (UTFReadCloser, error) {
	handle, err := os.Open(name)
	if err != nil {
		return nil, err
	}

	// Hand NewReader a readCloser that already owns the file so that the
	// value it returns can close it.
	return NewReader(readCloser{file: handle}, d), nil
}

// ReadFile is the equivalent of ioutil.ReadFile(): it opens the named file
// through OpenFile with hint d, reads it to the end, and returns the bytes.
// The file is closed before ReadFile returns.
func ReadFile(name string, d EncodingHint) ([]byte, error) {
	src, err := OpenFile(name, d)
	if err != nil {
		return nil, err
	}
	defer src.Close()

	contents, err := ioutil.ReadAll(src)
	return contents, err
}

// NewScanner is a convenience function that opens the named file through
// OpenFile with hint d and returns a scanner over it. Closing the returned
// value closes the file. An error from OpenFile is returned as-is.
func NewScanner(name string, d EncodingHint) (UTFScanCloser, error) {
	src, err := OpenFile(name, d)
	if err != nil {
		return nil, err
	}

	sc := scanCloser{
		file:    src,
		scanner: bufio.NewScanner(src),
	}
	return sc, nil
}

// NewReader wraps a Reader to decode Unicode to UTF-8 as it reads.
//
// A BOM at the start of the data selects the decoder. When there is no BOM,
// the encoding named by d is assumed. A hint value that is not one of UTF8,
// UTF16LE or UTF16BE is treated as UTF8.
//
// If r is a file-backed readCloser (which is what OpenFile passes in), the
// returned value decodes straight from that file and its Close closes the
// file. For any other reader, Close on the returned value does nothing.
func NewReader(r io.Reader, d EncodingHint) UTFReadCloser {
	// Choose the decoder to use when the input has no BOM.
	var fallback *encoding.Decoder
	switch d {
	case UTF16LE:
		fallback = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
	case UTF16BE:
		fallback = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	default:
		// UTF8 and any unrecognized hint. Leaving the decoder nil would make
		// the transform reader call methods on a nil pointer and panic.
		fallback = unicode.UTF8.NewDecoder()
	}

	// BOMOverride follows the BOM when one is present and uses fallback
	// otherwise.
	bomAware := unicode.BOMOverride(fallback)

	// Reuse the file of a readCloser so that Close keeps working. A
	// readCloser with no file (for example one returned earlier by NewReader
	// for a plain io.Reader) is wrapped like any other reader, because
	// reading from its nil *os.File would fail on every call.
	if rc, ok := r.(readCloser); ok && rc.file != nil {
		rc.reader = transform.NewReader(rc.file, bomAware)
		return rc
	}

	return readCloser{
		reader: transform.NewReader(r, bomAware),
	}
}

// BytesReader is a convenience function that wraps the bytes in b with
// NewReader, using d as the hint, and returns the result as an io.Reader.
func BytesReader(b []byte, d EncodingHint) io.Reader {
	src := bytes.NewReader(b)
	decoded := NewReader(src, d)
	return decoded
}

// UTFWriteCloser is the write-and-close interface returned by NewWriter.
// Its method set is the same as io.WriteCloser's.
type UTFWriteCloser interface {
	// Close releases whatever backs the writer.
	Close() error
	// Write consumes p and reports how many bytes of it were accepted.
	Write(p []byte) (int, error)
}

// writeCloser is this package's UTFWriteCloser implementation. It pairs the
// writer that callers write to with an optional file that Close releases.
type writeCloser struct {
	file   *os.File  // closed by Close; nil when there is nothing to close
	writer io.Writer // destination of Write; NewWriter stores a transform writer here
}

// Write satisfies io.Writer by delegating to the wrapped writer.
func (u writeCloser) Write(p []byte) (n int, err error) {
	return u.writer.Write(p)
}

// Close satisfies io.Closer.
//
// If the wrapped writer is itself an io.Closer it is closed first. The
// transform writer installed by NewWriter can hold back input it has not yet
// converted, and closing it is what flushes that remainder; skipping this
// step silently drops the tail of the output. The file, when present, is
// closed afterwards even if the flush failed.
//
// The first error encountered is returned.
func (u writeCloser) Close() error {
	var err error
	if c, ok := u.writer.(io.Closer); ok {
		err = c.Close()
	}
	if u.file != nil {
		if cerr := u.file.Close(); err == nil {
			err = cerr
		}
	}
	return err
}

// NewWriter wraps a Writer so that UTF-8 written to the returned value is
// encoded as the encoding named by d before it reaches r. A hint value that
// is not one of UTF8, UTF16LE or UTF16BE is treated as UTF8.
//
// If r is a file-backed writeCloser, the returned value encodes straight into
// that file and its Close closes the file.
func NewWriter(r io.Writer, d EncodingHint) UTFWriteCloser {
	// Choose the encoder for the requested output encoding.
	var encoder *encoding.Encoder
	switch d {
	case UTF16LE:
		// NOTE(review): ExpectBOM is presumably chosen so that the encoder
		// emits a BOM at the start of the output; confirm against the
		// golang.org/x/text/encoding/unicode documentation.
		encoder = unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM).NewEncoder()
	case UTF16BE:
		encoder = unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM).NewEncoder()
	default:
		// UTF8 and any unrecognized hint. Leaving the encoder nil would make
		// the transform writer call methods on a nil pointer and panic.
		encoder = unicode.UTF8.NewEncoder()
	}

	// The encoder is used directly. unicode.BOMOverride is a decoding helper:
	// it inspects the incoming bytes for a BOM and, on finding one, replaces
	// the transformer with a decoder. The bytes written here are the UTF-8
	// to be encoded, so applying it would bypass the encoder for input that
	// happens to start with BOM bytes and would hold back writes shorter
	// than three bytes.

	// Reuse the file of a writeCloser so that Close keeps working. A
	// writeCloser with no file is wrapped like any other writer, because
	// writing to its nil *os.File would fail on every call.
	if wc, ok := r.(writeCloser); ok && wc.file != nil {
		wc.writer = transform.NewWriter(wc.file, encoder)
		return wc
	}

	return writeCloser{
		writer: transform.NewWriter(r, encoder),
	}
}

// BytesWriter is a convenience function that wraps the buffer b with
// NewWriter, using d as the hint, and returns the result as an io.Writer.
func BytesWriter(b *bytes.Buffer, d EncodingHint) io.Writer {
	encoded := NewWriter(b, d)
	return encoded
}