-
Notifications
You must be signed in to change notification settings - Fork 80
/
Copy pathenex.go
234 lines (200 loc) · 5.2 KB
/
enex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
package enex
import (
"bytes"
"encoding/xml"
"errors"
"fmt"
"io"
"regexp"
"strings"
)
// Export is the root <en-export> element of an Evernote .enex file.
type Export struct {
	XMLName xml.Name `xml:"en-export"`
	// Date is the raw value of the export-date attribute.
	Date string `xml:"export-date,attr"`
	// Notes holds every <note> child in document order.
	Notes []Note `xml:"note"`
}

// Note is a single <note> element of the export.
type Note struct {
	XMLName xml.Name `xml:"note"`
	Title   string   `xml:"title"`
	// Content is the character data of <content> as unmarshalled; the
	// decoders in this package then replace it with the inner XML of the
	// document it contains (see decodeContent).
	Content    []byte         `xml:"content"`
	Updated    string         `xml:"updated"`
	Created    string         `xml:"created"`
	Tags       []string       `xml:"tag"`
	Attributes NoteAttributes `xml:"note-attributes"`
	Resources  []Resource     `xml:"resource"`
}

// NoteAttributes mirrors the <note-attributes> element. All values are kept
// as the raw strings found in the file.
type NoteAttributes struct {
	Source            string `xml:"source"`
	SourceApplication string `xml:"source-application"`
	Latitude          string `xml:"latitude"`
	Longitude         string `xml:"longitude"`
	Altitude          string `xml:"altitude"`
	Author            string `xml:"author"`
	SourceUrl         string `xml:"source-url"`
}

// Resource is a <resource> element attached to a note.
type Resource struct {
	// ID and Type carry no xml tag; decodeRecognition fills them from the
	// recognition data or, for ID, from a hash found in the source URL.
	ID     string
	Type   string
	Data   Data   `xml:"data"`
	Mime   string `xml:"mime"`
	Width  int    `xml:"width"`
	Height int    `xml:"height"`
	// Attributes mirrors the <resource-attributes> child.
	Attributes Attributes `xml:"resource-attributes"`
	// Recognition is the raw character data of <recognition>; it is parsed
	// separately as a Recognition document.
	Recognition []byte `xml:"recognition"`
}

// Attributes mirrors the <resource-attributes> element of a resource.
type Attributes struct {
	Timestamp string `xml:"timestamp"`
	Filename  string `xml:"file-name"`
	SourceUrl string `xml:"source-url"`
}

// Recognition is the <recoIndex> root of a resource's recognition data; only
// its objID and objType attributes are read.
type Recognition struct {
	XMLName xml.Name `xml:"recoIndex"`
	ObjID   string   `xml:"objID,attr"`
	ObjType string   `xml:"objType,attr"`
}

// Data is the <data> element of a resource.
type Data struct {
	XMLName xml.Name `xml:"data"`
	// Encoding is the value of the encoding attribute (the original file
	// comment describes the payload as base64).
	Encoding string `xml:"encoding,attr"`
	// Content is the inner XML of <data>, stored without decoding.
	Content []byte `xml:",innerxml"`
}

// Content captures the inner XML of the document embedded in a note's
// <content> element, whatever its root element is called.
type Content struct {
	Text []byte `xml:",innerxml"`
}
// hashRe matches a run of 32 lowercase hexadecimal characters delimited by
// word boundaries. decodeRecognition uses it to take a resource ID from the
// resource's source URL when no recognition data is present.
var hashRe = regexp.MustCompile(`\b[0-9a-f]{32}\b`)
// Decode reads a whole Evernote export from data and returns it with every
// note post-processed: the note content is replaced by the inner XML of the
// embedded document and each resource gets its ID/Type filled in.
//
// An error from the top-level XML decode does not stop post-processing of
// the notes that were parsed before the failure; it is returned alongside
// the export by the final return. A failure while post-processing a note
// returns a nil export and the error (content errors are wrapped with the
// note title).
func Decode(data io.Reader) (*Export, error) {
	var e Export
	err := NewDecoder(data).Decode(&e)

	for i := range e.Notes {
		note := &e.Notes[i]

		// EOF is a known case when the content is empty
		if cerr := decodeContent(note); cerr != nil && !errors.Is(cerr, io.EOF) {
			// The export is discarded on failure, so the slice is left alone
			// and the title read here belongs to the note that failed.
			return nil, fmt.Errorf("decoding note %s: %w", note.Title, cerr)
		}

		// Scoped separately so a successful call cannot overwrite the
		// top-level decode error held in err.
		if rerr := decodeRecognition(note); rerr != nil {
			return nil, rerr
		}
	}

	return &e, err
}
// Decoder is a thin wrapper around xml.Decoder that is always configured
// for non-strict parsing.
type Decoder struct {
	xml *xml.Decoder
}

// NewDecoder returns a Decoder reading from r with xml.Decoder.Strict
// switched off.
func NewDecoder(r io.Reader) *Decoder {
	dec := &Decoder{xml: xml.NewDecoder(r)}
	dec.xml.Strict = false
	return dec
}

// Decode unmarshals the next XML element from the underlying reader into v
// and returns the wrapped decoder's error unchanged.
func (d Decoder) Decode(v any) error {
	return d.xml.Decode(v)
}
// StreamDecoder reads the notes of an export one at a time through Next.
type StreamDecoder struct {
	xml *xml.Decoder
}

// NewStreamDecoder prepares a StreamDecoder positioned just after the
// opening <en-export> tag.
//
// The whole of r is read into memory first so that removeNestedCDATA can be
// applied to the text before it is parsed (non-strictly). If the input ends
// before an en-export start element is seen, the returned error wraps
// io.EOF; any other read or token error is returned unchanged.
func NewStreamDecoder(r io.Reader) (*StreamDecoder, error) {
	var raw bytes.Buffer
	if _, err := raw.ReadFrom(r); err != nil {
		return nil, err
	}

	dec := xml.NewDecoder(strings.NewReader(removeNestedCDATA(raw.String())))
	dec.Strict = false

	// Skip everything (prolog, doctype, whitespace) up to the root element.
	for {
		tok, err := dec.Token()
		switch {
		case errors.Is(err, io.EOF):
			return nil, fmt.Errorf("failed to initialise stream reader: no en-export data found: %w", err)
		case err != nil:
			return nil, err
		}

		if start, isStart := tok.(xml.StartElement); isStart && start.Name.Local == "en-export" {
			return &StreamDecoder{xml: dec}, nil
		}
	}
}
// Next decodes the next <note> element of the stream into n.
//
// Tokens are consumed until a "note" start element is found. An error from
// the token reader is returned unchanged, so io.EOF from the underlying
// xml.Decoder marks the end of the input. After the element is decoded, the
// note content is unwrapped and the resources get their ID/Type filled in,
// exactly as Decode does for each note.
//
// Pass a zero Note on every call: encoding/xml appends to slices that are
// already populated rather than replacing them.
func (d StreamDecoder) Next(n *Note) error {
	for {
		token, err := d.xml.Token()
		if err != nil {
			return err
		}

		element, ok := token.(xml.StartElement)
		if !ok || element.Name.Local != "note" {
			continue
		}

		if err := d.xml.DecodeElement(n, &element); err != nil {
			return err
		}

		// EOF is a known case when the content is empty. It is tolerated
		// without returning early so that the note's resources are still
		// processed, matching Decode.
		if err := decodeContent(n); err != nil && !errors.Is(err, io.EOF) {
			return err
		}

		return decodeRecognition(n)
	}
}
// decodeContent parses n.Content as an XML document and replaces it with the
// inner XML of that document's root element.
//
// The decoder error is returned unchanged and n.Content is left as it was;
// callers in this file treat io.EOF as "the content is empty".
func decodeContent(n *Note) error {
	var body Content
	if err := NewDecoder(bytes.NewReader(n.Content)).Decode(&body); err != nil {
		return err
	}

	n.Content = body.Text
	return nil
}
// decodeRecognition fills in the ID and Type of every resource of n.
//
// A resource with recognition data has that data parsed as a Recognition
// document, and its objID/objType become the resource ID/Type. A resource
// without recognition data gets its ID from the first hashRe match in its
// source URL, if there is one, and keeps its Type untouched.
//
// The first recognition document that fails to parse aborts the loop; the
// error is wrapped with the resource file name.
func decodeRecognition(n *Note) error {
	for idx := range n.Resources {
		res := &n.Resources[idx]

		if len(res.Recognition) != 0 {
			var reco Recognition
			if err := NewDecoder(bytes.NewReader(res.Recognition)).Decode(&reco); err != nil {
				return fmt.Errorf("decoding resource %s: %w", res.Attributes.Filename, err)
			}
			res.ID, res.Type = reco.ObjID, reco.ObjType
			continue
		}

		// No recognition data: fall back to a hash embedded in the URL.
		if hash := hashRe.FindString(res.Attributes.SourceUrl); hash != "" {
			res.ID = hash
		}
	}

	return nil
}
// reCDATA matches one CDATA section and captures its payload. Because "."
// does not match a newline, only sections that open and close on the same
// line can match.
var reCDATA = regexp.MustCompile(`<!\[CDATA\[(.*?)\]\]>`)

// removeNestedCDATA strips CDATA wrappers from the note content, keeping the
// wrapped text.
//
// Nested CDATA sections are not allowed by the XML specification, but
// Evernote emits them anyway, which causes "Unexpected EOF" errors during
// decoding. Every section matched by reCDATA is replaced by its payload, and
// the pass is repeated until the text stops changing, so wrappers exposed by
// an earlier pass are removed as well. A section spanning several lines is
// never matched and therefore stays in place.
func removeNestedCDATA(input string) string {
	current := input
	for {
		// "$1" expands to the captured payload of each match.
		stripped := reCDATA.ReplaceAllString(current, "$1")
		if stripped == current {
			return stripped
		}
		current = stripped
	}
}