diff --git a/pkg/message/parser.go b/pkg/message/parser.go index 77c148a3..cf747573 100644 --- a/pkg/message/parser.go +++ b/pkg/message/parser.go @@ -205,7 +205,7 @@ func joinChildParts(childParts []parser.Parts) parser.Parts { func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.Parts, error) { // If one of the parts has preferred content type, use that. for i := len(childParts) - 1; i >= 0; i-- { - if allHaveContentType(childParts[i], preferredContentType) { + if allPartsHaveContentType(childParts[i], preferredContentType) { return childParts[i], nil } } @@ -214,7 +214,7 @@ func bestChoice(childParts []parser.Parts, preferredContentType string) (parser. return childParts[len(childParts)-1], nil } -func allHaveContentType(parts parser.Parts, contentType string) bool { +func allPartsHaveContentType(parts parser.Parts, contentType string) bool { for _, part := range parts { t, _, err := part.Header.ContentType() if err != nil { @@ -272,23 +272,11 @@ func getPlainBody(part *parser.Part) []byte { } } -func writeMIMEMessage(p *parser.Parser) (mime string, err error) { - writer := p. - NewWriter(). - WithCondition(func(p *parser.Part) (keep bool) { - disp, _, err := p.Header.ContentDisposition() - if err != nil { - return true - } - - // TODO: Is it true that we don't want to write attachments? I thought this was for externals... - return disp != "attachment" - }) - +func writeMIMEMessage(p *parser.Parser) (string, error) { buf := new(bytes.Buffer) - if err = writer.Write(buf); err != nil { - return + if err := p.NewWriter().Write(buf); err != nil { + return "", err } return buf.String(), nil diff --git a/pkg/message/parser/parser.go b/pkg/message/parser/parser.go index 7b91c57c..b5a78af8 100644 --- a/pkg/message/parser/parser.go +++ b/pkg/message/parser/parser.go @@ -1,11 +1,11 @@ package parser import ( - "fmt" "io" "io/ioutil" "github.com/emersion/go-message" + "github.com/sirupsen/logrus" ) type Parser struct { @@ -13,14 +13,19 @@ type Parser struct { root *Part } -func New(r io.Reader) (p *Parser, err error) { - p = new(Parser) +func New(r io.Reader) (*Parser, error) { + p := new(Parser) - if err = p.parse(r); err != nil { - return + entity, err := message.Read(r) + if err != nil && !message.IsUnknownCharset(err) { + return nil, err } - return + if err := p.parseEntity(entity); err != nil { + return nil, err + } + + return p, nil } func (p *Parser) NewWalker() *Walker { @@ -51,32 +56,25 @@ func (p *Parser) Part(number []int) (part *Part, err error) { return } -func (p *Parser) parse(r io.Reader) error { - entity, err := message.Read(r) - if err != nil { - if !message.IsUnknownCharset(err) { - return err - } else { - fmt.Println(err) - } - } - - return p.parseEntity(entity) -} - -func (p *Parser) enter() { +func (p *Parser) beginPart() { p.stack = append(p.stack, &Part{}) } -func (p *Parser) exit() { - var built *Part +func (p *Parser) endPart() { + var part *Part - p.stack, built = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1] + p.stack, part = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1] if len(p.stack) > 0 { - p.top().children = append(p.top().children, built) + p.top().children = append(p.top().children, part) } else { - p.root = built + p.root = part + } + + if !part.isUTF8() { + if err := part.convertToUTF8(); err != nil { + logrus.WithError(err).Error("failed to convert part to utf-8") + } } } @@ -96,9 +94,9 @@ func (p *Parser) withBody(bytes []byte) { p.top().Body = bytes } -func (p *Parser) parseEntity(e *message.Entity) (err error) { - p.enter() - defer p.exit() +func (p *Parser) parseEntity(e *message.Entity) error { + p.beginPart() + defer p.endPart() p.withHeader(e.Header) diff --git a/pkg/message/parser/parser_test.go b/pkg/message/parser/parser_test.go index baa83753..94a907f1 100644 --- a/pkg/message/parser/parser_test.go +++ b/pkg/message/parser/parser_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -19,6 +20,30 @@ func newTestParser(t *testing.T, msg string) *Parser { return p } +func TestParserSpecifiedLatin1Charset(t *testing.T) { + p := newTestParser(t, "text_plain_latin1.eml") + + checkBodies(t, p, "ééééééé") +} + +func TestParserUnspecifiedLatin1Charset(t *testing.T) { + p := newTestParser(t, "text_plain_unknown_latin1.eml") + + checkBodies(t, p, "ééééééé") +} + +func TestParserSpecifiedLatin2Charset(t *testing.T) { + p := newTestParser(t, "text_plain_latin2.eml") + + checkBodies(t, p, "řšřšřš") +} + +func TestParserEmbeddedLatin2Charset(t *testing.T) { + p := newTestParser(t, "text_html_embedded_latin2_encoding.eml") + + checkBodies(t, p, `latin2 řšřš`) +} + func f(filename string) io.ReadCloser { f, err := os.Open(filepath.Join("testdata", filename)) @@ -37,3 +62,21 @@ func s(filename string) string { return string(b) } + +func checkBodies(t *testing.T, p *Parser, wantBodies ...string) { + var partBodies, expectedBodies [][]byte + + require.NoError(t, p.NewWalker().RegisterDefaultHandler(func(p *Part) (err error) { + if p.Body != nil { + partBodies = append(partBodies, p.Body) + } + + return + }).Walk()) + + for _, body := range wantBodies { + expectedBodies = append(expectedBodies, []byte(body)) + } + + assert.ElementsMatch(t, expectedBodies, partBodies) +} diff --git a/pkg/message/parser/part.go b/pkg/message/parser/part.go index e3ae8ff1..ae808806 100644 --- a/pkg/message/parser/part.go +++ b/pkg/message/parser/part.go @@ -2,8 +2,12 @@ package parser import ( "errors" + "unicode/utf8" + pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" "github.com/emersion/go-message" + "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" ) type Parts []*Part @@ -30,35 +34,32 @@ func (p *Part) AddChild(child *Part) { p.children = append(p.children, child) } -func (p *Part) write(writer *message.Writer, w *Writer) (err error) { - if len(p.children) > 0 { - for _, child := range p.children { - if err = child.writeAsChild(writer, w); err != nil { - return - } - } - } - - if _, err = writer.Write(p.Body); err != nil { - return - } - - return +func (p *Part) isUTF8() bool { + return utf8.Valid(p.Body) } -func (p *Part) writeAsChild(writer *message.Writer, w *Writer) (err error) { - if !w.shouldWrite(p) { - return - } - - childWriter, err := writer.CreatePart(p.Header) +// TODO: Do we then need to set charset to utf-8? What if it's embedded in html? +func (p *Part) convertToUTF8() error { + t, params, err := p.Header.ContentType() if err != nil { - return + return err } - if err = p.write(childWriter, w); err != nil { - return + var decoder *encoding.Decoder + + if knownCharset, ok := params["charset"]; !ok { + encoding, _, _ := charset.DetermineEncoding(p.Body, t) + decoder = encoding.NewDecoder() + } else if decoder, err = pmmime.SelectDecoder(knownCharset); err != nil { + return err } - return childWriter.Close() + if p.Body, err = decoder.Bytes(p.Body); err != nil { + return err + } + + params["charset"] = "utf-8" + p.Header.SetContentType(t, params) + + return nil } diff --git a/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml b/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml new file mode 100644 index 00000000..3bba7086 --- /dev/null +++ b/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml @@ -0,0 +1,5 @@ +From: Sender +To: Receiver +Content-Type: text/html + +latin2 ø¹ø¹ \ No newline at end of file diff --git a/pkg/message/parser/testdata/text_plain_latin1.eml b/pkg/message/parser/testdata/text_plain_latin1.eml new file mode 100644 index 00000000..dd8f88e0 --- /dev/null +++ b/pkg/message/parser/testdata/text_plain_latin1.eml @@ -0,0 +1,5 @@ +From: Sender +To: Receiver +Content-Type: text/plain; charset=ISO-8859-1 + +ééééééé \ No newline at end of file diff --git a/pkg/message/parser/testdata/text_plain_latin2.eml b/pkg/message/parser/testdata/text_plain_latin2.eml new file mode 100644 index 00000000..950991bd --- /dev/null +++ b/pkg/message/parser/testdata/text_plain_latin2.eml @@ -0,0 +1,5 @@ +From: Sender +To: Receiver +Content-Type: text/plain; charset=ISO-8859-2 + +ø¹ø¹ø¹ \ No newline at end of file diff --git a/pkg/message/parser/testdata/text_plain_unknown_latin1.eml b/pkg/message/parser/testdata/text_plain_unknown_latin1.eml new file mode 100644 index 00000000..2e1aa2cd --- /dev/null +++ b/pkg/message/parser/testdata/text_plain_unknown_latin1.eml @@ -0,0 +1,5 @@ +From: Sender +To: Receiver +Content-Type: text/plain + +ééééééé \ No newline at end of file diff --git a/pkg/message/parser/testdata/text_plain_unknown_latin2.eml b/pkg/message/parser/testdata/text_plain_unknown_latin2.eml new file mode 100644 index 00000000..549a48a9 --- /dev/null +++ b/pkg/message/parser/testdata/text_plain_unknown_latin2.eml @@ -0,0 +1,5 @@ +From: Sender +To: Receiver +Content-Type: text/plain + +ø¹ø¹ø¹ \ No newline at end of file diff --git a/pkg/message/parser/writer.go b/pkg/message/parser/writer.go index f4ba3463..0dc70ee1 100644 --- a/pkg/message/parser/writer.go +++ b/pkg/message/parser/writer.go @@ -19,19 +19,27 @@ func newWriter(root *Part) *Writer { } } +// WithCondition allows setting a condition when parts should be written. +// Parts are passed to each condition set and if any condition returns false, +// the part is not written. +// This initially seemed like a good idea but is now kinda useless. func (w *Writer) WithCondition(cond Condition) *Writer { w.cond = append(w.cond, cond) return w } -func (w *Writer) Write(ww io.Writer) (err error) { - msgWriter, err := message.CreateWriter(ww, w.root.Header) - if err != nil { - return +func (w *Writer) Write(ww io.Writer) error { + if w.shouldFilter(w.root) { + w.root.Header.Add("Content-Transfer-Encoding", "base64") } - if err = w.root.write(msgWriter, w); err != nil { - return + msgWriter, err := message.CreateWriter(ww, w.root.Header) + if err != nil { + return err + } + + if err := w.write(msgWriter, w.root); err != nil { + return err } return msgWriter.Close() @@ -46,3 +54,56 @@ func (w *Writer) shouldWrite(p *Part) bool { return true } + +func (w *Writer) shouldFilter(p *Part) bool { + encoding := p.Header.Get("Content-Transfer-Encoding") + + if encoding != "" && encoding == "quoted-printable" || encoding == "base64" { + return false + } + + for _, b := range p.Body { + if uint8(b) > 1<<7 { + return true + } + } + + return false +} + +func (w *Writer) write(writer *message.Writer, p *Part) error { + if len(p.children) > 0 { + for _, child := range p.children { + if err := w.writeAsChild(writer, child); err != nil { + return err + } + } + } + + if _, err := writer.Write(p.Body); err != nil { + return err + } + + return nil +} + +func (w *Writer) writeAsChild(writer *message.Writer, p *Part) error { + if !w.shouldWrite(p) { + return nil + } + + if w.shouldFilter(p) { + p.Header.Add("Content-Transfer-Encoding", "base64") + } + + childWriter, err := writer.CreatePart(p.Header) + if err != nil { + return err + } + + if err := w.write(childWriter, p); err != nil { + return err + } + + return childWriter.Close() +} diff --git a/pkg/mime/encoding.go b/pkg/mime/encoding.go index 2afc2bdd..ad522465 100644 --- a/pkg/mime/encoding.go +++ b/pkg/mime/encoding.go @@ -37,7 +37,7 @@ import ( var wordDec = &mime.WordDecoder{ CharsetReader: func(charset string, input io.Reader) (io.Reader, error) { - dec, err := selectDecoder(charset) + dec, err := SelectDecoder(charset) if err != nil { return nil, err } @@ -166,7 +166,7 @@ func getEncoding(charset string) (enc encoding.Encoding, err error) { return } -func selectDecoder(charset string) (decoder *encoding.Decoder, err error) { +func SelectDecoder(charset string) (decoder *encoding.Decoder, err error) { var enc encoding.Encoding lcharset := strings.Trim(strings.ToLower(charset), " \t\r\n") switch lcharset { @@ -211,7 +211,7 @@ func DecodeCharset(original []byte, contentType string) ([]byte, error) { } if charset, ok := params["charset"]; ok { - decoder, err := selectDecoder(charset) + decoder, err := SelectDecoder(charset) if err != nil { return original, errors.Wrap(err, "unknown charset was specified") }