forked from Silverfish/proton-bridge
feat: handle foreign encodings
This commit is contained in:
@ -205,7 +205,7 @@ func joinChildParts(childParts []parser.Parts) parser.Parts {
|
|||||||
func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.Parts, error) {
|
func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.Parts, error) {
|
||||||
// If one of the parts has preferred content type, use that.
|
// If one of the parts has preferred content type, use that.
|
||||||
for i := len(childParts) - 1; i >= 0; i-- {
|
for i := len(childParts) - 1; i >= 0; i-- {
|
||||||
if allHaveContentType(childParts[i], preferredContentType) {
|
if allPartsHaveContentType(childParts[i], preferredContentType) {
|
||||||
return childParts[i], nil
|
return childParts[i], nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -214,7 +214,7 @@ func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.
|
|||||||
return childParts[len(childParts)-1], nil
|
return childParts[len(childParts)-1], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func allHaveContentType(parts parser.Parts, contentType string) bool {
|
func allPartsHaveContentType(parts parser.Parts, contentType string) bool {
|
||||||
for _, part := range parts {
|
for _, part := range parts {
|
||||||
t, _, err := part.Header.ContentType()
|
t, _, err := part.Header.ContentType()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -272,23 +272,11 @@ func getPlainBody(part *parser.Part) []byte {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeMIMEMessage(p *parser.Parser) (mime string, err error) {
|
func writeMIMEMessage(p *parser.Parser) (string, error) {
|
||||||
writer := p.
|
|
||||||
NewWriter().
|
|
||||||
WithCondition(func(p *parser.Part) (keep bool) {
|
|
||||||
disp, _, err := p.Header.ContentDisposition()
|
|
||||||
if err != nil {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: Is it true that we don't want to write attachments? I thought this was for externals...
|
|
||||||
return disp != "attachment"
|
|
||||||
})
|
|
||||||
|
|
||||||
buf := new(bytes.Buffer)
|
buf := new(bytes.Buffer)
|
||||||
|
|
||||||
if err = writer.Write(buf); err != nil {
|
if err := p.NewWriter().Write(buf); err != nil {
|
||||||
return
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return buf.String(), nil
|
return buf.String(), nil
|
||||||
|
|||||||
@ -1,11 +1,11 @@
|
|||||||
package parser
|
package parser
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
|
||||||
"github.com/emersion/go-message"
|
"github.com/emersion/go-message"
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Parser struct {
|
type Parser struct {
|
||||||
@ -13,14 +13,19 @@ type Parser struct {
|
|||||||
root *Part
|
root *Part
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(r io.Reader) (p *Parser, err error) {
|
func New(r io.Reader) (*Parser, error) {
|
||||||
p = new(Parser)
|
p := new(Parser)
|
||||||
|
|
||||||
if err = p.parse(r); err != nil {
|
entity, err := message.Read(r)
|
||||||
return
|
if err != nil && !message.IsUnknownCharset(err) {
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
if err := p.parseEntity(entity); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return p, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Parser) NewWalker() *Walker {
|
func (p *Parser) NewWalker() *Walker {
|
||||||
@ -51,32 +56,25 @@ func (p *Parser) Part(number []int) (part *Part, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Parser) parse(r io.Reader) error {
|
func (p *Parser) beginPart() {
|
||||||
entity, err := message.Read(r)
|
|
||||||
if err != nil {
|
|
||||||
if !message.IsUnknownCharset(err) {
|
|
||||||
return err
|
|
||||||
} else {
|
|
||||||
fmt.Println(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return p.parseEntity(entity)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *Parser) enter() {
|
|
||||||
p.stack = append(p.stack, &Part{})
|
p.stack = append(p.stack, &Part{})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Parser) exit() {
|
func (p *Parser) endPart() {
|
||||||
var built *Part
|
var part *Part
|
||||||
|
|
||||||
p.stack, built = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1]
|
p.stack, part = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1]
|
||||||
|
|
||||||
if len(p.stack) > 0 {
|
if len(p.stack) > 0 {
|
||||||
p.top().children = append(p.top().children, built)
|
p.top().children = append(p.top().children, part)
|
||||||
} else {
|
} else {
|
||||||
p.root = built
|
p.root = part
|
||||||
|
}
|
||||||
|
|
||||||
|
if !part.isUTF8() {
|
||||||
|
if err := part.convertToUTF8(); err != nil {
|
||||||
|
logrus.WithError(err).Error("failed to convert part to utf-8")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -96,9 +94,9 @@ func (p *Parser) withBody(bytes []byte) {
|
|||||||
p.top().Body = bytes
|
p.top().Body = bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Parser) parseEntity(e *message.Entity) (err error) {
|
func (p *Parser) parseEntity(e *message.Entity) error {
|
||||||
p.enter()
|
p.beginPart()
|
||||||
defer p.exit()
|
defer p.endPart()
|
||||||
|
|
||||||
p.withHeader(e.Header)
|
p.withHeader(e.Header)
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -19,6 +20,30 @@ func newTestParser(t *testing.T, msg string) *Parser {
|
|||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParserSpecifiedLatin1Charset(t *testing.T) {
|
||||||
|
p := newTestParser(t, "text_plain_latin1.eml")
|
||||||
|
|
||||||
|
checkBodies(t, p, "ééééééé")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParserUnspecifiedLatin1Charset(t *testing.T) {
|
||||||
|
p := newTestParser(t, "text_plain_unknown_latin1.eml")
|
||||||
|
|
||||||
|
checkBodies(t, p, "ééééééé")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParserSpecifiedLatin2Charset(t *testing.T) {
|
||||||
|
p := newTestParser(t, "text_plain_latin2.eml")
|
||||||
|
|
||||||
|
checkBodies(t, p, "řšřšřš")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParserEmbeddedLatin2Charset(t *testing.T) {
|
||||||
|
p := newTestParser(t, "text_html_embedded_latin2_encoding.eml")
|
||||||
|
|
||||||
|
checkBodies(t, p, `<html><head><meta charset="ISO-8859-2"></head><body>latin2 řšřš</body></html>`)
|
||||||
|
}
|
||||||
|
|
||||||
func f(filename string) io.ReadCloser {
|
func f(filename string) io.ReadCloser {
|
||||||
f, err := os.Open(filepath.Join("testdata", filename))
|
f, err := os.Open(filepath.Join("testdata", filename))
|
||||||
|
|
||||||
@ -37,3 +62,21 @@ func s(filename string) string {
|
|||||||
|
|
||||||
return string(b)
|
return string(b)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func checkBodies(t *testing.T, p *Parser, wantBodies ...string) {
|
||||||
|
var partBodies, expectedBodies [][]byte
|
||||||
|
|
||||||
|
require.NoError(t, p.NewWalker().RegisterDefaultHandler(func(p *Part) (err error) {
|
||||||
|
if p.Body != nil {
|
||||||
|
partBodies = append(partBodies, p.Body)
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}).Walk())
|
||||||
|
|
||||||
|
for _, body := range wantBodies {
|
||||||
|
expectedBodies = append(expectedBodies, []byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
assert.ElementsMatch(t, expectedBodies, partBodies)
|
||||||
|
}
|
||||||
|
|||||||
@ -2,8 +2,12 @@ package parser
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
pmmime "github.com/ProtonMail/proton-bridge/pkg/mime"
|
||||||
"github.com/emersion/go-message"
|
"github.com/emersion/go-message"
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
|
"golang.org/x/text/encoding"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Parts []*Part
|
type Parts []*Part
|
||||||
@ -30,35 +34,32 @@ func (p *Part) AddChild(child *Part) {
|
|||||||
p.children = append(p.children, child)
|
p.children = append(p.children, child)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Part) write(writer *message.Writer, w *Writer) (err error) {
|
func (p *Part) isUTF8() bool {
|
||||||
if len(p.children) > 0 {
|
return utf8.Valid(p.Body)
|
||||||
for _, child := range p.children {
|
|
||||||
if err = child.writeAsChild(writer, w); err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err = writer.Write(p.Body); err != nil {
|
// TODO: Do we then need to set charset to utf-8? What if it's embedded in html?
|
||||||
return
|
func (p *Part) convertToUTF8() error {
|
||||||
}
|
t, params, err := p.Header.ContentType()
|
||||||
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *Part) writeAsChild(writer *message.Writer, w *Writer) (err error) {
|
|
||||||
if !w.shouldWrite(p) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
childWriter, err := writer.CreatePart(p.Header)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err = p.write(childWriter, w); err != nil {
|
var decoder *encoding.Decoder
|
||||||
return
|
|
||||||
|
if knownCharset, ok := params["charset"]; !ok {
|
||||||
|
encoding, _, _ := charset.DetermineEncoding(p.Body, t)
|
||||||
|
decoder = encoding.NewDecoder()
|
||||||
|
} else if decoder, err = pmmime.SelectDecoder(knownCharset); err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
return childWriter.Close()
|
if p.Body, err = decoder.Bytes(p.Body); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
params["charset"] = "utf-8"
|
||||||
|
p.Header.SetContentType(t, params)
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
5
pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml
vendored
Normal file
5
pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
From: Sender <sender@pm.me>
|
||||||
|
To: Receiver <receiver@pm.me>
|
||||||
|
Content-Type: text/html
|
||||||
|
|
||||||
|
<html><head><meta charset="ISO-8859-2"></head><body>latin2 <20><><EFBFBD><EFBFBD></body></html>
|
||||||
5
pkg/message/parser/testdata/text_plain_latin1.eml
vendored
Normal file
5
pkg/message/parser/testdata/text_plain_latin1.eml
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
From: Sender <sender@pm.me>
|
||||||
|
To: Receiver <receiver@pm.me>
|
||||||
|
Content-Type: text/plain; charset=ISO-8859-1
|
||||||
|
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||||
5
pkg/message/parser/testdata/text_plain_latin2.eml
vendored
Normal file
5
pkg/message/parser/testdata/text_plain_latin2.eml
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
From: Sender <sender@pm.me>
|
||||||
|
To: Receiver <receiver@pm.me>
|
||||||
|
Content-Type: text/plain; charset=ISO-8859-2
|
||||||
|
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||||
5
pkg/message/parser/testdata/text_plain_unknown_latin1.eml
vendored
Normal file
5
pkg/message/parser/testdata/text_plain_unknown_latin1.eml
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
From: Sender <sender@pm.me>
|
||||||
|
To: Receiver <receiver@pm.me>
|
||||||
|
Content-Type: text/plain
|
||||||
|
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||||
5
pkg/message/parser/testdata/text_plain_unknown_latin2.eml
vendored
Normal file
5
pkg/message/parser/testdata/text_plain_unknown_latin2.eml
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
From: Sender <sender@pm.me>
|
||||||
|
To: Receiver <receiver@pm.me>
|
||||||
|
Content-Type: text/plain
|
||||||
|
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||||
@ -19,19 +19,27 @@ func newWriter(root *Part) *Writer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithCondition allows setting a condition when parts should be written.
|
||||||
|
// Parts are passed to each condition set and if any condition returns false,
|
||||||
|
// the part is not written.
|
||||||
|
// This initially seemed like a good idea but is no longer of much use.
|
||||||
func (w *Writer) WithCondition(cond Condition) *Writer {
|
func (w *Writer) WithCondition(cond Condition) *Writer {
|
||||||
w.cond = append(w.cond, cond)
|
w.cond = append(w.cond, cond)
|
||||||
return w
|
return w
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *Writer) Write(ww io.Writer) (err error) {
|
func (w *Writer) Write(ww io.Writer) error {
|
||||||
msgWriter, err := message.CreateWriter(ww, w.root.Header)
|
if w.shouldFilter(w.root) {
|
||||||
if err != nil {
|
w.root.Header.Add("Content-Transfer-Encoding", "base64")
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err = w.root.write(msgWriter, w); err != nil {
|
msgWriter, err := message.CreateWriter(ww, w.root.Header)
|
||||||
return
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := w.write(msgWriter, w.root); err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
return msgWriter.Close()
|
return msgWriter.Close()
|
||||||
@ -46,3 +54,56 @@ func (w *Writer) shouldWrite(p *Part) bool {
|
|||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (w *Writer) shouldFilter(p *Part) bool {
|
||||||
|
encoding := p.Header.Get("Content-Transfer-Encoding")
|
||||||
|
|
||||||
|
if encoding != "" && encoding == "quoted-printable" || encoding == "base64" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, b := range p.Body {
|
||||||
|
if uint8(b) > 1<<7 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Writer) write(writer *message.Writer, p *Part) error {
|
||||||
|
if len(p.children) > 0 {
|
||||||
|
for _, child := range p.children {
|
||||||
|
if err := w.writeAsChild(writer, child); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := writer.Write(p.Body); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Writer) writeAsChild(writer *message.Writer, p *Part) error {
|
||||||
|
if !w.shouldWrite(p) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if w.shouldFilter(p) {
|
||||||
|
p.Header.Add("Content-Transfer-Encoding", "base64")
|
||||||
|
}
|
||||||
|
|
||||||
|
childWriter, err := writer.CreatePart(p.Header)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := w.write(childWriter, p); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return childWriter.Close()
|
||||||
|
}
|
||||||
|
|||||||
@ -37,7 +37,7 @@ import (
|
|||||||
|
|
||||||
var wordDec = &mime.WordDecoder{
|
var wordDec = &mime.WordDecoder{
|
||||||
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
|
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
|
||||||
dec, err := selectDecoder(charset)
|
dec, err := SelectDecoder(charset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -166,7 +166,7 @@ func getEncoding(charset string) (enc encoding.Encoding, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func selectDecoder(charset string) (decoder *encoding.Decoder, err error) {
|
func SelectDecoder(charset string) (decoder *encoding.Decoder, err error) {
|
||||||
var enc encoding.Encoding
|
var enc encoding.Encoding
|
||||||
lcharset := strings.Trim(strings.ToLower(charset), " \t\r\n")
|
lcharset := strings.Trim(strings.ToLower(charset), " \t\r\n")
|
||||||
switch lcharset {
|
switch lcharset {
|
||||||
@ -211,7 +211,7 @@ func DecodeCharset(original []byte, contentType string) ([]byte, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if charset, ok := params["charset"]; ok {
|
if charset, ok := params["charset"]; ok {
|
||||||
decoder, err := selectDecoder(charset)
|
decoder, err := SelectDecoder(charset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return original, errors.Wrap(err, "unknown charset was specified")
|
return original, errors.Wrap(err, "unknown charset was specified")
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user