feat: [GODT-360] detect charset embedded in html and xml

2020-05-28 12:36:42 +02:00
parent 84d344cb0a
commit 9e633400b0
8 changed files with 65 additions and 48 deletions
--- a/pkg/mime/encoding_test.go
+++ b/pkg/mime/encoding_test.go
@ -330,81 +330,81 @@ func TestGetEncoding(t *testing.T) {
 func TestEncodeReader(t *testing.T) {
 	// define test data
 	testData := []struct {
-		params   map[string]string
+		charset  string
 		original []byte
 		message  string
 	}{
 		// russian
 		{
-			map[string]string{"charset": "koi8-r"},
+			"koi8-r",
 			//     а, з, б, у, к, а, а, б, в, г, д, е, ё
 			[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
 			"азбукаабвгдеё",
 		},
 		{
-			map[string]string{"charset": "KOI8-R"},
+			"KOI8-R",
 			[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
 			"азбукаабвгдеё",
 		},
 		{
-			map[string]string{"charset": "csKOI8R"},
+			"csKOI8R",
 			[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
 			"азбукаабвгдеё",
 		},
 		{
-			map[string]string{"charset": "koi8-u"},
+			"koi8-u",
 			[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
 			"азбукаабвгдеё",
 		},
 		{
-			map[string]string{"charset": "iso-8859-5"},
+			"iso-8859-5",
 			//     а    , з    , б    , у    , к    , а    , а    , б    , в    , г    , д    , е    , ё
 			[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xF1},
 			"азбукаабвгдеё",
 		},
 		{
-			map[string]string{"charset": "csWrong"},
+			"csWrong",
 			[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6},
 			"",
 		},
 		{
-			map[string]string{"charset": "utf8"},
+			"utf8",
 			[]byte{0xD0, 0xB0, 0xD0, 0xB7, 0xD0, 0xB1, 0xD1, 0x83, 0xD0, 0xBA, 0xD0, 0xB0, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x91},
 			"азбукаабвгдеё",
 		},
 		// czechoslovakia
 		{
-			map[string]string{"charset": "windows-1250"},
+			"windows-1250",
 			[]byte{225, 228, 232, 233, 236, 244},
 			"áäčéěô",
 		},
 		// umlauts
 		{
-			map[string]string{"charset": "iso-8859-1"},
+			"iso-8859-1",
 			[]byte{196, 203, 214, 220, 228, 235, 246, 252},
 			"ÄËÖÜäëöü",
 		},
 		// latvia
 		{
-			map[string]string{"charset": "iso-8859-4"},
+			"iso-8859-4",
 			[]byte{224, 239, 243, 182, 254},
 			"āīķļū",
 		},
 		{ // encoded by https://www.motobit.com/util/charset-codepage-conversion.asp
-			map[string]string{"charset": "utf7"},
+			"utf7",
 			[]byte("He wes Leovena+APA-es sone -- li+APA-e him be Drihten.+A6QDtw- +A7MDuwPOA8MDwwOx- +A7wDvwPF- +A60DtAPJA8MDsQO9- +A7UDuwO7A7cDvQO5A7oDrg-. +BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+C68LvguuC7ELvwuoC80LpA- +C64Lygu0C78LlQuzC78LsgvH- +C6QLrgu/C7QLzQuuC8oLtAu/- +C6oLywuyC80- +C4cLqQu/C6QLvgu1C6QLwQ- +C44LmQvNC5ULwQuuC80- +C5ULvgujC8sLrgvN-."),
 			"He wes Leovenaðes sone -- liðe him be Drihten.Τη γλώσσα μου έδωσαν ελληνική. Чернели избы здесь и там,Чернели избы здесь и там,யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்.",
 		},

 		// iconv -f UTF8 -t GB2312 utf8.txt | hexdump -v -e '"0x" 1/1 "%x, "'
 		{ // encoded by iconv; dump by `cat gb2312.txt | hexdump -v -e '"0x" 1/1 "%x "'` and reformat; text from https://zh.wikipedia.org/wiki/GB_2312
-			map[string]string{"charset": "GB2312"},
+			"GB2312",
 			[]byte{0x47, 0x42, 0x20, 0x32, 0x33, 0x31, 0x32, 0xb5, 0xc4, 0xb3, 0xf6, 0xcf, 0xd6, 0xa3, 0xac, 0xbb, 0xf9, 0xb1, 0xbe, 0xc2, 0xfa, 0xd7, 0xe3, 0xc1, 0xcb, 0xba, 0xba, 0xd7, 0xd6, 0xb5, 0xc4, 0xbc, 0xc6, 0xcb, 0xe3, 0xbb, 0xfa, 0xb4, 0xa6, 0xc0, 0xed, 0xd0, 0xe8, 0xd2, 0xaa, 0xa3, 0xac, 0xcb, 0xfc, 0xcb, 0xf9, 0xca, 0xd5, 0xc2, 0xbc, 0xb5, 0xc4, 0xba, 0xba, 0xd7, 0xd6, 0xd2, 0xd1, 0xbe, 0xad, 0xb8, 0xb2, 0xb8, 0xc7, 0xd6, 0xd0, 0xb9, 0xfa, 0xb4, 0xf3, 0xc2, 0xbd, 0x39, 0x39, 0x2e, 0x37, 0x35, 0x25, 0xb5, 0xc4, 0xca, 0xb9, 0xd3, 0xc3, 0xc6, 0xb5, 0xc2, 0xca, 0xa1, 0xa3, 0xb5, 0xab, 0xb6, 0xd4, 0xd3, 0xda, 0xc8, 0xcb, 0xc3, 0xfb},
 			"GB 2312的出现，基本满足了汉字的计算机处理需要，它所收录的汉字已经覆盖中国大陆99.75%的使用频率。但对于人名",
 		},

 		{ // encoded by iconv; text from https://jp.wikipedia.org/wiki/Shift_JIS
-			map[string]string{"charset": "shift-jis"},
+			"shift-jis",
 			[]byte{0x95, 0xb6, 0x8e, 0x9a, 0x95, 0x84, 0x8d, 0x86, 0x89, 0xbb, 0x95, 0xfb, 0x8e, 0xae, 0x53, 0x68, 0x69, 0x66, 0x74, 0x5f, 0x4a, 0x49, 0x53, 0x82, 0xcc, 0x90, 0xdd, 0x8c, 0x76, 0x8e, 0xd2, 0x82, 0xe7, 0x82, 0xcd, 0x81, 0x41, 0x90, 0xe6, 0x8d, 0x73, 0x82, 0xb5, 0x82, 0xc4, 0x82, 0xe6, 0x82, 0xad, 0x97, 0x98, 0x97, 0x70, 0x82, 0xb3, 0x82, 0xea, 0x82, 0xc4, 0x82, 0xa2, 0x82, 0xbd, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x30, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x31, 0x81, 0x6a, 0x82, 0xcc, 0x38, 0x83, 0x72, 0x83, 0x62, 0x83, 0x67, 0x95, 0x84, 0x8d, 0x86, 0x81, 0x69, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x89, 0x70, 0x90, 0x94, 0x8e, 0x9a, 0x81, 0x45, 0x94, 0xbc, 0x8a, 0x70, 0x83, 0x4a, 0x83, 0x69, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xc6, 0x81, 0x41, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x36, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x38, 0x81, 0x41, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x8a, 0xbf, 0x8e, 0x9a, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xcc, 0x97, 0xbc, 0x95, 0xb6, 0x8e, 0x9a, 0x8f, 0x57, 0x8d, 0x87, 0x82, 0xf0, 0x95, 0x5c, 0x8c, 0xbb, 0x82, 0xb5, 0x82, 0xe6, 0x82, 0xa4, 0x82, 0xc6, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42, 0x82, 0xdc, 0x82, 0xbd, 0x81, 0x41, 0x83, 0x74, 0x83, 0x40, 0x83, 0x43, 0x83, 0x8b, 0x82, 0xcc, 0x91, 0xe5, 0x82, 0xab, 0x82, 0xb3, 0x82, 0xe2, 0x8f, 0x88, 0x97, 0x9d, 0x8e, 0x9e, 0x8a, 0xd4, 0x82, 0xcc, 0x92, 0x5a, 0x8f, 0x6b, 0x82, 0xf0, 0x90, 0x7d, 0x82, 0xe9, 0x82, 0xbd, 0x82, 0xdf, 0x81, 0x41, 0x83, 0x47, 0x83, 0x58, 0x83, 0x50, 0x81, 0x5b, 0x83, 0x76, 0x83, 0x56, 0x81, 0x5b, 0x83, 0x50, 0x83, 0x93, 0x83, 0x58, 0x82, 0xc8, 0x82, 0xb5, 0x82, 0xc5, 0x8d, 0xac, 0x8d, 0xdd, 0x89, 0xc2, 0x94, 0x5c, 0x82, 0xc9, 0x82, 0xb7, 0x82, 0xe9, 0x82, 0xb1, 0x82, 0xc6, 0x82, 0xf0, 0x8a, 0xe9, 0x90, 0x7d, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42},
 			"文字符号化方式Shift_JISの設計者らは、先行してよく利用されていたJIS C 6220（現在のJIS X 0201）の8ビット符号（以下「英数字・半角カナ」）と、JIS C 6226（現在のJIS X 0208、以下「漢字」）の両文字集合を表現しようとした。また、ファイルの大きさや処理時間の短縮を図るため、エスケープシーケンスなしで混在可能にすることを企図した。",
 		},
@ -417,7 +417,7 @@ func TestEncodeReader(t *testing.T) {
 	for _, val := range testData {
 		//fmt.Println("Testing ", val)
 		expected := []byte(val.message)
-		decoded, err := DecodeCharset(val.original, val.params)
+		decoded, err := DecodeCharset(val.original, "text/plain; charset="+val.charset)
 		if len(expected) == 0 {
 			if err == nil {
 				t.Error("Expected err but have ", err)
@ -434,10 +434,10 @@ func TestEncodeReader(t *testing.T) {
 		if bytes.Equal(decoded, expected) {
 			// fmt.Println("Successful decoding of ", val.charset, ":", string(decoded))
 		} else {
-			t.Error("Wrong encoding of ", val.params, ".Expected\n", expected, "\nbut have\n", decoded)
+			t.Error("Wrong encoding of ", val.charset, ".Expected\n", expected, "\nbut have\n", decoded)
 		}
 		if strings.Compare(val.message, string(decoded)) != 0 {
-			t.Error("Wrong message for ", val.params, ".Expected\n", val.message, "\nbut have\n", string(decoded))
+			t.Error("Wrong message for ", val.charset, ".Expected\n", val.message, "\nbut have\n", string(decoded))
 		}
 	}
 }