feat: [GODT-360] detect charset embedded in html and xml
This commit is contained in:
@ -330,81 +330,81 @@ func TestGetEncoding(t *testing.T) {
|
||||
func TestEncodeReader(t *testing.T) {
|
||||
// define test data
|
||||
testData := []struct {
|
||||
params map[string]string
|
||||
charset string
|
||||
original []byte
|
||||
message string
|
||||
}{
|
||||
// russian
|
||||
{
|
||||
map[string]string{"charset": "koi8-r"},
|
||||
"koi8-r",
|
||||
// а, з, б, у, к, а, а, б, в, г, д, е, ё
|
||||
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "KOI8-R"},
|
||||
"KOI8-R",
|
||||
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "csKOI8R"},
|
||||
"csKOI8R",
|
||||
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "koi8-u"},
|
||||
"koi8-u",
|
||||
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "iso-8859-5"},
|
||||
"iso-8859-5",
|
||||
// а , з , б , у , к , а , а , б , в , г , д , е , ё
|
||||
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xF1},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "csWrong"},
|
||||
"csWrong",
|
||||
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6},
|
||||
"",
|
||||
},
|
||||
{
|
||||
map[string]string{"charset": "utf8"},
|
||||
"utf8",
|
||||
[]byte{0xD0, 0xB0, 0xD0, 0xB7, 0xD0, 0xB1, 0xD1, 0x83, 0xD0, 0xBA, 0xD0, 0xB0, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x91},
|
||||
"азбукаабвгдеё",
|
||||
},
|
||||
// czechoslovakia
|
||||
{
|
||||
map[string]string{"charset": "windows-1250"},
|
||||
"windows-1250",
|
||||
[]byte{225, 228, 232, 233, 236, 244},
|
||||
"áäčéěô",
|
||||
},
|
||||
// umlauts
|
||||
{
|
||||
map[string]string{"charset": "iso-8859-1"},
|
||||
"iso-8859-1",
|
||||
[]byte{196, 203, 214, 220, 228, 235, 246, 252},
|
||||
"ÄËÖÜäëöü",
|
||||
},
|
||||
// latvia
|
||||
{
|
||||
map[string]string{"charset": "iso-8859-4"},
|
||||
"iso-8859-4",
|
||||
[]byte{224, 239, 243, 182, 254},
|
||||
"āīķļū",
|
||||
},
|
||||
{ // encoded by https://www.motobit.com/util/charset-codepage-conversion.asp
|
||||
map[string]string{"charset": "utf7"},
|
||||
"utf7",
|
||||
[]byte("He wes Leovena+APA-es sone -- li+APA-e him be Drihten.+A6QDtw- +A7MDuwPOA8MDwwOx- +A7wDvwPF- +A60DtAPJA8MDsQO9- +A7UDuwO7A7cDvQO5A7oDrg-. +BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+C68LvguuC7ELvwuoC80LpA- +C64Lygu0C78LlQuzC78LsgvH- +C6QLrgu/C7QLzQuuC8oLtAu/- +C6oLywuyC80- +C4cLqQu/C6QLvgu1C6QLwQ- +C44LmQvNC5ULwQuuC80- +C5ULvgujC8sLrgvN-."),
|
||||
"He wes Leovenaðes sone -- liðe him be Drihten.Τη γλώσσα μου έδωσαν ελληνική. Чернели избы здесь и там,Чернели избы здесь и там,யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்.",
|
||||
},
|
||||
|
||||
// iconv -f UTF8 -t GB2312 utf8.txt | hexdump -v -e '"0x" 1/1 "%x, "'
|
||||
{ // encoded by iconv; dump by `cat gb2312.txt | hexdump -v -e '"0x" 1/1 "%x "'` and reformat; text from https://zh.wikipedia.org/wiki/GB_2312
|
||||
map[string]string{"charset": "GB2312"},
|
||||
"GB2312",
|
||||
[]byte{0x47, 0x42, 0x20, 0x32, 0x33, 0x31, 0x32, 0xb5, 0xc4, 0xb3, 0xf6, 0xcf, 0xd6, 0xa3, 0xac, 0xbb, 0xf9, 0xb1, 0xbe, 0xc2, 0xfa, 0xd7, 0xe3, 0xc1, 0xcb, 0xba, 0xba, 0xd7, 0xd6, 0xb5, 0xc4, 0xbc, 0xc6, 0xcb, 0xe3, 0xbb, 0xfa, 0xb4, 0xa6, 0xc0, 0xed, 0xd0, 0xe8, 0xd2, 0xaa, 0xa3, 0xac, 0xcb, 0xfc, 0xcb, 0xf9, 0xca, 0xd5, 0xc2, 0xbc, 0xb5, 0xc4, 0xba, 0xba, 0xd7, 0xd6, 0xd2, 0xd1, 0xbe, 0xad, 0xb8, 0xb2, 0xb8, 0xc7, 0xd6, 0xd0, 0xb9, 0xfa, 0xb4, 0xf3, 0xc2, 0xbd, 0x39, 0x39, 0x2e, 0x37, 0x35, 0x25, 0xb5, 0xc4, 0xca, 0xb9, 0xd3, 0xc3, 0xc6, 0xb5, 0xc2, 0xca, 0xa1, 0xa3, 0xb5, 0xab, 0xb6, 0xd4, 0xd3, 0xda, 0xc8, 0xcb, 0xc3, 0xfb},
|
||||
"GB 2312的出现,基本满足了汉字的计算机处理需要,它所收录的汉字已经覆盖中国大陆99.75%的使用频率。但对于人名",
|
||||
},
|
||||
|
||||
{ // encoded by iconv; text from https://ja.wikipedia.org/wiki/Shift_JIS
|
||||
map[string]string{"charset": "shift-jis"},
|
||||
"shift-jis",
|
||||
[]byte{0x95, 0xb6, 0x8e, 0x9a, 0x95, 0x84, 0x8d, 0x86, 0x89, 0xbb, 0x95, 0xfb, 0x8e, 0xae, 0x53, 0x68, 0x69, 0x66, 0x74, 0x5f, 0x4a, 0x49, 0x53, 0x82, 0xcc, 0x90, 0xdd, 0x8c, 0x76, 0x8e, 0xd2, 0x82, 0xe7, 0x82, 0xcd, 0x81, 0x41, 0x90, 0xe6, 0x8d, 0x73, 0x82, 0xb5, 0x82, 0xc4, 0x82, 0xe6, 0x82, 0xad, 0x97, 0x98, 0x97, 0x70, 0x82, 0xb3, 0x82, 0xea, 0x82, 0xc4, 0x82, 0xa2, 0x82, 0xbd, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x30, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x31, 0x81, 0x6a, 0x82, 0xcc, 0x38, 0x83, 0x72, 0x83, 0x62, 0x83, 0x67, 0x95, 0x84, 0x8d, 0x86, 0x81, 0x69, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x89, 0x70, 0x90, 0x94, 0x8e, 0x9a, 0x81, 0x45, 0x94, 0xbc, 0x8a, 0x70, 0x83, 0x4a, 0x83, 0x69, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xc6, 0x81, 0x41, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x36, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x38, 0x81, 0x41, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x8a, 0xbf, 0x8e, 0x9a, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xcc, 0x97, 0xbc, 0x95, 0xb6, 0x8e, 0x9a, 0x8f, 0x57, 0x8d, 0x87, 0x82, 0xf0, 0x95, 0x5c, 0x8c, 0xbb, 0x82, 0xb5, 0x82, 0xe6, 0x82, 0xa4, 0x82, 0xc6, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42, 0x82, 0xdc, 0x82, 0xbd, 0x81, 0x41, 0x83, 0x74, 0x83, 0x40, 0x83, 0x43, 0x83, 0x8b, 0x82, 0xcc, 0x91, 0xe5, 0x82, 0xab, 0x82, 0xb3, 0x82, 0xe2, 0x8f, 0x88, 0x97, 0x9d, 0x8e, 0x9e, 0x8a, 0xd4, 0x82, 0xcc, 0x92, 0x5a, 0x8f, 0x6b, 0x82, 0xf0, 0x90, 0x7d, 0x82, 0xe9, 0x82, 0xbd, 0x82, 0xdf, 0x81, 0x41, 0x83, 0x47, 0x83, 0x58, 0x83, 0x50, 0x81, 0x5b, 0x83, 0x76, 0x83, 0x56, 0x81, 0x5b, 0x83, 0x50, 0x83, 0x93, 0x83, 0x58, 0x82, 0xc8, 0x82, 0xb5, 0x82, 0xc5, 0x8d, 0xac, 0x8d, 0xdd, 0x89, 0xc2, 0x94, 0x5c, 0x82, 0xc9, 0x82, 0xb7, 0x82, 0xe9, 0x82, 0xb1, 0x82, 0xc6, 0x82, 0xf0, 0x8a, 0xe9, 0x90, 0x7d, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42},
|
||||
"文字符号化方式Shift_JISの設計者らは、先行してよく利用されていたJIS C 6220(現在のJIS X 0201)の8ビット符号(以下「英数字・半角カナ」)と、JIS C 6226(現在のJIS X 0208、以下「漢字」)の両文字集合を表現しようとした。また、ファイルの大きさや処理時間の短縮を図るため、エスケープシーケンスなしで混在可能にすることを企図した。",
|
||||
},
|
||||
@ -417,7 +417,7 @@ func TestEncodeReader(t *testing.T) {
|
||||
for _, val := range testData {
|
||||
//fmt.Println("Testing ", val)
|
||||
expected := []byte(val.message)
|
||||
decoded, err := DecodeCharset(val.original, val.params)
|
||||
decoded, err := DecodeCharset(val.original, "text/plain; charset="+val.charset)
|
||||
if len(expected) == 0 {
|
||||
if err == nil {
|
||||
t.Error("Expected err but have ", err)
|
||||
@ -434,10 +434,10 @@ func TestEncodeReader(t *testing.T) {
|
||||
if bytes.Equal(decoded, expected) {
|
||||
// fmt.Println("Successful decoding of ", val.params, ":", string(decoded))
|
||||
} else {
|
||||
t.Error("Wrong encoding of ", val.params, ".Expected\n", expected, "\nbut have\n", decoded)
|
||||
t.Error("Wrong encoding of ", val.charset, ".Expected\n", expected, "\nbut have\n", decoded)
|
||||
}
|
||||
if strings.Compare(val.message, string(decoded)) != 0 {
|
||||
t.Error("Wrong message for ", val.params, ".Expected\n", val.message, "\nbut have\n", string(decoded))
|
||||
t.Error("Wrong message for ", val.charset, ".Expected\n", val.message, "\nbut have\n", string(decoded))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user