Detect charset and convert non UTF-8 files for display (#4950)

* Detect charset and convert non UTF-8 files for display

* Refactor and move function to correct module

* Revert unrelated changes

* More unrelated changes

* Duplicate content for small text to have better encoding detection

* Check if original content is valid before duplicating it
This commit is contained in:
Lauris BH 2018-09-29 11:33:54 +03:00 committed by Lunny Xiao
parent 6780661192
commit 81702e6ec9
3 changed files with 44 additions and 4 deletions

View file

@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) {
return "UTF-8", nil
}
result, err := chardet.NewTextDetector().DetectBest(content)
textDetector := chardet.NewTextDetector()
var detectContent []byte
if len(content) < 1024 {
// Check if original content is valid
if _, err := textDetector.DetectBest(content); err != nil {
return "", err
}
times := 1024 / len(content)
detectContent = make([]byte, 0, times*len(content))
for i := 0; i < times; i++ {
detectContent = append(detectContent, content...)
}
} else {
detectContent = content
}
result, err := textDetector.DetectBest(detectContent)
if err != nil {
return "", err
}