aboutsummaryrefslogtreecommitdiff
path: root/unicode.go
diff options
context:
space:
mode:
authorMarin Ivanov <[email protected]>2024-03-29 00:01:46 +0200
committerMarin Ivanov <[email protected]>2024-03-29 00:01:46 +0200
commit90394d8b5b2a4c6bcf29692e6d4ab79c598928b1 (patch)
treeb509478f05466384e1995893d09589cfb7518ca9 /unicode.go
parentd0c92f7cb3bb581db84dede75c53269868d5eed1 (diff)
wip
Diffstat (limited to 'unicode.go')
-rw-r--r--unicode.go28
1 files changed, 28 insertions, 0 deletions
diff --git a/unicode.go b/unicode.go
new file mode 100644
index 0000000..e145d28
--- /dev/null
+++ b/unicode.go
@@ -0,0 +1,28 @@
+package main
+
+import (
+ "unicode/utf8"
+)
+
+// detectUTF8 reports whether s is valid UTF-8 (valid) and whether s contains
+// any rune below 0x20, above 0x7d, or equal to 0x5c (backslash), in which case
+// it must be treated as UTF-8 rather than CP-437, ASCII, or similar (require).
+func detectUTF8(s string) (valid, require bool) {
+ for i := 0; i < len(s); {
+ r, size := utf8.DecodeRuneInString(s[i:])
+ i += size
+ // Officially, ZIP uses CP-437, but many readers use the system's
+ // local character encoding. Most encodings are compatible with a large
+ // subset of CP-437, which itself is ASCII-like. Forbid 0x7e and 0x5c
+ // since EUC-KR and Shift-JIS replace those characters with localized
+ // currency and overline characters. A RuneError of size 1 marks a
+ // malformed byte; a correctly encoded U+FFFD decodes with a larger size.
+ if r < 0x20 || r > 0x7d || r == 0x5c {
+ if !utf8.ValidRune(r) || (r == utf8.RuneError && size == 1) {
+ return false, false // malformed input: s is not valid UTF-8
+ }
+ require = true // well-formed rune, but outside the portable subset
+ }
+ }
+ return true, require
+}