1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}
{- |
Module : Text.Pandoc.URI
Copyright : Copyright (C) 2006-2024 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <[email protected]>
Stability : alpha
Portability : portable
-}
module Text.Pandoc.URI ( urlEncode
, escapeURI
, isURI
, schemes
, uriPathToPath
, pBase64DataURI
) where
import qualified Network.HTTP.Types as HTTP
import Data.ByteString.Base64 (decodeLenient)
import Text.Pandoc.MIME (MimeType)
import qualified Data.ByteString as B
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Data.Text as T
import qualified Data.Set as Set
import Data.Char (isSpace, isAscii, isHexDigit, chr)
import Safe (readMay)
import Network.URI (URI (uriScheme), parseURI, escapeURIString)
import qualified Data.Attoparsec.Text as A
import Data.Text.Encoding (encodeUtf8)
import Control.Applicative (many, (<|>))
urlEncode :: T.Text -> T.Text
urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText
-- | Escape whitespace and some punctuation characters in URI.
escapeURI :: T.Text -> T.Text
escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack
where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`"
--
-- IANA URIs
--
-- | Schemes from http://www.iana.org/assignments/uri-schemes.html plus
-- the unofficial schemes doi, javascript, isbn, pmid.
schemes :: Set.Set T.Text
schemes = Set.fromList
-- Official IANA schemes
[ "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra", "afp", "afs"
, "aim", "appdata", "apt", "attachment", "aw", "barion", "beshare", "bitcoin"
, "blob", "bolo", "browserext", "callto", "cap", "chrome", "chrome-extension"
, "cid", "coap", "coaps", "com-eventbrite-attendee", "content", "crid", "cvs"
, "data", "dav", "dict", "dis", "dlna-playcontainer", "dlna-playsingle"
, "dns", "dntp", "dtn", "dvb", "ed2k", "example", "facetime", "fax", "feed"
, "feedready", "file", "filesystem", "finger", "fish", "ftp", "geo", "gg"
, "git", "gizmoproject", "go", "gopher", "graph", "gtalk", "h323", "ham"
, "hcp", "http", "https", "hxxp", "hxxps", "hydrazone", "iax", "icap", "icon"
, "im", "imap", "info", "iotdisco", "ipn", "ipp", "ipps", "irc", "irc6"
, "ircs", "iris", "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs"
, "isostore", "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap"
, "ldaps", "lvlt", "magnet", "mailserver", "mailto", "maps", "market"
, "message", "mid", "mms", "modem", "mongodb", "moz", "ms-access"
, "ms-browser-extension", "ms-drive-to", "ms-enrollment", "ms-excel"
, "ms-gamebarservices", "ms-getoffice", "ms-help", "ms-infopath"
, "ms-media-stream-id", "ms-officeapp", "ms-project", "ms-powerpoint"
, "ms-publisher", "ms-search-repair", "ms-secondary-screen-controller"
, "ms-secondary-screen-setup", "ms-settings", "ms-settings-airplanemode"
, "ms-settings-bluetooth", "ms-settings-camera", "ms-settings-cellular"
, "ms-settings-cloudstorage", "ms-settings-connectabledevices"
, "ms-settings-displays-topology", "ms-settings-emailandaccounts"
, "ms-settings-language", "ms-settings-location", "ms-settings-lock"
, "ms-settings-nfctransactions", "ms-settings-notifications"
, "ms-settings-power", "ms-settings-privacy", "ms-settings-proximity"
, "ms-settings-screenrotation", "ms-settings-wifi", "ms-settings-workplace"
, "ms-spd", "ms-sttoverlay", "ms-transit-to", "ms-virtualtouchpad"
, "ms-visio", "ms-walk-to", "ms-whiteboard", "ms-whiteboard-cmd", "ms-word"
, "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs"
, "ni", "nih", "nntp", "notes", "ocf", "oid", "onenote", "onenote-cmd"
, "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop"
, "pres", "prospero", "proxy", "pwid", "psyc", "qb", "query", "redis"
, "rediss", "reload", "res", "resource", "rmi", "rsync", "rtmfp", "rtmp"
, "rtsp", "rtsps", "rtspu", "secondlife", "service", "session", "sftp", "sgn"
, "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "smtp", "snews"
, "snmp", "soap.beep", "soap.beeps", "soldat", "spotify", "ssh", "steam"
, "stun", "stuns", "submit", "svn", "tag", "teamspeak", "tel", "teliaeid"
, "telnet", "tftp", "things", "thismessage", "tip", "tn3270", "tool", "turn"
, "turns", "tv", "udp", "unreal", "urn", "ut2004", "v-event", "vemmi"
, "ventrilo", "videotex", "vnc", "view-source", "wais", "webcal", "wpid"
, "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire"
, "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr", "z39.50", "z39.50r"
, "z39.50s"
-- Unofficial schemes
, "doi", "gemini", "isbn", "javascript", "pmid"
]
-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: T.Text -> Bool
isURI t =
-- If it's a base 64 data: URI, avoid the expensive call to parseURI:
case A.parseOnly (pBase64DataURI *> A.endOfInput) t of
Right () -> True
Left _ ->
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack $ t
where
hasKnownScheme =
(`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme
-- | Converts the path part of a file: URI to a regular path.
-- On windows, @/c:/foo@ should be @c:/foo@.
-- On linux, @/foo@ should be @/foo@.
uriPathToPath :: T.Text -> FilePath
uriPathToPath (T.unpack -> path) =
#ifdef _WINDOWS
case path of
'/':ps -> ps
ps -> ps
#else
path
#endif
pBase64DataURI :: A.Parser (B.ByteString, MimeType)
pBase64DataURI = base64uri
where
base64uri = do
A.string "data:"
mime <- do
n1 <- restrictedName
A.char '/'
n2 <- restrictedName
mps <- many mediaParam
pure $ n1 <> "/" <> n2 <> mconcat mps
A.string ";base64,"
b64 <- mconcat <$> many
(A.takeWhile1 (A.inClass "A-Za-z0-9/+ \t\r\n") <|> percentOctet)
A.skipWhile (== '=')
-- this decode should be lazy:
pure (decodeLenient (encodeUtf8 b64), mime)
percentOctet = do
A.char '%'
x <- A.satisfy isHexDigit
y <- A.satisfy isHexDigit
case readMay ['0','x',x,y] of
Nothing -> fail $ "Could not read percent encoded byte " <> [x,y]
Just d -> pure $ T.singleton $ chr d
restrictedName = do
c <- A.satisfy (A.inClass "A-Za-z0-9")
rest <- A.takeWhile (A.inClass "A-Za-z0-9!#$&^_.+-")
pure $ T.singleton c <> rest
mediaParam = do
A.char ';'
A.skipWhile isSpace
k <- restrictedName
A.char '='
v <- A.takeWhile (/=';')
pure $ ";" <> k <> "=" <> v
|