1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
|
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TypeApplications #-}
{- |
Module : Text.Pandoc.Readers.Pod
Copyright : © 2024 Evan Silberman
License : GNU GPL, version 2 or above
Maintainer : Evan Silberman <[email protected]>
Stability : WIP
Portability : portable
Conversion of Pod to 'Pandoc' documents
-}
module Text.Pandoc.Readers.Pod (readPod) where
import Control.Monad (void)
import Control.Monad.Except (throwError)
import Data.Char (isAsciiUpper, digitToInt)
import Data.Default (Default)
import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Parsing.General (isSpaceChar)
import Text.Pandoc.XML (lookupEntity)
import Text.Pandoc.Class.PandocMonad (PandocMonad(..))
import Text.Pandoc.Definition (Pandoc)
import Text.Pandoc.Error
import Text.Pandoc.Builder (Blocks, Inlines)
import qualified Text.Pandoc.Builder as B
import qualified Data.Text as T
import qualified Data.Text.Read as TR
import Text.Pandoc.Shared (stringify, textToIdentifier, tshow)
import Data.Set (Set)
import Data.Functor (($>))
import Data.Maybe (listToMaybe, fromMaybe)
import Numeric (readOct)
-- | Mutable state carried by the Pod parser.
data PodState = PodState
  { logMessages :: [LogMessage]
    -- ^ Collected log messages, newest first (see the 'HasLogMessages'
    -- instance, which reverses them on retrieval).
  , headerIds :: Set T.Text
    -- ^ Identifier set exposed through 'HasIdentifierList'.
  , options :: ReaderOptions
    -- ^ Reader options exposed through 'HasReaderOptions'.
  }
  deriving (Show)
-- | Messages are consed onto the front of the list as they arrive and
-- reversed when read back, so they come out in the order they were added.
instance HasLogMessages PodState where
  addLogMessage msg st = st { logMessages = msg : logMessages st }
  getLogMessages = reverse . logMessages
-- | The identifier list is the 'headerIds' field.
instance HasIdentifierList PodState where
  extractIdentifierList = headerIds
  updateIdentifierList update st = st { headerIds = update (headerIds st) }
-- | Reader options are read straight from the 'options' field.
instance HasReaderOptions PodState where
  extractReaderOptions st = options st
-- | Initial state: no log messages, no identifiers, default reader options.
instance Default PodState where
  def =
    PodState
      { options = def
      , headerIds = mempty
      , logMessages = []
      }
-- | The target of an @L@ formatting code, as classified by 'link'.
data PodLinkDestination
  = -- | A URL: its inline rendering and the same text as a plain string.
    LinkUrl Inlines T.Text
  | -- | A manual page name, optionally with a section within that page.
    LinkMan Inlines (Maybe Inlines)
  | -- | A section within the current document.
    LinkInternal Inlines
  deriving (Show)
-- | Link text to use when an @L@ code gives no explicit name before a @|@.
defaultLinkName :: PodLinkDestination -> Inlines
defaultLinkName dest = case dest of
  LinkUrl shown _ -> shown
  LinkMan page Nothing -> page
  LinkMan page (Just sec) -> B.doubleQuoted sec <> " in " <> page
  LinkInternal sec -> B.doubleQuoted sec
-- | Parser over 'Sources' input carrying a 'PodState'; it is applied to the
-- underlying monad and the result type at each use site.
type PodParser = ParsecT Sources PodState
-- | Read Pod source and produce a 'Pandoc' document.
--
-- The caller's 'ReaderOptions' are stored in the initial parser state so
-- that option lookups made while parsing (for example the
-- @getOption readerExtensions@ call in 'link') see the caller's settings
-- rather than the defaults.
--
-- A parse failure is rethrown with 'throwError'.
readPod :: (PandocMonad m, ToSources a)
        => ReaderOptions
        -> a
        -> m Pandoc
readPod opts s =
  readWithM parsePod def{ options = opts } (toSources s)
    >>= either throwError return
-- | Top-level parser for a whole input.
parsePod :: PandocMonad m => PodParser m Pandoc
parsePod = do
  -- Pod processing only begins at the first Pod command.  Input that never
  -- contains one is still valid Pod; it simply has no content.
  _ <- notPod
  body <- mconcat <$> manyTill block eof
  reportLogMessages
  return (B.doc body)
-- | One Pod paragraph: verbatim, ordinary, or command, tried in that order.
block :: PandocMonad m => PodParser m Blocks
block = choice [verbatim, paragraph, command] <?> "Pod paragraph"
-- | A command paragraph.  The leading @=@ is consumed here, so the individual
-- command parsers start at the command name.  @=item@, @=back@ and @=end@ are
-- refused because the enclosing constructs consume them themselves.
command :: PandocMonad m => PodParser m Blocks
command = do
  try $ char '=' *> notFollowedBy (choice (map string ["item", "back", "end"]))
  choice [header, pod, cut, over, for, begin, encoding] <?> "Pod command"
-- | Match the literal @nm@ as a whole word (no non-space character may
-- follow it directly) and skip any space characters after it.
cmd :: PandocMonad m => T.Text -> PodParser m ()
cmd nm = textStr nm *> notFollowedBy nonspaceChar *> skipMany spaceChar
-- | @=encoding@: the rest of the line is discarded, an 'IgnoredElement'
-- message is logged, and no blocks are produced.
encoding :: PandocMonad m => PodParser m Blocks
encoding =
  cmd "encoding"
    *> anyLine
    *> optional blanklines
    *> logMessage (IgnoredElement "=encoding; Pandoc requires UTF-8 input")
    $> mempty
-- | @=head1@ .. @=head6@: a header whose level is the digit after @head@.
-- The heading text is passed to 'registerHeader' to obtain its attributes.
header :: PandocMonad m => PodParser m Blocks
header = do
  level <- digitToInt <$> (string "head" *> oneOf "123456")
  void blankline <|> skipMany1 spaceChar
  title <- inlines
  attr <- registerHeader B.nullAttr title
  optional blanklines
  return (B.headerWith attr level title)
-- | @=pod@: any text on the command paragraph is parsed and dropped; the
-- command produces no blocks.
pod :: PandocMonad m => PodParser m Blocks
pod = cmd "pod" *> optional (try inlines) *> optional blanklines $> mempty
-- | @=cut@: leave Pod and skip everything up to the next Pod command.
cut :: PandocMonad m => PodParser m Blocks
cut = cmd "cut" >> notPod
-- | Discard whole lines until end of input or until the next line begins
-- with @=@ followed by a letter.  That line is only looked at, not consumed.
notPod :: PandocMonad m => PodParser m Blocks
notPod = mempty <$ manyTill anyLine (eof <|> commandAhead)
  where
    commandAhead = void . try . lookAhead $ char '=' *> letter
-- | An @=over@ .. @=back@ region.  The rest of the @=over@ line is ignored,
-- an immediately following @=cut@ is skipped, and the region is read as a
-- list if it starts with items and as a block quote otherwise.
over :: PandocMonad m => PodParser m Blocks
over =
  cmd "over"
    *> anyLine
    *> blanklines
    *> optional (try (char '=' *> cut))
    *> (list <|> blockquote)
    <* string "=back"
    <* blanklines
-- | A run of @=item@ paragraphs, tried as a bullet list, then an ordered
-- list, then a definition list.
list :: PandocMonad m => PodParser m Blocks
list = choice [try bulletList, try orderedList, definitionList]
-- | Items whose @=item@ line holds nothing but an optional @*@.
bulletList :: PandocMonad m => PodParser m Blocks
bulletList = B.bulletList <$> many1 (item bullet)
  where
    bullet = skipMany spaceChar *> optional (char '*')
-- | An ordered list: the first item must be numbered @1@ (optionally @1.@);
-- later items may carry any run of digits with an optional dot.
orderedList :: PandocMonad m => PodParser m Blocks
orderedList = B.orderedList <$> ((:) <$> firstItem <*> many laterItem)
  where
    firstItem = item (spaces *> char '1' *> optional (char '.'))
    laterItem = item (spaces *> many digit *> optional (char '.'))
-- | A single @=item@: the command word, then the supplied marker parser, then
-- blank lines, then the blocks that make up the item body.
item :: PandocMonad m => PodParser m () -> PodParser m Blocks
item marker = try (cmd "=item") *> marker *> blanklines *> body
  where
    body = mconcat <$> many block <?> "runaway item"
-- | A definition list: every @=item@ line supplies the term and the blocks
-- after it form the definition.
definitionList :: PandocMonad m => PodParser m Blocks
definitionList = B.definitionList <$> many1 entry
  where
    entry = do
      term <- try (cmd "=item") *> spaces *> inlines <* blanklines
      -- perlpodspec sez the /section part of a link can refer to either
      -- a header or a dl item, hence treating it as a "header" here
      attr <- registerHeader B.nullAttr term
      body <- mconcat <$> many block <?> "runaway dlitem"
      return (B.spanWith attr term, [body])
-- | The body of an @=over@ region without items, wrapped in a block quote.
blockquote :: PandocMonad m => PodParser m Blocks
blockquote =
  fmap (B.blockQuote . mconcat) (many block) <?> "runaway blockquote"
-- | An ordinary paragraph: anything that does not start with @=@ followed by
-- a letter.  Surrounding whitespace is trimmed from the inline content.
paragraph :: PandocMonad m => PodParser m Blocks
paragraph =
  B.para . B.trimInlines
    <$> (notCommand *> inlines <* optional blanklines)
  where
    notCommand = try (notFollowedBy (char '=' *> letter))
-- | One or more inline elements: formatting codes, whitespace, or plain text.
inlines :: PandocMonad m => PodParser m Inlines
inlines = mconcat <$> many1 (choice [format, whitespace, str])
-- | A formatting code: one ASCII capital letter immediately followed by an
-- angle-bracketed argument.
--
-- perlpodspec sez:
--    If a Pod processor sees any formatting code other than the ones listed,
--    that processor must by default treat this as an error.
format :: PandocMonad m => PodParser m Inlines
format = try $ do
  code <- satisfy isAsciiUpper
  pos <- getPosition
  _ <- lookAhead (char '<')
  case code of
    'B' -> B.strong <$> argument
    'C' -> B.code . stringify <$> argument
    'F' -> B.spanWith (mempty, ["filename"], mempty) <$> argument
    'I' -> B.emph <$> argument
    'S' -> argument -- TODO map nbsps
    'X' -> mempty <$ argument
    'Z' -> mempty <$ argument
    'E' -> do
      name <- stringify <$> argument
      maybe (unknownEntity pos name) (return . B.str) (entity name)
    'L' -> link
    other ->
      throwError $ PandocParseError $
        T.snoc "unknown Pod formatting code " other
  where
    -- per spec:
    --   Pod parsers, when faced with some unknown "E<identifier>" code,
    --   shouldn't simply replace it with nullstring (by default, at
    --   least), but may pass it through as a string consisting of the
    --   literal characters E, less-than, identifier, greater-than.
    unknownEntity pos name = do
      logMessage $ SkippedContent ("unknown entity " <> name) pos
      return $ B.str $ "E<" <> name <> ">"

    -- The expanded form is tried first; on failure the input is re-read as
    -- the compact single-bracket form.
    argument = try expandedArg <|> compactArg <?> "argument"

    -- Compact form: a single '<', content, a single '>'.
    compactArg = do
      _ <- char '<'
      mconcat <$> manyTill (format <|> whitespace <|> innerStr) (char '>')

    innerStr = B.str <$> many1Char (podCharLess ">")

    -- Expanded form: N opening brackets, whitespace, content, whitespace,
    -- and the same number of closing brackets.
    expandedArg = do
      openLen <- length <$> many1 (char '<')
      let close = T.pack (replicate openLen '>')
      skipMany1 spaceChar <|> void blankline
      arg <- mconcat <$> many
               (format
                  <|> try (whitespace <* notFollowedBy (textStr close))
                  <|> str)
      _ <- many1 spaceChar
      _ <- textStr close
      return arg

    oct = listToMaybe . readOct @Integer

    -- Some legacy entity names are required to be parsed by Pod formatters
    entity "apos" = Just "'"
    entity "sol" = Just "/"
    entity "verbar" = Just "|"
    entity "lchevron" = Just "«"
    entity "rchevron" = Just "»"
    entity (T.stripPrefix "0x" -> Just suf) = lookupEntity $ "#x" <> suf
    entity (T.stripPrefix "0" -> Just suf)
      | Just (n, "") <- oct (T.unpack suf) = lookupEntity $ "#" <> tshow n
    entity (TR.decimal @Integer -> Right (x, "")) =
      lookupEntity $ "#" <> tshow x
    entity x = lookupEntity x
-- | The argument of an @L@ formatting code, turned into a link.
--
-- god knows there must be a higher order way of writing this thing, where we
-- have multiple different possible parser states within the link argument
-- varying depending on whether the link is expanded or not, but at least I
-- understand what I've done. This would be less wacky with a lexing step.
link :: PandocMonad m => PodParser m Inlines
link = do
  toIdent <- textToIdentifier <$> getOption readerExtensions
  (label, target) <- try expandedLinkArg <|> compactLinkArg
  return $ mkLink toIdent label target
  where
    -- Compact form: single brackets, ordinary whitespace handling.
    compactLinkArg = do
      _ <- char '<'
      label <- linkName whitespace ">"
      target <- linkDest whitespace (char '>') ">"
      _ <- char '>'
      return (mconcat <$> label, target)

    -- Expanded form: N opening brackets and whitespace-padded content;
    -- whitespace that leads into the closing brackets is not content.
    expandedLinkArg = do
      openLen <- length <$> many1 (char '<')
      let closeStr = textStr (T.pack $ replicate openLen '>')
          close = skipMany1 spaceChar *> closeStr
          sp = try $ many1 spaceChar *> notFollowedBy closeStr $> B.space
      _ <- many1 spaceChar
      label <- linkName sp ""
      target <- linkDest sp close ""
      _ <- close
      return (mconcat <$> label, target)

    -- Build the final link; a missing name falls back to 'defaultLinkName'.
    mkLink toIdent label target =
      case target of
        LinkUrl _ href -> B.link href "" shown
        LinkMan nm sec ->
          B.linkWith
            ( mempty
            , mempty
            , ("manual", stringify nm)
                : [("section", stringify sc) | Just sc <- [sec]]
            )
            ""
            ""
            shown
        LinkInternal sc -> B.link ("#" <> toIdent (stringify sc)) "" shown
      where
        shown = fromMaybe (defaultLinkName target) label

    -- Optional link text: everything up to a '|'.
    linkName sp ex = optionMaybe . try $ many piece <* char '|'
      where
        piece =
          try format
            <|> sp
            <|> B.str <$> many1Char (podCharLess ('|' : ex))

    linkDest sp close ex =
      try (url ex) <|> internal sp close ex <|> man sp close ex

    -- perlpodspec sez:
    --    Note that you can distinguish URL-links from anything else by the
    --    fact that they match m/\A\w+:[^:\s]\S*\z/.
    -- This is obviously not an RFC-compliant matcher for a URI scheme, but
    -- this is what the specification and the canonical implementation
    -- (Pod::Simple) do for deciding that a link target "looks like" a URL,
    -- as opposed to a manual page reference, so what we are doing here is
    -- roughly equivalent even though it is nonsense
    url ex = do
      scheme <- many1Char (letter <|> digit <|> char '_')
      colon <- T.singleton <$> char ':' <* notFollowedBy (char ':')
      rest <- many (format <|> B.str <$> many1Char (podCharLess ex))
      let shown = B.str scheme <> B.str colon <> mconcat rest
          href = scheme <> colon <> stringify rest
      return (LinkUrl shown href)

    -- A section name in double quotes; a '"' inside the name is accepted as
    -- long as the closing delimiter does not follow it.
    quotedSection sp close ex =
      char '"' *> (mconcat <$> many1 (format <|> sp <|> quotedStr))
        <* char '"'
      where
        quotedStr =
          B.str <$> many1Char
            (podCharLess ('"' : ex)
               <|> try (char '"' <* notFollowedBy close))

    section sp close ex =
      try (quotedSection sp close ex)
        <|> mconcat <$> many1
              (format <|> sp <|> B.str <$> many1Char (podCharLess ex))

    -- "/section": a target inside the current document.
    internal sp close ex = char '/' *> (LinkInternal <$> section sp close ex)

    notSlash sp ex =
      format <|> sp <|> B.str <$> many1Char (podCharLess ('/' : ex))

    -- "page" or "page/section": a manual page reference.
    man sp close ex =
      LinkMan
        <$> (mconcat <$> many (notSlash sp ex))
        <*> optionMaybe (char '/' *> section sp close ex)
-- | Inline whitespace, rendered as a single space: either space characters
-- with an optional line break, or a line break on its own.  It fails,
-- consuming nothing, when a blank line follows.
whitespace :: PandocMonad m => PodParser m Inlines
whitespace = try (B.space <$ (gap *> notFollowedBy blankline))
  where
    gap =
      (many1 spaceChar *> optional newline)
        <|> (many spaceChar *> void newline)
-- | A single character of plain text, excluding the characters in @exclude@.
-- An ASCII capital letter is accepted only when no @<@ follows it directly,
-- since that would start a formatting code.
podCharLess :: PandocMonad m => String -> PodParser m Char
podCharLess exclude = bareCapital <|> satisfy ordinary
  where
    bareCapital = try $ satisfy isAsciiUpper <* notFollowedBy (char '<')
    ordinary c =
      not (isSpaceChar c) && not (isAsciiUpper c) && c `notElem` exclude
-- | A single character of plain text: 'podCharLess' with nothing excluded.
podChar :: PandocMonad m => PodParser m Char
podChar = podCharLess ""
-- | A maximal run of plain-text characters as one string inline.
str :: PandocMonad m => PodParser m Inlines
str = fmap B.str (many1Char podChar)
-- | A line with at least one non-space character, returned with its leading
-- space characters and whatever 'anyLineNewline' yields for the remainder.
nonEmptyLine :: PandocMonad m => PodParser m T.Text
nonEmptyLine = try $ do
  indent <- manyChar spaceChar
  firstChar <- nonspaceChar
  rest <- anyLineNewline
  return $ indent <> T.cons firstChar rest
-- | A verbatim paragraph.  It starts with an indented, non-empty line and
-- continues over further non-empty lines.  Blank lines are kept only when
-- another indented line follows them.  The result is a code block.
verbatim :: PandocMonad m => PodParser m Blocks
verbatim = do
  firstLine <- indentedLine
  moreLines <- many (nonEmptyLine <|> try (blanklines <> indentedLine))
  optional blanklines
  return . B.codeBlock . mconcat $ firstLine : moreLines
  where
    indentedLine = many1Char spaceChar <> nonEmptyLine
-- =begin/=end/=for and data paragraphs
--
-- The =begin/=end (and single-paragraph =for variant) markers in Pod are
-- designed as an extension point for specific formatters.
--
-- This doesn't strictly match the intent of "=begin :ident" pod blocks, which
-- are still meant to be processed specially by the formatter, and only land in
-- the output upon request, i.e. pod2html will process "=begin :html" blocks as
-- Pod and include them in the regular output. Since the regions contain Pod
-- markup it seems to me that the best thing to do is parse the markup and put
-- a classname on it, allowing users to respond as desired with filters.
-- Pandoc doesn't have a built-in concept of parsed Divs that are only rendered
-- to certain formats, just raw blocks.
--
-- perlpodspec allows nesting of =begin/=end regions, but we currently don't,
-- because it would be annoying and we have something somewhat useful we
-- can do with these blocks, which is to treat them as RawBlocks. That matches
-- the intent reasonably well, and it gets weirder if we parse a nested
-- structure. It seems unlikely this would be encountered in the wild.
-- | The name of a @=begin@ / @=for@ region: one or more alphanumerics,
-- hyphens, or underscores.
regionIdentifier :: PandocMonad m => PodParser m T.Text
regionIdentifier = many1Char $ alphaNum <|> oneOf ['-', '_']
-- | @=for@: a single-paragraph region, either a parsed div (@:name@) or raw
-- data for a named format.
for :: PandocMonad m => PodParser m Blocks
for = string "for" *> many1 spaceChar *> (forDiv <|> forData)
-- | @=for :name text@: the paragraph is parsed as Pod and wrapped in a div
-- carrying @name@ as its class.
forDiv :: PandocMonad m => PodParser m Blocks
forDiv = do
  cls <- char ':' *> regionIdentifier <* many1 spaceChar
  B.divWith (mempty, [cls], mempty) <$> paragraph
-- | @=for fmt text@: the rest of the paragraph becomes a raw block for the
-- format @fmt@.
forData :: PandocMonad m => PodParser m Blocks
forData = do
  fmt <- regionIdentifier
  firstLine <- anyLineNewline
  rest <- many nonEmptyLine
  optional blanklines
  return $ B.rawBlock fmt (mconcat (firstLine : rest))
-- | @=begin@: a multi-paragraph region, either a parsed div (@:name@) or raw
-- data for a named format.
begin :: PandocMonad m => PodParser m Blocks
begin = cmd "begin" *> (beginDiv <|> beginData)
-- | @=begin :name@ .. @=end :name@: the enclosed blocks are parsed as Pod and
-- wrapped in a div carrying @name@ as its class.
beginDiv :: PandocMonad m => PodParser m Blocks
beginDiv = do
  cls <- char ':' *> regionIdentifier
  _ <- anyLine -- "parameters" may appear in this position
  _ <- blanklines
  body <- mconcat <$> many block
  _ <- textStr ("=end :" <> cls) <* blanklines
  return (B.divWith (mempty, [cls], mempty) body)
-- | @=begin fmt@ .. @=end fmt@: the enclosed lines are collected verbatim
-- into a raw block for the format @fmt@.  The rest of the @=begin@ line is
-- discarded.
--
-- A @=cut@ .. @=pod@ pair inside the region is skipped and contributes
-- nothing to the raw text.
beginData :: PandocMonad m => PodParser m Blocks
beginData = do
  fmt <- regionIdentifier
  _ <- anyLine
  _ <- blanklines
  lns <- mconcat <$> many (try rawCut <|> rawLine)
  _ <- textStr ("=end " <> fmt) <* blanklines
  return $ B.rawBlock fmt lns
  where
    rawCut = do
      _ <- char '=' *> cut
      -- 'cut' stops in front of the next command line without consuming its
      -- '=', while 'pod' (like every parser dispatched from 'command')
      -- starts at the command name.  The '=' must therefore be consumed here
      -- or the closing =pod can never match.
      _ <- try (char '=' *> pod) <?> "=pod to close =cut within =begin/=end"
      return mempty
    -- Any line that does not look like a Pod command is kept as-is.
    rawLine = do
      try (notFollowedBy (char '=' *> letter))
      anyLineNewline
|