src/Text/Pandoc/Readers/Mdoc/Lex.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}
{- |
   Module      : Text.Pandoc.Readers.Mdoc.Lex
   Copyright   : Copyright (C) 2024 Evan Silberman
   License     : GNU GPL, version 2 or above

   Maintainer  : Evan Silberman <[email protected]>
   Stability   : WIP
   Portability : portable

Tokenizer for mdoc
-}
module Text.Pandoc.Readers.Mdoc.Lex
  ( MdocToken(..)
  , MdocTokens(..)
  , DelimSide(..)
  , lexMdoc
  , toString
  )
where

import Control.Monad (void, guard, when)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class.PandocMonad (PandocMonad(..))
import Data.Char (isAlphaNum)
import Data.Maybe (isJust)
import qualified Data.Text as T
import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.Roff.Escape
import Text.Pandoc.Readers.Mdoc.Macros
import qualified Data.Sequence as Seq

-- As a higher level language with a wealth of semantic macros, mdoc
-- discourages authors from falling back to low-level roff features like font
-- selection, custom macros, defined strings, etc. Pandoc's mdoc reader is
-- accordingly implemented as a high-level interpreter of mdoc's semantic macros
-- and almost no raw roff requests are supported.
--
-- tbl(7) and eqn(7) macros are rare but not completely unseen in mdoc manuals.
-- they are not yet implemented. most use of tbl macros in mdoc could probably
-- be replaced with .Bl -column

data DelimSide = Open | Middle | Close deriving (Show, Eq)

-- | Tokens for Mdoc documents
data MdocToken = Str T.Text SourcePos -- ^ The contents of a text line
               | Macro T.Text SourcePos  -- ^ A macro to be processed
               | Lit T.Text SourcePos  -- ^ Literal text on a control line
               | Blank SourcePos  -- ^ A blank line
               | Delim DelimSide T.Text SourcePos  -- ^ A delimiter character
               | Eol  -- ^ The end of a control line
               deriving Show

toString :: MdocToken -> T.Text
toString (Str x _) = x
toString (Macro x _) = x
toString (Lit x _) = x
toString (Delim _ x _) = x
toString Blank{} = mempty
toString Eol = mempty

newtype MdocTokens = MdocTokens { unMdocTokens :: Seq.Seq MdocToken }
        deriving (Show, Semigroup, Monoid)

singleTok :: MdocToken -> MdocTokens
singleTok t = MdocTokens (Seq.singleton t)

type Lexer m = ParsecT Sources () m

instance RoffLikeLexer MdocTokens where
  -- This is a bit confusing. We're lexing to MdocTokens, but for escaping
  -- purposes we just want Texts.
  type Token MdocTokens = T.Text
  -- We don't need a state
  type State MdocTokens = ()
  -- We don't support predefined string expansion
  expandString = return ()
  escString = return mempty
  -- what token type the unescaped text gets wrapped in is decided by other
  -- parts of the lexer.
  emit = id
  -- All escapes are resolved in the lexer and we never need to emit anything,
  -- vs. the roff lexer which has to push the backlashes to the output while
  -- in copy mode.
  backslash = (mempty <* char '\\') <|> (mempty <* string "\\E")
  -- We don't support macro definition and we don't output anything for \A
  checkDefined = const mempty
  -- We don't support copy mode and \E is treated as backslash
  escE = return mempty
  -- We don't support low-level font selection
  escFont = escIgnore 'f' [escapeArg, countChar 1 (satisfy (/='\n'))]

eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m MdocToken
eofline = do
  void newline <|> eof
  return Eol

lexComment :: PandocMonad m => Lexer m MdocTokens
lexComment = do
  try $ string ".\\\""
  skipMany $ noneOf "\n"
  eofline
  return mempty

argText :: PandocMonad m => Lexer m T.Text
argText = do
  beg <- escape <|> regularText
  end <- mconcat <$> many (escape <|> regularText <|> quoteChar)
  return $ beg <> end

spaceTabChar :: PandocMonad m => Lexer m T.Text
spaceTabChar = T.singleton <$> spaceChar

quotedArg :: PandocMonad m => Lexer m T.Text
quotedArg = do
  quoteChar
  t <- mconcat <$> many (try innerQuote <|> escape <|> regularText <|> spaceTabChar)
  quoteChar
  notFollowedBy quoteChar
  return t
  where
    innerQuote = do
      string "\"\""
      return "\""

anyText :: PandocMonad m => Lexer m T.Text
anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar

regularText :: PandocMonad m => Lexer m T.Text
regularText = many1Char $ noneOf "\n\r\t \\\""

quoteChar :: PandocMonad m => Lexer m T.Text
quoteChar = T.singleton <$> char '"'

mdocToken :: PandocMonad m => Lexer m MdocTokens
mdocToken = lexComment <|> lexControlLine <|> lexTextLine

lexMacroName :: PandocMonad m => Lexer m T.Text
lexMacroName = many1Char (satisfy isMacroChar)
  where
    isMacroChar '%' = True
    isMacroChar x = isAlphaNum x

lexMacro :: PandocMonad m => Lexer m MdocToken
lexMacro = do
  pos <- getPosition
  name <- lexMacroName
  eof <|> void (lookAhead (spaceChar <|> newline))
  skipSpaces
  return $ Macro name pos

lexCallableMacro :: PandocMonad m => Lexer m MdocToken
lexCallableMacro = do
  pos <- getPosition
  q <- optionMaybe quoteChar
  name <- lexMacroName
  when (isJust q) (void quoteChar)
  eof <|> void (lookAhead (spaceChar <|> newline))
  skipSpaces
  guard $ isCallableMacro name
  return $ Macro name pos

lexDelim :: (PandocMonad m) => Lexer m MdocToken
lexDelim = do
  pos <- getPosition
  q <- optionMaybe quoteChar
  t <-
    Delim Open <$> oneOfStrings ["(", "["]
      <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
      <|> Delim Middle <$> textStr "|"
  when (isJust q) (void quoteChar)
  eof <|> void (lookAhead (spaceChar <|> newline))
  skipSpaces
  return $ t pos

lexLit :: PandocMonad m => Lexer m MdocToken
lexLit = do
  pos <- getPosition
  t <- argText <|> quotedArg
  skipSpaces
  return $ Lit t pos

lexTextLine :: PandocMonad m => Lexer m MdocTokens
lexTextLine = do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  t <- mconcat <$> many anyText
  eofline
  if T.null $ T.strip t
     then return $ singleTok $ Blank pos
     else return $ singleTok $ Str t pos

lexControlLine :: PandocMonad m => Lexer m MdocTokens
lexControlLine = do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  char '.'
  eofline *> mempty <|> do
    m@(Macro name _) <- lexMacro
    -- .Ns macros at the start of a line are ignored. We'd have to look behind
    -- to keep track of the "start of the line" in the parser, so we'll drop
    -- those macros in lexing.
    let start | name == "Ns" = []
              | otherwise = [m]
    let parsed = isParsedMacro name
    (wds, e) <- manyUntil (l parsed) eofline
    return $ MdocTokens $ Seq.fromList $ start <> wds <> [e]
      where
        l True = try lexDelim <|> try lexCallableMacro <|> lexLit
        l False = try lexDelim <|> lexLit

-- | Tokenize a string as a sequence of mdoc tokens.
lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens
lexMdoc pos txt = do
  eithertokens <- readWithM (do setPosition pos
                                mconcat <$> manyTill mdocToken eof) def txt
  case eithertokens of
    Left e       -> throwError e
    Right tokenz -> return tokenz