src/Text/Pandoc/Transforms.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE StrictData          #-}
{- |
   Module      : Text.Pandoc.Transforms
   Copyright   : © 2006-2024 John MacFarlane
   License     : GPL-2.0-or-later
   Maintainer  : John MacFarlane <jgm@berkeley@edu>

Transformation of a Pandoc document post-parsing
-}
module Text.Pandoc.Transforms
  ( Transform
  , applyTransforms
  , adjustLinksAndIds
  , eastAsianLineBreakFilter
  , filterIpynbOutput
  , headerShift
  ) where

import Data.List (sortOn)
import Data.Text (Text)
import Network.URI (unEscapeString)
import Text.DocLayout (charWidth)
import Text.Pandoc.Definition
  ( Pandoc (..), Attr, Block (..), Format (..), Inline (..) )
import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.Options (Extensions)
import Text.Pandoc.Shared (stringify, textToIdentifier)
import Text.Pandoc.Walk (walk)
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B

-- | Transformation of a Pandoc document post-parsing
type Transform = Pandoc -> Pandoc

-- | Apply a list of transforms to a document, in order.
applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
applyTransforms transforms d = return $ foldr ($) d transforms

-- | Prefixes identifiers with a string derived from the filepath of
-- @thisfile@; fixes links to targets in @allfiles@ accordingly.
adjustLinksAndIds :: Extensions        -- ^ defines how IDs are generated
                  -> Text              -- ^ thisfile
                  -> [Text]            -- ^ allfiles
                  -> Transform
adjustLinksAndIds exts thisfile allfiles
  | length allfiles > 1 = walk fixInline . walk fixBlock
  | otherwise           = id
 where
  -- fix ids in blocks
  fixBlock :: Block -> Block
  fixBlock (CodeBlock attr t) = CodeBlock (fixAttrs attr) t
  fixBlock (Header lev attr ils) = Header lev (fixAttrs attr) ils
  fixBlock (Table attr cap cols th tbs tf) =
     Table (fixAttrs attr) cap cols th tbs tf
  fixBlock (Div attr bs) = Div (fixAttrs attr) bs
  fixBlock x = x

  -- fix ids and links in inlines
  fixInline :: Inline -> Inline
  fixInline (Code attr t) = Code (fixAttrs attr) t
  fixInline (Link attr ils (url,tit)) =
    Link (fixAttrs attr) ils (fixURL url,tit)
  fixInline (Image attr ils (url,tit)) =
    Image (fixAttrs attr) ils (fixURL url,tit)
  fixInline (Span attr ils) = Span (fixAttrs attr) ils
  fixInline x = x

  -- add thisfile as prefix of identifier
  fixAttrs :: Attr -> Attr
  fixAttrs (i,cs,kvs)
    | T.null i = (i,cs,kvs)
    | otherwise =
        (T.intercalate "__"
          (filter (not . T.null) [toIdent thisfile, i]),
        cs, kvs)

  -- turns a filepath into an identifier
  toIdent :: Text -> Text
  toIdent = textToIdentifier exts . T.intercalate "__" .
            T.split (\c -> c == '/' || c == '\\')

  -- if URL begins with file from allfiles, convert to
  -- an internal link with the appropriate identifier
  fixURL :: Text -> Text
  fixURL u =
    let (a,b) = T.break (== '#') $ T.pack . unEscapeString . T.unpack $ u
        filepart = if T.null a
                      then toIdent thisfile
                      else toIdent a
        fragpart = T.dropWhile (== '#') b
     in if T.null a || a `elem` allfiles
           then "#" <> T.intercalate "__"
                         (filter (not . T.null) [filepart, fragpart])
           else u

-- | Process ipynb output cells.  If mode is Nothing,
-- remove all output.  If mode is Just format, select
-- best output for the format.  If format is not ipynb,
-- strip out ANSI escape sequences from CodeBlocks (see #5633).
filterIpynbOutput :: Maybe Format -> Pandoc -> Pandoc
filterIpynbOutput mode = walk go
  where go (Div (ident, "output":os, kvs) bs) =
          case mode of
            Nothing  -> Div (ident, "output":os, kvs) []
            -- "best" for ipynb includes all formats:
            Just fmt
              | fmt == Format "ipynb"
                          -> Div (ident, "output":os, kvs) bs
              | otherwise -> Div (ident, "output":os, kvs) $
                              walk removeANSI $
                              take 1 $ sortOn rank bs
                 where
                  rank (RawBlock (Format "html") _)
                    | fmt == Format "html" = 1 :: Int
                    | fmt == Format "markdown" = 3
                    | otherwise = 4
                  rank (RawBlock (Format "latex") _)
                    | fmt == Format "latex" = 1
                    | fmt == Format "markdown" = 3
                    | otherwise = 4
                  rank (RawBlock f _)
                    | fmt == f = 1
                    | otherwise = 4
                  rank (Para [Image{}]) = 2
                  rank _ = 3
                  removeANSI (CodeBlock attr code) =
                    CodeBlock attr (removeANSIEscapes code)
                  removeANSI x = x
                  removeANSIEscapes t
                    | Just cs <- T.stripPrefix "\x1b[" t =
                        removeANSIEscapes $ T.drop 1 $ T.dropWhile (/='m') cs
                    | Just (c, cs) <- T.uncons t = T.cons c $ removeANSIEscapes cs
                    | otherwise = ""
        go x = x

-- | Remove soft breaks between East Asian characters.
eastAsianLineBreakFilter :: Pandoc -> Pandoc
eastAsianLineBreakFilter = bottomUp go
  where go (x:SoftBreak:y:zs)
          | Just (_, b) <- T.unsnoc $ stringify x
          , Just (c, _) <- T.uncons $ stringify y
          , charWidth b == 2
          , charWidth c == 2
          = x:y:zs
          | otherwise
          = x:SoftBreak:y:zs
        go xs
          = xs

-- | Shift header levels up or down.
headerShift :: Int -> Pandoc -> Pandoc
headerShift n (Pandoc meta (Header m _ ils : bs))
  | n < 0
  , m + n == 0 = headerShift n $
                 B.setTitle (B.fromList ils) $ Pandoc meta bs
headerShift n (Pandoc meta bs) = Pandoc meta (walk shift bs)

 where
   shift :: Block -> Block
   shift (Header level attr inner)
     | level + n > 0  = Header (level + n) attr inner
     | otherwise      = Para inner
   shift x            = x