aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Chunks.hs326
2 files changed, 327 insertions, 0 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index 8b56a77c3..f07e112f9 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -635,6 +635,7 @@ library
Text.Pandoc.Class,
Text.Pandoc.Class.IO,
Text.Pandoc.Citeproc,
+ Text.Pandoc.Chunks,
Text.Pandoc.Version
other-modules: Text.Pandoc.App.CommandLineOptions,
Text.Pandoc.App.FormatHeuristics,
diff --git a/src/Text/Pandoc/Chunks.hs b/src/Text/Pandoc/Chunks.hs
new file mode 100644
index 000000000..9be710c63
--- /dev/null
+++ b/src/Text/Pandoc/Chunks.hs
@@ -0,0 +1,326 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE DeriveGeneric #-}
+{- |
+ Module : Text.Pandoc.Chunks
+ Copyright : Copyright (C) 2022 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Functions and types for splitting a Pandoc into subdocuments,
+e.g. for conversion into a set of HTML pages.
+-}
+module Text.Pandoc.Chunks
+ ( Chunk(..)
+ , ChunkedDoc(..)
+ , PathTemplate(..)
+ , splitIntoChunks
+ ) where
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared (makeSections, stringify)
+import Text.Pandoc.Walk (Walkable(..))
+import Data.Text (Text)
+import Text.Printf (printf)
+import Data.Maybe (fromMaybe)
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Data.List (find)
+import Data.String (IsString)
+import GHC.Generics (Generic)
+import Text.HTML.TagSoup (Tag (TagOpen), fromAttrib, parseTags)
+import Text.Pandoc.Writers.Shared (toTOCTree, SecInfo(..))
+import Data.Tree (Tree)
+
+-- | Split 'Pandoc' into 'Chunk's, e.g. for conversion into
+-- a set of HTML pages or EPUB chapters.
+splitIntoChunks :: PathTemplate -- ^ Template for filepath
+ -> Bool -- ^ Number sections
+ -> Maybe Int -- ^ Base heading level
+ -> Int -- ^ Chunk level -- level of section to split at
+ -> Pandoc
+ -> ChunkedDoc
+splitIntoChunks pathTemplate numberSections mbBaseLevel
+ chunkLevel (Pandoc meta blocks) =
+ fixInternalReferences .
+ (\chunks -> ChunkedDoc{ chunkedMeta = meta
+ , chunkedChunks = chunks
+ , chunkedTOC = toTOCTree
+ (concatMap chunkContents chunks) }) .
+ makeChunks chunkLevel pathTemplate .
+ addNavigation Nothing Nothing .
+ makeSections numberSections mbBaseLevel $ blocks
+
+-- | Fix internal references so they point to the path of the chunk.
+fixInternalReferences :: ChunkedDoc -> ChunkedDoc
+fixInternalReferences chunkedDoc =
+ walk rmNavAttrs $ walk fixInternalRefs $
+ chunkedDoc{ chunkedTOC = newTOC
+ , chunkedChunks = newChunks }
+ where
+ newTOC = fromMaybe (chunkedTOC chunkedDoc) $
+ traverse addSecPath (chunkedTOC chunkedDoc)
+
+ newChunks = map fixNav (chunkedChunks chunkedDoc)
+
+ fixNav chunk =
+ chunk{ chunkNext = chunkNext chunk >>= toNavLink
+ , chunkPrev = chunkPrev chunk >>= toNavLink
+ , chunkUp = chunkUp chunk >>= toNavLink
+ }
+
+ toNavLink id' =
+ case M.lookup id' refMap of
+ Nothing -> Just $ "#" <> id'
+ Just fp -> Just $ T.pack fp <> "#" <> id'
+
+ addSecPath :: SecInfo -> Maybe SecInfo
+ addSecPath secinfo =
+ case M.lookup (secId secinfo) refMap of
+ Nothing -> Just secinfo
+ Just fp -> Just $ secinfo{ secPath = T.pack fp }
+
+ -- Remove some attributes we added just to construct chunkNext etc.
+ rmNavAttrs :: Block -> Block
+ rmNavAttrs (Div (ident,classes,kvs) bs) =
+ Div (ident,classes,filter (not . isNavAttr) kvs) bs
+ rmNavAttrs b = b
+
+ isNavAttr :: (Text,Text) -> Bool
+ isNavAttr ("nav-prev",_) = True
+ isNavAttr ("nav-next",_) = True
+ isNavAttr ("nav-up",_) = True
+ isNavAttr ("nav-path",_) = True
+ isNavAttr _ = False
+
+ fixInternalRefs :: Inline -> Inline
+ fixInternalRefs il@(Link attr ils (src,tit))
+ = case T.uncons src of
+ Just ('#', ident) -> Link attr ils (src', tit)
+ where src' = case M.lookup ident refMap of
+ Just fp -> T.pack fp <> src
+ Nothing -> src
+ _ -> il
+ fixInternalRefs il = il
+
+ refMap = foldr chunkToRefs mempty (chunkedChunks chunkedDoc)
+
+ chunkToRefs chunk m =
+ let idents = chunkId chunk : getIdents (chunkContents chunk)
+ in foldr (\ident -> M.insert ident (chunkPath chunk)) m idents
+
+ getIdents bs = query getBlockIdent bs ++ query getInlineIdent bs
+
+ getBlockIdent :: Block -> [Text]
+ getBlockIdent (Div (ident, _, _) _)
+ | not (T.null ident) = [ident]
+ getBlockIdent (Header _ (ident, _, _) _)
+ | not (T.null ident) = [ident]
+ getBlockIdent (Table (ident,_,_) _ _ _ _ _)
+ | not (T.null ident) = [ident]
+ getBlockIdent (RawBlock fmt raw)
+ | isHtmlFormat fmt
+ = foldr (\tag ->
+ case tag of
+ TagOpen{} ->
+ case fromAttrib "id" tag of
+ "" -> id
+ x -> (x:)
+ _ -> id)
+ [] (parseTags raw)
+ getBlockIdent _ = []
+
+ getInlineIdent :: Inline -> [Text]
+ getInlineIdent (Span (ident, _, _) _)
+ | not (T.null ident) = [ident]
+ getInlineIdent (Link (ident, _, _) _ _)
+ | not (T.null ident) = [ident]
+ getInlineIdent (Image (ident, _, _) _ _)
+ | not (T.null ident) = [ident]
+ getInlineIdent (RawInline fmt raw)
+ | isHtmlFormat fmt
+ = foldr (\tag ->
+ case tag of
+ TagOpen{} ->
+ case fromAttrib "id" tag of
+ "" -> id
+ x -> (x:)
+ _ -> id)
+ [] (parseTags raw)
+ getInlineIdent _ = []
+
+ isHtmlFormat :: Format -> Bool
+ isHtmlFormat (Format "html") = True
+ isHtmlFormat (Format "html4") = True
+ isHtmlFormat (Format "html5") = True
+ isHtmlFormat _ = False
+
+
+makeChunks :: Int -> PathTemplate -> [Block] -> [Chunk]
+makeChunks chunkLevel pathTemplate = secsToChunks 1
+ where
+ isChunkHeader :: Block -> Bool
+ isChunkHeader (Div (_,"section":_,_) (Header n _ _:_)) = n <= chunkLevel
+ isChunkHeader _ = False
+
+ secsToChunks :: Int -> [Block] -> [Chunk]
+ secsToChunks chunknum bs =
+ case break isChunkHeader bs of
+ ([], []) -> []
+ ([], (d@(Div attr@(_,"section":_,_) (h@(Header lvl _ _) : bs')) : rest))
+ | chunkLevel == lvl ->
+ -- If the header is of the same level as chunks, create a chunk
+ toChunk chunknum d :
+ secsToChunks (chunknum + 1) rest
+ | chunkLevel > lvl ->
+ case break isChunkHeader bs' of
+ (xs, ys) -> toChunk chunknum (Div attr (h:xs)) :
+ secsToChunks (chunknum + 1) (ys ++ rest)
+ (xs, ys) -> toChunk chunknum
+ (Div ("",["preamble"],[]) xs) :
+ secsToChunks (chunknum + 1) ys
+
+ toChunk :: Int -> Block -> Chunk
+ toChunk chunknum
+ (Div (divid,"section":classes,kvs) (h@(Header _ _ ils) : bs)) =
+ Chunk
+ { chunkHeading = ils
+ , chunkId = divid
+ , chunkNumber = chunknum
+ , chunkPath = chunkpath
+ , chunkUp = lookup "nav-up" kvs
+ , chunkPrev = lookup "nav-prev" kvs
+ , chunkNext = lookup "nav-next" kvs
+ , chunkContents =
+ [Div (divid,"section":classes,kvs') (h : bs)]
+ }
+ where kvs' = kvs ++ [("nav-path", T.pack chunkpath)]
+ chunkpath = resolvePathTemplate pathTemplate chunknum
+ (stringify ils)
+ divid
+ (fromMaybe "" (lookup "number" kvs))
+ toChunk chunknum (Div ("",["preamble"],[]) bs) =
+ Chunk
+ { chunkHeading = []
+ , chunkId = ""
+ , chunkNumber = chunknum
+ , chunkPath = resolvePathTemplate pathTemplate chunknum
+ "" "" ""
+ , chunkUp = Nothing
+ , chunkPrev = Nothing
+ , chunkNext = Nothing
+ , chunkContents = bs
+ }
+ toChunk _ b = error $ "toChunk called on inappropriate block " <> show b
+ -- should not happen
+
+-- | Add nav-up, nav-prev, nav-next attributes to each section Div
+-- in a document.
+addNavigation :: Maybe Text -> Maybe Text -> [Block] -> [Block]
+addNavigation mbUpId mbPrevId (Div (ident, "section":classes, kvs) bs : xs) =
+ Div (ident, "section":classes, kvs ++ navattrs) bs' :
+ addNavigation mbUpId (Just ident) xs
+ where
+ bs' = addNavigation (Just ident) Nothing bs
+ navattrs = maybe [] (\x -> [("nav-up", x)]) mbUpId
+ ++ maybe [] (\x -> [("nav-prev", x)]) mbPrevId
+ ++ maybe [] (\x -> [("nav-next", x)]) mbNextId
+ mbNextId = find isSectionDiv bs >>= extractId
+ isSectionDiv (Div (_,"section":_,_) _) = True
+ isSectionDiv _ = False
+ extractId (Div (id',_,_) _) = Just id'
+ extractId _ = Nothing
+addNavigation mbUpId mbPrevId (x:xs) = x : addNavigation mbUpId mbPrevId xs
+addNavigation _ _ [] = []
+
+resolvePathTemplate :: PathTemplate
+ -> Int -- ^ Chunk number
+ -> Text -- ^ Stringified heading text
+ -> Text -- ^ Section identifier
+ -> Text -- ^ Section number
+ -> FilePath
+resolvePathTemplate (PathTemplate templ) chunknum headingText ident secnum =
+ T.unpack .
+ T.replace "%n" (T.pack $ printf "%03d" chunknum) .
+ T.replace "%s" secnum .
+ T.replace "%h" headingText .
+ T.replace "%i" ident $
+ templ
+
+-- | A 'PathTemplate' is a FilePath in which certain codes
+-- will be substituted with information from a 'Chunk'.
+-- @%n@ will be replaced with the chunk number
+-- (padded with leading 0s to 3 digits),
+-- @%s@ with the section number of the heading,
+-- @%h@ with the (stringified) heading text,
+-- @%i@ with the section identifier.
+-- For example, @"section-%s-%i.html"@ might be resolved to
+-- @"section-1.2-introduction.html"@.
+newtype PathTemplate =
+ PathTemplate { unPathTemplate :: Text }
+ deriving (Show, IsString)
+
+-- | A part of a document (typically a chapter or section, or
+-- the part of a section before its subsections).
+data Chunk =
+ Chunk
+ { chunkHeading :: [Inline]
+ , chunkId :: Text
+ , chunkNumber :: Int
+ , chunkPath :: FilePath
+ , chunkUp :: Maybe Text
+ , chunkPrev :: Maybe Text
+ , chunkNext :: Maybe Text
+ , chunkContents :: [Block]
+ }
+ deriving (Show, Read, Eq, Ord, Generic)
+
+instance Walkable Inline Chunk where
+ query f chunk = query f (chunkContents chunk)
+ walk f chunk = chunk{ chunkContents = walk f (chunkContents chunk) }
+ walkM f chunk = do
+ contents <- walkM f (chunkContents chunk)
+ return chunk{ chunkContents = contents }
+
+instance Walkable Block Chunk where
+ query f chunk = query f (chunkContents chunk)
+ walk f chunk = chunk{ chunkContents = walk f (chunkContents chunk) }
+ walkM f chunk = do
+ contents <- walkM f (chunkContents chunk)
+ return chunk{ chunkContents = contents }
+
+-- | A 'Pandoc' broken into 'Chunk's for writing to separate files.
+data ChunkedDoc =
+ ChunkedDoc
+ { chunkedMeta :: Meta
+ , chunkedTOC :: Tree SecInfo
+ , chunkedChunks :: [Chunk]
+ }
+
+instance Walkable Inline ChunkedDoc where
+ query f doc = query f (chunkedChunks doc) <> query f (chunkedMeta doc)
+ walk f doc = doc{ chunkedMeta = walk f (chunkedMeta doc)
+ , chunkedChunks = walk f (chunkedChunks doc)
+ }
+ walkM f doc = do
+ meta' <- walkM f (chunkedMeta doc)
+ chunks' <- walkM f (chunkedChunks doc)
+ return $ doc{ chunkedMeta = meta'
+ , chunkedChunks = chunks' }
+
+instance Walkable Block ChunkedDoc where
+ query f doc = query f (chunkedChunks doc) <> query f (chunkedMeta doc)
+ walk f doc = doc{ chunkedMeta = walk f (chunkedMeta doc)
+ , chunkedChunks = walk f (chunkedChunks doc)
+ }
+ walkM f doc = do
+ meta' <- walkM f (chunkedMeta doc)
+ chunks' <- walkM f (chunkedChunks doc)
+ return $ doc{ chunkedMeta = meta'
+ , chunkedChunks = chunks' }
+