Add ChunkedHTML writer.

- Add module Text.Pandoc.Writers.ChunkedHTML, exporting writeChunkedHtml [API change]. - Revised API for Text.Pandoc.Chunks. `chunkNext`, `chunkPrev`, `chunkUp` are now just `Maybe Chunk`. - Fix assignment of navigation elements of Chunks. - Strip off anchor portion of next and prev links. - Derive Show, Eq, Ord, Generic for ChunkDoc. - Add `chunkSectionNumber`, `chunkUnlisted`. - Automatically unwrap the zip to a directory if an extensionless output file specified. - Incorporate images with relative paths below working dir.
author: John MacFarlane <[email protected]> 2023-01-05 21:52:44 -0800
committer: John MacFarlane <[email protected]> 2023-01-10 21:48:38 -0800
commit: 1fdc27a2c0f723074c87ba4674575bc30da974f2 (patch)
tree: 8365affedb3445b474479de6ff6190be6ba2b54c
parent: b55cd83a609bcb01f8472a66ab0ddca985c881e4 (diff)
8 files changed, 433 insertions, 95 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index 330faff37..7329073b4 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -287,6 +287,7 @@ header when requesting a document from a URL:
     - `beamer` ([LaTeX beamer][`beamer`] slide show)
     - `bibtex` ([BibTeX] bibliography)
     - `biblatex` ([BibLaTeX] bibliography)
+    - `chunkedhtml` (zip archive of multiple linked HTML files)
     - `commonmark` ([CommonMark] Markdown)
     - `commonmark_x` ([CommonMark] Markdown with extensions)
     - `context` ([ConTeXt])
@@ -358,7 +359,12 @@ header when requesting a document from a URL:
 
 :   Write output to *FILE* instead of *stdout*.  If *FILE* is
     `-`, output will go to *stdout*, even if a non-textual format
-    (`docx`, `odt`, `epub2`, `epub3`) is specified.
+    (`docx`, `odt`, `epub2`, `epub3`) is specified.  If the
+    output format is `chunkedhtml` and *FILE* has no extension,
+    then instead of producing a `.zip` file pandoc will create
+    a directory *FILE* and unpack the zip archive there
+    (unless *FILE* already exists, in which case an error
+    will be raised).
 
 `--data-dir=`*DIRECTORY*
 
@@ -6822,6 +6828,27 @@ styling of pandoc's default HTML templates is desired (and
 in that case the variables defined in [Variables for HTML] may
 be used to fine-tune the style).
 
+# Chunked HTML
+
+`pandoc -t chunkedhtml` will produce a zip archive of linked
+HTML files, one for each section of the original document.
+Internal links will automatically be adjusted to point to
+the right place, images linked to under the working directory
+will be incorporated, and navigation links will be added.
+In addition, a JSON file `sitemap.json` will be included
+describing the hierarchical structure of the files.
+
+If an output file without an extension is specified, then
+it will be interpreted as a directory and the zip archive
+will be automatically unpacked into it (unless it already
+exists, in which case an error will be raised).  Otherwise
+a `.zip` file will be produced.
+
+The navigation links can be customized by adjusting the
+template.  By default, a table of contents is included only
+on the top page. To include it on every page, set the
+`toc` variable manually.
+
 # Jupyter notebooks
 
 When creating a [Jupyter notebook], pandoc will try to infer the
diff --git a/data/templates/default.chunkedhtml b/data/templates/default.chunkedhtml
new file mode 100644
index 000000000..7101c9587
--- /dev/null
+++ b/data/templates/default.chunkedhtml
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="$lang$" xml:lang="$lang$"$if(dir)$ dir="$dir$"$endif$>
+<head>
+  <meta charset="utf-8" />
+  <meta name="generator" content="pandoc" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
+$for(author-meta)$
+  <meta name="author" content="$author-meta$" />
+$endfor$
+$if(date-meta)$
+  <meta name="dcterms.date" content="$date-meta$" />
+$endif$
+$if(keywords)$
+  <meta name="keywords" content="$for(keywords)$$keywords$$sep$, $endfor$" />
+$endif$
+$if(description-meta)$
+  <meta name="description" content="$description-meta$" />
+$endif$
+  <title>$if(title-prefix)$$title-prefix$ – $endif$$pagetitle$</title>
+  <style>
+    div.sitenav { display: flex; flex-direction: row; flex-wrap: wrap; }
+    span.navlink { flex: 1; }
+    span.navlink-label { display: inline-block; min-width: 4em; }
+    $styles.html()$
+  </style>
+$for(css)$
+  <link rel="stylesheet" href="$css$" />
+$endfor$
+$for(header-includes)$
+  $header-includes$
+$endfor$
+$if(math)$
+  $math$
+$endif$
+</head>
+<body>
+$for(include-before)$
+$include-before$
+$endfor$
+<nav id="sitenav">
+<div class="sitenav">
+<span class="navlink">
+$if(up.url)$
+<span class="navlink-label">Up:</span> <a href="$up.url$" accesskey="u" rel="up">$up.title$</a>
+$endif$
+</span>
+<span class="navlink">
+$if(top)$
+<span class="navlink-label">Top:</span> <a href="$top.url$" accesskey="t" rel="top">$top.title$</a>
+$endif$
+</span>
+</div>
+<div class="sitenav">
+<span class="navlink">
+$if(next.url)$
+<span class="navlink-label">Next:</span> <a href="$next.url$" accesskey="n" rel="next">$next.title$</a>
+$endif$
+</span>
+<span class="navlink">
+$if(previous.url)$
+<span class="navlink-label">Previous:</span> <a href="$previous.url$" accesskey="p" rel="previous">$previous.title$</a>
+$endif$
+</span>
+</div>
+</nav>
+$if(title)$
+<header id="title-block-header">
+<h1 class="title">$title$</h1>
+$if(subtitle)$
+<p class="subtitle">$subtitle$</p>
+$endif$
+$for(author)$
+<p class="author">$author$</p>
+$endfor$
+$if(date)$
+<p class="date">$date$</p>
+$endif$
+$if(abstract)$
+<div class="abstract">
+<div class="abstract-title">$abstract-title$</div>
+$abstract$
+</div>
+$endif$
+</header>
+$endif$
+$if(toc)$
+<nav id="$idprefix$TOC" role="doc-toc">
+$if(toc-title)$
+<h2 id="$idprefix$toc-title">$toc-title$</h2>
+$endif$
+$table-of-contents$
+</nav>
+$endif$
+$body$
+$for(include-after)$
+$include-after$
+$endfor$
+</body>
+</html>
diff --git a/pandoc.cabal b/pandoc.cabal
index 3d694bf3c..ea255917f 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -51,6 +51,7 @@ data-files:
                  data/templates/styles.citations.html
                  data/templates/default.html4
                  data/templates/default.html5
+                 data/templates/default.chunkedhtml
                  data/templates/default.docbook4
                  data/templates/default.docbook5
                  data/templates/default.jats_archiving
@@ -584,6 +585,7 @@ library
                    Text.Pandoc.Writers.JATS,
                    Text.Pandoc.Writers.OPML,
                    Text.Pandoc.Writers.HTML,
+                   Text.Pandoc.Writers.ChunkedHTML,
                    Text.Pandoc.Writers.Ipynb,
                    Text.Pandoc.Writers.ICML,
                    Text.Pandoc.Writers.Jira,
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index bc996f554..2d4601b68 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -40,7 +40,9 @@ import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TE
 import qualified Data.Text.Encoding.Error as TE
-import System.Directory (doesDirectoryExist)
+import System.Directory (doesDirectoryExist, createDirectory)
+import Codec.Archive.Zip (toArchiveOrFail,
+                          extractFilesFromArchive, ZipOption(..))
 import System.Exit (exitSuccess)
 import System.FilePath ( takeBaseName, takeExtension)
 import System.IO (nativeNewline, stdout)
@@ -115,6 +117,16 @@ convertWithOpts scriptingEngine opts = do
       case output of
         TextOutput t    -> writerFn eol outputFile t
         BinaryOutput bs -> writeFnBinary outputFile bs
+        ZipOutput bs
+          | null (takeExtension outputFile) -> do
+             -- create directory and unzip
+             createDirectory outputFile -- will fail if directory exists
+             let zipopts = [OptRecursive, OptDestination outputFile] ++
+                           [OptVerbose | optVerbosity opts == INFO]
+             case toArchiveOrFail bs of
+               Right archive -> extractFilesFromArchive zipopts archive
+               Left e -> E.throwIO $ PandocShouldNeverHappenError $ T.pack e
+          | otherwise -> writeFnBinary outputFile bs
 
 convertWithOpts' :: (PandocMonad m, MonadIO m, MonadMask m)
                  => ScriptingEngine
@@ -300,7 +312,9 @@ convertWithOpts' scriptingEngine istty datadir opts = do
     createPngFallbacks (writerDpi writerOptions)
 
   output <- case writer of
-    ByteStringWriter f -> BinaryOutput <$> f writerOptions doc
+    ByteStringWriter f
+      | format == "chunkedhtml" -> ZipOutput <$> f writerOptions doc
+      | otherwise -> BinaryOutput <$> f writerOptions doc
     TextWriter f -> case outputPdfProgram outputSettings of
       Just pdfProg -> do
               res <- makePDF pdfProg (optPdfEngineOpts opts) f
@@ -322,7 +336,10 @@ convertWithOpts' scriptingEngine istty datadir opts = do
   reports <- getLog
   return (output, reports)
 
-data PandocOutput = TextOutput Text | BinaryOutput BL.ByteString
+data PandocOutput =
+      TextOutput Text
+    | BinaryOutput BL.ByteString
+    | ZipOutput BL.ByteString
   deriving (Show)
 
 type Transform = Pandoc -> Pandoc
diff --git a/src/Text/Pandoc/App/OutputSettings.hs b/src/Text/Pandoc/App/OutputSettings.hs
index 8208137cd..8e41e51fb 100644
--- a/src/Text/Pandoc/App/OutputSettings.hs
+++ b/src/Text/Pandoc/App/OutputSettings.hs
@@ -106,7 +106,7 @@ optToOutputSettings scriptingEngine opts = do
   flvrd@(Format.FlavoredFormat format _extsDiff) <-
     Format.parseFlavoredFormat writerName
 
-  let standalone = optStandalone opts || not (isTextFormat format) || pdfOutput
+  let standalone = optStandalone opts || isBinaryFormat format || pdfOutput
   let templateOrThrow = \case
         Left  e -> throwError $ PandocTemplateError (T.pack e)
         Right t -> pure t
@@ -300,6 +300,6 @@ pdfWriterAndProg mWriter mEngine =
 
       isCustomWriter w = ".lua" `T.isSuffixOf` w
 
-isTextFormat :: T.Text -> Bool
-isTextFormat s =
-  s `notElem` ["odt","docx","epub2","epub3","epub","pptx","pdf"]
+isBinaryFormat :: T.Text -> Bool
+isBinaryFormat s =
+  s `elem` ["odt","docx","epub2","epub3","epub","pptx","pdf","chunkedhtml"]
diff --git a/src/Text/Pandoc/Chunks.hs b/src/Text/Pandoc/Chunks.hs
index db8ef531e..3b61f3041 100644
--- a/src/Text/Pandoc/Chunks.hs
+++ b/src/Text/Pandoc/Chunks.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE FlexibleContexts #-}
@@ -25,18 +26,18 @@ module Text.Pandoc.Chunks
   , SecInfo(..)
   ) where
 import Text.Pandoc.Definition
-import Text.Pandoc.Shared (makeSections, stringify)
+import Text.Pandoc.Shared (makeSections, stringify, inlineListToIdentifier)
 import Text.Pandoc.Walk (Walkable(..))
 import Data.Text (Text)
 import Text.Printf (printf)
 import Data.Maybe (fromMaybe, isNothing)
 import qualified Data.Map as M
 import qualified Data.Text as T
-import Data.List (find)
 import Data.String (IsString)
 import GHC.Generics (Generic)
 import Text.HTML.TagSoup (Tag (TagOpen), fromAttrib, parseTags)
-import Data.Tree (Tree(..))
+import Data.Tree (Tree(..), unfoldForest, unfoldTree)
+import Data.List (tails)
 
 -- | Split 'Pandoc' into 'Chunk's, e.g. for conversion into
 -- a set of HTML pages or EPUB chapters.
@@ -47,64 +48,51 @@ splitIntoChunks :: PathTemplate -- ^ Template for filepath
                 -> Pandoc
                 -> ChunkedDoc
 splitIntoChunks pathTemplate numberSections mbBaseLevel
-                chunkLevel (Pandoc meta blocks) =
+                chunklev (Pandoc meta blocks) =
+   addNav .
    fixInternalReferences .
+   walk rmNavAttrs .
    (\chunks -> ChunkedDoc{ chunkedMeta = meta
                          , chunkedChunks = chunks
-                         , chunkedTOC = toTOCTree
-                              (concatMap chunkContents chunks) }) .
-   makeChunks chunkLevel pathTemplate .
-   addNavigation Nothing Nothing .
+                         , chunkedTOC = toTOCTree' chunks }) .
+   makeChunks chunklev pathTemplate meta .
    makeSections numberSections mbBaseLevel $ blocks
 
+-- | Add chunkNext, chunkPrev, chunkUp
+addNav :: ChunkedDoc -> ChunkedDoc
+addNav chunkedDoc =
+  chunkedDoc{ chunkedChunks =
+     addNext . addPrev . addUp $ chunkedChunks chunkedDoc }
+
+addUp :: [Chunk] -> [Chunk]
+addUp (c : d : ds)
+  | chunkLevel c < chunkLevel d
+    = c : addUp (d{ chunkUp = Just c } : ds)
+  | chunkLevel c == chunkLevel d
+    = c : addUp (d{ chunkUp = chunkUp c} : ds)
+addUp (c:cs) = c : addUp cs
+addUp [] = []
+
+addNext :: [Chunk] -> [Chunk]
+addNext cs = zipWith go cs (map Just (tail cs) ++ [Nothing])
+ where
+  go c nxt = c{ chunkNext = nxt }
+
+addPrev :: [Chunk] -> [Chunk]
+addPrev cs = zipWith go cs (Nothing : map Just cs)
+ where
+  go c prev = c{ chunkPrev = prev }
+
 -- | Fix internal references so they point to the path of the chunk.
 fixInternalReferences :: ChunkedDoc -> ChunkedDoc
-fixInternalReferences chunkedDoc =
-  walk rmNavAttrs $ walk fixInternalRefs $
-    chunkedDoc{ chunkedTOC = newTOC
-              , chunkedChunks = newChunks }
+fixInternalReferences chunkedDoc = walk fixInternalRefs chunkedDoc
  where
-  newTOC = fromMaybe (chunkedTOC chunkedDoc) $
-             traverse addSecPath (chunkedTOC chunkedDoc)
-
-  newChunks = map fixNav (chunkedChunks chunkedDoc)
-
-  fixNav chunk =
-    chunk{ chunkNext = chunkNext chunk >>= toNavLink
-         , chunkPrev = chunkPrev chunk >>= toNavLink
-         , chunkUp = chunkUp chunk >>= toNavLink
-         }
-
-  toNavLink id' =
-    case M.lookup id' refMap of
-      Nothing -> Just $ "#" <> id'
-      Just fp -> Just $ T.pack fp <> "#" <> id'
-
-  addSecPath :: SecInfo -> Maybe SecInfo
-  addSecPath secinfo =
-    case M.lookup (secId secinfo) refMap of
-      Nothing -> Just secinfo
-      Just fp -> Just $ secinfo{ secPath = T.pack fp }
-
-  -- Remove some attributes we added just to construct chunkNext etc.
-  rmNavAttrs :: Block -> Block
-  rmNavAttrs (Div (ident,classes,kvs) bs) =
-    Div (ident,classes,filter (not . isNavAttr) kvs) bs
-  rmNavAttrs b = b
-
-  isNavAttr :: (Text,Text) -> Bool
-  isNavAttr ("nav-prev",_) = True
-  isNavAttr ("nav-next",_) = True
-  isNavAttr ("nav-up",_)  = True
-  isNavAttr ("nav-path",_)  = True
-  isNavAttr _ = False
-
   fixInternalRefs :: Inline -> Inline
   fixInternalRefs il@(Link attr ils (src,tit))
     = case T.uncons src of
         Just ('#', ident) -> Link attr ils (src', tit)
           where src' = case M.lookup ident refMap of
-                         Just fp -> T.pack fp <> src
+                         Just chunk -> T.pack (chunkPath chunk) <> src
                          Nothing -> src
         _ -> il
   fixInternalRefs il = il
@@ -113,7 +101,7 @@ fixInternalReferences chunkedDoc =
 
   chunkToRefs chunk m =
     let idents = chunkId chunk : getIdents (chunkContents chunk)
-    in  foldr (\ident -> M.insert ident (chunkPath chunk)) m idents
+    in  foldr (\ident -> M.insert ident chunk) m idents
 
   getIdents bs = query getBlockIdent bs ++ query getInlineIdent bs
 
@@ -162,11 +150,11 @@ fixInternalReferences chunkedDoc =
   isHtmlFormat _ = False
 
 
-makeChunks :: Int -> PathTemplate -> [Block] -> [Chunk]
-makeChunks chunkLevel pathTemplate = secsToChunks 1
+makeChunks :: Int -> PathTemplate -> Meta -> [Block] -> [Chunk]
+makeChunks chunklev pathTemplate meta = secsToChunks 1
  where
   isChunkHeader :: Block -> Bool
-  isChunkHeader (Div (_,"section":_,_) (Header n _ _:_)) = n <= chunkLevel
+  isChunkHeader (Div (_,"section":_,_) (Header n _ _:_)) = n <= chunklev
   isChunkHeader _ = False
 
   secsToChunks :: Int -> [Block] -> [Chunk]
@@ -174,11 +162,11 @@ makeChunks chunkLevel pathTemplate = secsToChunks 1
     case break isChunkHeader bs of
       ([], []) -> []
       ([], (d@(Div attr@(_,"section":_,_) (h@(Header lvl _ _) : bs')) : rest))
-        | chunkLevel == lvl ->
+        | chunklev == lvl ->
           -- If the header is of the same level as chunks, create a chunk
           toChunk chunknum d :
             secsToChunks (chunknum + 1) rest
-        | chunkLevel > lvl ->
+        | chunklev > lvl ->
           case break isChunkHeader bs' of
             (xs, ys) -> toChunk chunknum (Div attr (h:xs)) :
                           secsToChunks (chunknum + 1) (ys ++ rest)
@@ -188,56 +176,55 @@ makeChunks chunkLevel pathTemplate = secsToChunks 1
 
   toChunk :: Int -> Block -> Chunk
   toChunk chunknum
-    (Div (divid,"section":classes,kvs) (h@(Header _ _ ils) : bs)) =
+    (Div (divid,"section":classes,kvs) (h@(Header lvl _ ils) : bs)) =
     Chunk
       { chunkHeading = ils
       , chunkId = divid
+      , chunkLevel = lvl
       , chunkNumber = chunknum
+      , chunkSectionNumber = secnum
       , chunkPath = chunkpath
-      , chunkUp = lookup "nav-up" kvs
-      , chunkPrev = lookup "nav-prev" kvs
-      , chunkNext = lookup "nav-next" kvs
+      , chunkUp = Nothing
+      , chunkNext = Nothing
+      , chunkPrev = Nothing
+      , chunkUnlisted = "unlisted" `elem` classes
       , chunkContents =
          [Div (divid,"section":classes,kvs') (h : bs)]
       }
      where kvs' = kvs ++ [("nav-path", T.pack chunkpath)]
+           secnum = lookup "number" kvs
            chunkpath = resolvePathTemplate pathTemplate chunknum
                         (stringify ils)
                         divid
-                        (fromMaybe "" (lookup "number" kvs))
+                        (fromMaybe "" secnum)
   toChunk chunknum (Div ("",["preamble"],[]) bs) =
     Chunk
-      { chunkHeading = []
-      , chunkId = ""
+      { chunkHeading = docTitle meta
+      , chunkId = inlineListToIdentifier mempty $ docTitle meta
+      , chunkLevel = 0
       , chunkNumber = chunknum
+      , chunkSectionNumber = Nothing
       , chunkPath = resolvePathTemplate pathTemplate chunknum
-                        "" "" ""
+                        (stringify (docTitle meta))
+                        (inlineListToIdentifier mempty (docTitle meta))
+                        "0"
       , chunkUp = Nothing
       , chunkPrev = Nothing
       , chunkNext = Nothing
+      , chunkUnlisted = False
       , chunkContents = bs
       }
   toChunk _ b = error $ "toChunk called on inappropriate block " <> show b
   -- should not happen
 
--- | Add nav-up, nav-prev, nav-next attributes to each section Div
--- in a document.
-addNavigation :: Maybe Text -> Maybe Text -> [Block] -> [Block]
-addNavigation mbUpId mbPrevId (Div (ident, "section":classes, kvs) bs : xs) =
-  Div (ident, "section":classes, kvs ++ navattrs) bs' :
-    addNavigation mbUpId (Just ident) xs
+
+-- Remove some attributes we added just to construct chunkNext etc.
+rmNavAttrs :: Block -> Block
+rmNavAttrs (Div (ident,classes,kvs) bs) =
+  Div (ident,classes,filter (not . isNavAttr) kvs) bs
  where
-  bs' = addNavigation (Just ident) Nothing bs
-  navattrs = maybe [] (\x -> [("nav-up", x)]) mbUpId
-          ++ maybe [] (\x -> [("nav-prev", x)]) mbPrevId
-          ++ maybe [] (\x -> [("nav-next", x)]) mbNextId
-  mbNextId = find isSectionDiv bs >>= extractId
-  isSectionDiv (Div (_,"section":_,_) _) = True
-  isSectionDiv _ = False
-  extractId (Div (id',_,_) _) = Just id'
-  extractId _ = Nothing
-addNavigation mbUpId mbPrevId (x:xs) = x : addNavigation mbUpId mbPrevId xs
-addNavigation _ _ [] = []
+  isNavAttr (k,_) = "nav-" `T.isPrefixOf` k
+rmNavAttrs b = b
 
 resolvePathTemplate :: PathTemplate
                     -> Int -- ^ Chunk number
@@ -272,14 +259,17 @@ data Chunk =
   Chunk
   { chunkHeading :: [Inline]
   , chunkId :: Text
+  , chunkLevel :: Int
   , chunkNumber :: Int
+  , chunkSectionNumber :: Maybe Text
   , chunkPath :: FilePath
-  , chunkUp :: Maybe Text
-  , chunkPrev :: Maybe Text
-  , chunkNext :: Maybe Text
+  , chunkUp :: Maybe Chunk
+  , chunkPrev :: Maybe Chunk
+  , chunkNext :: Maybe Chunk
+  , chunkUnlisted :: Bool
   , chunkContents :: [Block]
   }
-  deriving (Show, Read, Eq, Ord, Generic)
+  deriving (Show, Eq, Ord, Generic)
 
 instance Walkable Inline Chunk where
   query f chunk = query f (chunkContents chunk)
@@ -301,7 +291,7 @@ data ChunkedDoc =
   { chunkedMeta :: Meta
   , chunkedTOC :: Tree SecInfo
   , chunkedChunks :: [Chunk]
-  }
+  } deriving (Show, Eq, Ord, Generic)
 
 instance Walkable Inline ChunkedDoc where
   query f doc = query f (chunkedChunks doc) <> query f (chunkedMeta doc)
@@ -333,7 +323,7 @@ data SecInfo =
   , secId :: Text
   , secPath :: Text
   , secLevel :: Int
-  } deriving (Show, Ord, Eq)
+  } deriving (Show, Ord, Eq, Generic)
 
 instance Walkable Inline SecInfo where
   query f sec = query f (secTitle sec)
@@ -346,12 +336,12 @@ instance Walkable Inline SecInfo where
 -- in a form that can be turned into a table of contents.
 -- Presupposes that the '[Block]' is the output of 'makeSections'.
 toTOCTree :: [Block] -> Tree SecInfo
-toTOCTree bs =
+toTOCTree =
   Node SecInfo{ secTitle = []
               , secNumber = Nothing
               , secId = ""
               , secPath = ""
-              , secLevel = 0 } $ foldr go [] bs
+              , secLevel = 0 } . foldr go []
  where
   go :: Block -> [Tree SecInfo] -> [Tree SecInfo]
   go (Div (ident,_,_) (Header lev (_,classes,kvs) ils : subsecs))
@@ -364,3 +354,22 @@ toTOCTree bs =
   go (Div _ [d@Div{}]) = go d -- #8402
   go _ = id
 
+toTOCTree' :: [Chunk] -> Tree SecInfo
+toTOCTree' =
+  Node SecInfo{ secTitle = []
+              , secNumber = Nothing
+              , secId = ""
+              , secPath = ""
+              , secLevel = 0 } . getNodes . filter (not . skippable)
+ where
+  skippable c = isNothing (chunkSectionNumber c) && chunkUnlisted c
+  getNodes :: [Chunk] -> [Tree SecInfo]
+  getNodes (c:cs) =
+    let (as, bs) = span (\d -> chunkLevel d > chunkLevel c) cs
+        secinfo = SecInfo{ secTitle = chunkHeading c,
+                           secNumber = chunkSectionNumber c,
+                           secId = chunkId c,
+                           secPath = T.pack $ chunkPath c,
+                           secLevel = chunkLevel c }
+     in Node secinfo (getNodes as) : getNodes bs
+  getNodes [] = []
diff --git a/src/Text/Pandoc/Writers.hs b/src/Text/Pandoc/Writers.hs
index cf43cfad7..7f7a03603 100644
--- a/src/Text/Pandoc/Writers.hs
+++ b/src/Text/Pandoc/Writers.hs
@@ -24,6 +24,7 @@ module Text.Pandoc.Writers
     , writeBeamer
     , writeBibTeX
     , writeBibLaTeX
+    , writeChunkedHTML
     , writeCommonMark
     , writeConTeXt
     , writeCslJson
@@ -88,6 +89,7 @@ import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Error
 import Text.Pandoc.Writers.AsciiDoc
 import Text.Pandoc.Writers.BibTeX
+import Text.Pandoc.Writers.ChunkedHTML
 import Text.Pandoc.Writers.CommonMark
 import Text.Pandoc.Writers.ConTeXt
 import Text.Pandoc.Writers.CslJson
@@ -189,6 +191,7 @@ writers = [
   ,("bibtex"       , TextWriter writeBibTeX)
   ,("biblatex"     , TextWriter writeBibLaTeX)
   ,("markua"       , TextWriter writeMarkua)
+  ,("chunkedhtml"  , ByteStringWriter writeChunkedHTML)
   ]
 
 -- | Retrieve writer, extensions based on formatSpec (format+extensions).
diff --git a/src/Text/Pandoc/Writers/ChunkedHTML.hs b/src/Text/Pandoc/Writers/ChunkedHTML.hs
new file mode 100644
index 000000000..f58733588
--- /dev/null
+++ b/src/Text/Pandoc/Writers/ChunkedHTML.hs
@@ -0,0 +1,181 @@
+{-# LANGUAGE LambdaCase          #-}
+{-# LANGUAGE BangPatterns        #-}
+{-# LANGUAGE MultiWayIf          #-}
+{-# LANGUAGE OverloadedStrings   #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeApplications    #-}
+{-# LANGUAGE ViewPatterns        #-}
+{- |
+   Module      : Text.Pandoc.Writers.ChunkedHTML
+   Copyright   : Copyright (C) 2023 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of 'Pandoc' documents to "chunked" HTML (a folder of
+linked HTML documents, split by sections.
+-}
+module Text.Pandoc.Writers.ChunkedHTML (
+  writeChunkedHTML
+  ) where
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (WriterOptions(..))
+import Text.Pandoc.Shared (stringify, tshow)
+import Text.Pandoc.Class (PandocMonad, getPOSIXTime, runPure,
+                          readFileLazy, insertMedia, getMediaBag)
+import Text.Pandoc.MediaBag (mediaItems)
+import qualified Data.ByteString.Lazy as BL
+import Text.Pandoc.Chunks (splitIntoChunks, Chunk(..), ChunkedDoc(..),
+                           SecInfo(..))
+import Data.Text (Text)
+import Data.Tree
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import Text.Pandoc.Writers.HTML (writeHtml5String)
+import Codec.Archive.Zip (Entry, addEntryToArchive, emptyArchive, toEntry,
+                          fromArchive)
+import qualified Data.Map as M
+import Text.DocTemplates (Context(..), Val(..))
+import Text.DocLayout (literal)
+import Text.Pandoc.Writers.Shared (defField)
+import Data.Aeson (toJSON, encode)
+import System.FilePath (isRelative, normalise)
+import Data.List (isInfixOf)
+import Text.Pandoc.Walk (walkM)
+
+-- | Splits document into HTML chunks, dividing them by section,
+-- and returns a zip archive of a folder of files.
+writeChunkedHTML :: PandocMonad m
+                 => WriterOptions -> Pandoc -> m BL.ByteString
+writeChunkedHTML opts (Pandoc meta blocks) = do
+  walkM addMedia (Pandoc meta blocks)
+  epochtime <- floor <$> getPOSIXTime
+  let toMediaEntry (fp, _mt, bs) = toEntry fp epochtime bs
+  mediaEntries <- map toMediaEntry . mediaItems <$> getMediaBag
+  let chunkedDoc = splitIntoChunks "%s-%i.html"
+                     True
+                     (Just 1)
+                     (writerEpubChapterLevel opts)
+                     (Pandoc meta blocks)
+  let topChunk =
+        Chunk
+          { chunkHeading = docTitle meta
+          , chunkId = "top"
+          , chunkLevel = 0
+          , chunkNumber = 0
+          , chunkSectionNumber = Nothing
+          , chunkPath = "index.html"
+          , chunkUp = Nothing
+          , chunkPrev = Nothing
+          , chunkNext = case chunkedChunks chunkedDoc of
+                          [] -> Nothing
+                          (x:_) -> Just x
+          , chunkUnlisted = True
+          , chunkContents = mempty
+          }
+
+  let chunks = map (\x -> case chunkUp x of
+                             Nothing -> x{ chunkUp = Just topChunk }
+                             _ -> x)
+               $ case chunkedChunks chunkedDoc of
+                   [] -> []
+                   (x:xs) -> x{ chunkPrev = Just topChunk } : xs
+
+  let Node secinfo secs = chunkedTOC chunkedDoc
+  let tocTree = Node secinfo{ secTitle = docTitle meta,
+                              secPath = "index.html" } secs
+  let tocBlocks = buildTOC opts tocTree
+  renderedTOC <- writeHtml5String opts{ writerTemplate = Nothing }
+                    (Pandoc nullMeta tocBlocks)
+  let opts' = opts{ writerVariables =
+                        defField "table-of-contents" renderedTOC
+                      $ writerVariables opts }
+  entries <- mapM (chunkToEntry opts' meta topChunk) (topChunk : chunks)
+  let sitemap = toEntry "sitemap.json" epochtime
+                  (encode $ toJSON $ tocTreeToContext tocTree)
+  let archive = foldr addEntryToArchive emptyArchive
+                 (sitemap : entries ++ mediaEntries)
+  return $ fromArchive archive
+
+
+addMedia :: PandocMonad m => Inline -> m Inline
+addMedia il@(Image _ _ (src,_))
+  | fp <- normalise (T.unpack src)
+  , isRelative fp
+  , not (".." `isInfixOf` fp) = do
+  bs <- readFileLazy fp
+  insertMedia fp Nothing bs
+  return il
+addMedia il = return il
+
+buildTOC :: WriterOptions -> Tree SecInfo -> [Block]
+buildTOC opts tocTree = buildTOCPart tocTree
+ where
+  buildTOCPart (Node secinfo subsecs) =
+    Plain [Link nullAttr
+            ((maybe [] (\num ->
+               if writerNumberSections opts
+                  then [Span ("",["toc-section-number"],[])
+                                  [Str num, Space]]
+                  else []) (secNumber secinfo))
+             ++ secTitle secinfo)
+            (secPath secinfo, "") | secLevel secinfo > 0] :
+    if null subsecs
+       then []
+       else [BulletList (map buildTOCPart $ filter aboveThreshold subsecs)]
+  aboveThreshold (Node sec _) = secLevel sec <= writerTOCDepth opts
+
+chunkToEntry :: PandocMonad m
+             => WriterOptions -> Meta -> Chunk -> Chunk -> m Entry
+chunkToEntry opts meta topChunk chunk = do
+  html <- writeHtml5String opts' (Pandoc meta' blocks)
+  epochtime <- floor <$> getPOSIXTime
+  let htmlLBS = BL.fromStrict $ TE.encodeUtf8 html
+  return $ toEntry (chunkPath chunk) epochtime htmlLBS
+ where
+  opts' = opts{ writerVariables =
+                  addContextVars opts' topChunk chunk $ writerVariables opts }
+  meta' = if chunk == topChunk
+             then meta
+             else Meta $ M.fromList [("pagetitle", MetaString
+                                     (stringify $ chunkHeading chunk))]
+  blocks = chunkContents chunk
+
+tocTreeToContext :: Tree SecInfo -> Context Text
+tocTreeToContext (Node secinfo subs) =
+  Context $ M.fromList
+  [ ("section", MapVal $ secInfoToContext secinfo)
+  , ("subsections", ListVal $ map (MapVal . tocTreeToContext) subs)
+  ]
+
+secInfoToContext :: SecInfo -> Context Text
+secInfoToContext sec =
+  Context $ M.fromList
+  [ ("title", SimpleVal $ literal $ stringify $ secTitle sec)
+  , ("number", maybe NullVal (SimpleVal . literal) (secNumber sec))
+  , ("id", SimpleVal $ literal $ secId sec)
+  , ("path", SimpleVal $ literal $ secPath sec)
+  , ("level", SimpleVal $ literal $ tshow $ secLevel sec)
+  ]
+
+addContextVars
+  :: WriterOptions -> Chunk -> Chunk -> Context Text -> Context Text
+addContextVars opts topChunk chunk context =
+     maybe id (defField "next" . navlinks) (chunkNext chunk)
+   . maybe id (defField "previous" . navlinks) (chunkPrev chunk)
+   . maybe id (defField "up" . navlinks) (chunkUp chunk)
+   . maybe id (defField "top" . navlinks) (if chunk == topChunk
+                                              then Nothing
+                                              else Just topChunk)
+   . defField "toc" (chunk == topChunk && writerTableOfContents opts)
+    $ context
+ where
+  navlinks ch = toMapVal [("url", formatPath ch), ("title", formatHeading ch)]
+  toMapVal = MapVal . Context . M.fromList
+  formatPath = SimpleVal . literal . T.pack . chunkPath
+  formatHeading ch = SimpleVal . literal . either (const "") id . runPure $
+    writeHtml5String opts{ writerTemplate = Nothing }
+      (Pandoc nullMeta [Plain $ chunkHeading ch])
+
author	John MacFarlane <[email protected]>	2023-01-05 21:52:44 -0800
committer	John MacFarlane <[email protected]>	2023-01-10 21:48:38 -0800
commit	1fdc27a2c0f723074c87ba4674575bc30da974f2 (patch)
tree	8365affedb3445b474479de6ff6190be6ba2b54c
parent	b55cd83a609bcb01f8472a66ab0ddca985c881e4 (diff)