From 31f74fac065023e1accc686753dc6216edd4c9c4 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Tue, 13 May 2025 23:21:28 +0200 Subject: Lua: support sandboxed parsing with `pandoc.read`. The function `pandoc.read` is now taking an optional fourth parameter that specifies the environment in which the parser will be run. Passing the string `sandbox` as the argument causes the reader to run in a sandbox, thereby preventing all access to the network and file system. Closes: #10831 --- doc/lua-filters.md | 18 ++++++- .../src/Text/Pandoc/Lua/Module/Pandoc.hs | 54 +++++++++++++++------ pandoc-lua-engine/test/lua/module/include.tex | 1 + pandoc-lua-engine/test/lua/module/pandoc.lua | 34 +++++++++++++ pandoc-lua-engine/test/lua/module/sample.epub | Bin 0 -> 5522 bytes 5 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 pandoc-lua-engine/test/lua/module/include.tex create mode 100644 pandoc-lua-engine/test/lua/module/sample.epub diff --git a/doc/lua-filters.md b/doc/lua-filters.md index 88f770241..6a6460431 100644 --- a/doc/lua-filters.md +++ b/doc/lua-filters.md @@ -3730,7 +3730,7 @@ Parameters: Returns: the transformed inline element -### `read (markup[, format[, reader_options]])` {#pandoc.read} +### `read (markup[, format[, reader_options[, read_env]]])` {#pandoc.read} Parse the given string into a Pandoc document. @@ -3740,7 +3740,8 @@ the mediabag. This means that if the document specifies files to be included, as is possible in formats like LaTeX, reStructuredText, and Org, then these will be included in the resulting document. Any media elements are added to those -retrieved from the other parsed input files. +retrieved from the other parsed input files. Use the `read_env` +parameter to modify this behavior. The `format` parameter defines the format flavor that will be parsed. This can be either a string, using `+` and `-` to enable @@ -3770,6 +3771,19 @@ Parameters: ReaderOptions object; defaults to the default values documented in the manual. ([ReaderOptions]|table) +`read_env` +: which environment the reader operates in: Possible values + are: + + - 'io' is the default and gives the behavior described above. + - 'global' uses the same environment that was used to read + the input files; the parser has full access to the + file-system and the mediabag. + - 'sandbox' works like 'global' and give the parser access to + the mediabag, but prohibits file-system access. + + Defaults to `'io'`. (string) + Returns: pandoc document ([Pandoc](#type-pandoc)) Usage: diff --git a/pandoc-lua-engine/src/Text/Pandoc/Lua/Module/Pandoc.hs b/pandoc-lua-engine/src/Text/Pandoc/Lua/Module/Pandoc.hs index 4ad93dcbf..522871a73 100644 --- a/pandoc-lua-engine/src/Text/Pandoc/Lua/Module/Pandoc.hs +++ b/pandoc-lua-engine/src/Text/Pandoc/Lua/Module/Pandoc.hs @@ -19,6 +19,7 @@ import Prelude hiding (read) import Control.Applicative ((<|>)) import Control.Monad (forM_, when) import Control.Monad.Catch (catch, handle, throwM) +import Control.Monad.Except (MonadError (throwError)) import Data.Data (Data, dataTypeConstrs, dataTypeOf, showConstr) import Data.Default (Default (..)) import Data.Maybe (fromMaybe) @@ -26,9 +27,10 @@ import Data.Proxy (Proxy (Proxy)) import Data.Text.Encoding.Error (UnicodeException) import HsLua import System.Exit (ExitCode (..)) +import Text.Pandoc.Class (PandocMonad, sandbox) import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (..)) -import Text.Pandoc.Format (parseFlavoredFormat) +import Text.Pandoc.Format (FlavoredFormat, parseFlavoredFormat) import Text.Pandoc.Lua.Orphans () import Text.Pandoc.Lua.Marshal.AST import Text.Pandoc.Lua.Marshal.Format (peekFlavoredFormat) @@ -234,22 +236,31 @@ functions = =?> "output string, or error triple" , defun "read" - ### (\content mformatspec mreaderOptions -> - handle (failLua . show @UnicodeException) . unPandocLua $ do - flvrd <- maybe (parseFlavoredFormat "markdown") pure mformatspec + ### (\content mformatspec mreaderOptions mreadEnv -> do let readerOpts = fromMaybe def mreaderOptions - getReader flvrd >>= \case - (TextReader r, es) -> - r readerOpts{readerExtensions = es} - (case content of + readEnv = fromMaybe "global" mreadEnv + + readAction :: PandocMonad m => FlavoredFormat -> m Pandoc + readAction flvrd = getReader flvrd >>= \case + (TextReader r, es) -> + r readerOpts{readerExtensions = es} $ + case content of Left bs -> toSources $ UTF8.toText bs - Right sources -> sources) - (ByteStringReader r, es) -> - case content of - Left bs -> r readerOpts{readerExtensions = es} - (BSL.fromStrict bs) - Right _ -> throwM $ PandocLuaError - "Cannot use bytestring reader with Sources") + Right sources -> sources + (ByteStringReader r, es) -> + case content of + Left bs -> r readerOpts{readerExtensions = es} + (BSL.fromStrict bs) + Right _ -> throwError $ PandocLuaError + "Cannot use bytestring reader with Sources" + + handle (failLua . show @UnicodeException) . unPandocLua $ do + flvrd <- maybe (parseFlavoredFormat "markdown") pure mformatspec + case readEnv of + "global" -> readAction flvrd + "sandbox" -> sandbox [] (readAction flvrd) + x -> throwError $ PandocLuaError + ("unknown read environment: " <> x)) <#> parameter (\idx -> (Left <$> peekByteString idx) <|> (Right <$> peekSources idx)) "string|Sources" "content" "text to parse" @@ -257,6 +268,19 @@ functions = "formatspec" "format and extensions") <#> opt (parameter peekReaderOptions "ReaderOptions" "reader_options" "reader options") + <#> opt (parameter peekText "string" "read_env" $ mconcat + [ "which environment the reader operates in: Possible values" + , "are:" + , "" + , "- 'io' is the default and gives the behavior described above." + , "- 'global' uses the same environment that was used to read" + , " the input files; the parser has full access to the" + , " file-system and the mediabag." + , "- 'sandbox' works like 'global' and give the parser access to" + , " the mediabag, but prohibits file-system access." + , "" + , "Defaults to `'io'`. (string)" + ]) =#> functionResult pushPandoc "Pandoc" "result document" , sha1 diff --git a/pandoc-lua-engine/test/lua/module/include.tex b/pandoc-lua-engine/test/lua/module/include.tex new file mode 100644 index 000000000..201a102cd --- /dev/null +++ b/pandoc-lua-engine/test/lua/module/include.tex @@ -0,0 +1 @@ +included diff --git a/pandoc-lua-engine/test/lua/module/pandoc.lua b/pandoc-lua-engine/test/lua/module/pandoc.lua index 5df547d24..3db58c0f8 100644 --- a/pandoc-lua-engine/test/lua/module/pandoc.lua +++ b/pandoc-lua-engine/test/lua/module/pandoc.lua @@ -293,6 +293,40 @@ return { 'Unknown input format nosuchreader' ) end), + group 'read_env' { + test('images are added to the mediabag', function () + local epub = io.open('lua/module/sample.epub', 'rb'):read('a') + local _ = pandoc.read(epub, 'epub') + assert.are_equal( + #pandoc.mediabag.list(), + 1 + ) + end), + test('images from EPUB are added when using the sandbox', function () + local epub = io.open('lua/module/sample.epub', 'rb'):read('a') + local _ = pandoc.read(epub, 'epub', nil, 'sandbox') + assert.are_equal( + #pandoc.mediabag.list(), + 1 + ) + end), + test('includes work in global env', function () + local tex = '\\include{lua/module/include.tex}' + local doc = pandoc.read(tex, 'latex', nil, 'global') + assert.are_equal( + doc.blocks, + pandoc.Blocks{pandoc.Para 'included'} + ) + end), + test('sandbox disallows access to the filesystem', function () + local tex = '\\include{lua/module/include.tex}' + local doc = pandoc.read(tex, 'latex', nil, 'sandbox') + assert.are_equal( + doc.blocks, + pandoc.Blocks{} + ) + end), + }, group 'extensions' { test('string spec', function () local doc = pandoc.read('"vice versa"', 'markdown-smart') diff --git a/pandoc-lua-engine/test/lua/module/sample.epub b/pandoc-lua-engine/test/lua/module/sample.epub new file mode 100644 index 000000000..fca4a1861 Binary files /dev/null and b/pandoc-lua-engine/test/lua/module/sample.epub differ -- cgit v1.2.3