aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlbert Krewinkel <[email protected]>2022-10-10 09:33:20 +0200
committerAlbert Krewinkel <[email protected]>2022-10-10 11:27:42 +0200
commita5ffaaa4136f11378c0c7741309edb9fd53e17d8 (patch)
treeafa29d609290f50cf752754cfca4a23075dc2781
parentd6fb8fb20fbb822d58cdfbde34961094c902a708 (diff)
Lua: support custom bytestring readers.
-rw-r--r--doc/custom-readers.md17
-rw-r--r--pandoc-lua-engine/pandoc-lua-engine.cabal2
-rw-r--r--pandoc-lua-engine/src/Text/Pandoc/Lua/Reader.hs74
-rw-r--r--pandoc-lua-engine/test/Tests/Lua/Reader.hs34
-rw-r--r--pandoc-lua-engine/test/bytestring-reader.lua7
-rw-r--r--pandoc-lua-engine/test/test-pandoc-lua-engine.hs2
6 files changed, 111 insertions, 25 deletions
diff --git a/doc/custom-readers.md b/doc/custom-readers.md
index add2317a2..601d879dd 100644
--- a/doc/custom-readers.md
+++ b/doc/custom-readers.md
@@ -76,6 +76,23 @@ ensuring backwards compatibility.
[patterns]: http://lua-users.org/wiki/PatternsTutorial
[lpeg]: http://www.inf.puc-rio.br/~roberto/lpeg/
+# Bytestring readers
+
+Pandoc expects text input to be UTF-8 encoded. However, formats
+like docx, odt, epub, etc. are not text but binary formats. To
+read them, pandoc supports `ByteStringReader` functions. These
+functions work just like the `Reader` function that process text
+input, but instead of a list of sources, `ByteStringReader`
+functions are passed a bytestring, i.e., a string that contains
+the binary input.
+
+``` lua
+-- read input as epub
+function ByteStringReader (input)
+ return pandoc.read(input, 'epub')
+end
+```
+
# Example: plain text reader
This is a simple example using [lpeg] to parse the input
diff --git a/pandoc-lua-engine/pandoc-lua-engine.cabal b/pandoc-lua-engine/pandoc-lua-engine.cabal
index b8f3abbd1..a122b935c 100644
--- a/pandoc-lua-engine/pandoc-lua-engine.cabal
+++ b/pandoc-lua-engine/pandoc-lua-engine.cabal
@@ -21,6 +21,7 @@ description: This package provides a pandoc scripting engine based on
extra-source-files: README.md
, test/bytestring.bin
, test/bytestring.lua
+ , test/bytestring-reader.lua
, test/lua/*.lua
, test/lua/module/*.lua
, test/lua/module/partial.test
@@ -131,4 +132,5 @@ test-suite test-pandoc-lua-engine
, text >= 1.1.1 && < 2.1
other-modules: Tests.Lua
, Tests.Lua.Module
+ , Tests.Lua.Reader
, Tests.Lua.Writer
diff --git a/pandoc-lua-engine/src/Text/Pandoc/Lua/Reader.hs b/pandoc-lua-engine/src/Text/Pandoc/Lua/Reader.hs
index 44781c9fb..6aeda526f 100644
--- a/pandoc-lua-engine/src/Text/Pandoc/Lua/Reader.hs
+++ b/pandoc-lua-engine/src/Text/Pandoc/Lua/Reader.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TypeApplications #-}
{- |
Module : Text.Pandoc.Lua.Reader
Copyright : Copyright (C) 2021-2022 John MacFarlane
@@ -17,8 +18,9 @@ import Control.Monad ((<=<), when)
import Control.Monad.IO.Class (MonadIO)
import Data.Maybe (fromMaybe)
import HsLua as Lua hiding (Operation (Div))
-import HsLua.Core.Run (newGCManagedState, withGCManagedState)
+import HsLua.Core.Run (GCManagedState, newGCManagedState, withGCManagedState)
import Text.Pandoc.Class (PandocMonad, findFileWithDataFallback, report)
+import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Logging
import Text.Pandoc.Lua.Global (Global (..), setGlobals)
import Text.Pandoc.Lua.Init (runLuaWith)
@@ -42,14 +44,23 @@ readCustom luaFile = do
-- to handle this more gracefully):
when (stat /= Lua.OK)
Lua.throwErrorAsException
- pure (reader luaState)
+ getCustomReader luaState
where
- reader st = TextReader $ \opts srcs -> liftIO . withGCManagedState st $ do
+ readerField = "PANDOC Reader function"
+ inLua st = liftIO . withGCManagedState @PandocError st
+ byteStringReader :: MonadIO m => GCManagedState -> Reader m
+ byteStringReader st = ByteStringReader $ \ropts input -> inLua st $ do
+ getfield registryindex readerField
+ push input
+ push ropts
+ callTrace 2 1
+ forcePeek $ peekPandoc top
+ textReader st = TextReader $ \ropts srcs -> inLua st $ do
let input = toSources srcs
- getglobal "Reader"
+ getfield registryindex readerField
push input
- push opts
+ push ropts
pcallTrace 2 1 >>= \case
OK -> forcePeek $ peekPandoc top
ErrRun -> do
@@ -59,25 +70,38 @@ readCustom luaFile = do
Failure {} ->
-- not a string error object. Bail!
throwErrorAsException
- Success errmsg -> do
+ Success errmsg ->
if "string expected, got pandoc Sources" `T.isInfixOf` errmsg
- then do
- pop 1
- _ <- unPandocLua $ do
- report $ Deprecated "old Reader function signature" $
- T.unlines
- [ "Reader functions should accept a sources list; "
- , "functions expecting `string` input are deprecated. "
- , "Use `tostring` to convert the first argument to a "
- , "string."
- ]
- getglobal "Reader"
- push $ sourcesToText input -- push sources as string
- push opts
- callTrace 2 1
- forcePeek $ peekPandoc top
- else
- -- nothing we can do here
- throwErrorAsException
+ then do
+ pop 1
+ _ <- unPandocLua $ do
+ report $ Deprecated "old Reader function signature" $
+ T.unlines
+ [ "Reader functions should accept a sources list; "
+ , "functions expecting `string` input are deprecated. "
+ , "Use `tostring` to convert the first argument to a "
+ , "string."
+ ]
+ getglobal "Reader"
+ push $ sourcesToText input -- push sources as string
+ push ropts
+ callTrace 2 1
+ forcePeek $ peekPandoc top
+ else
+ -- nothing we can do here
+ throwErrorAsException
_ -> -- not a runtime error, we won't be able to recover from that
- throwErrorAsException
+ throwErrorAsException
+ getCustomReader st = do
+ getglobal "Reader" >>= \case
+ TypeNil -> do
+ pop 1
+ getglobal "ByteStringReader" >>= \case
+ TypeNil -> failLua $ "No reader function found: either 'Reader' or "
+ <> "'ByteStringReader' must be defined."
+ _ -> do
+ setfield registryindex readerField
+ pure (byteStringReader st)
+ _ -> do
+ setfield registryindex readerField
+ pure (textReader st)
diff --git a/pandoc-lua-engine/test/Tests/Lua/Reader.hs b/pandoc-lua-engine/test/Tests/Lua/Reader.hs
new file mode 100644
index 000000000..16474bd91
--- /dev/null
+++ b/pandoc-lua-engine/test/Tests/Lua/Reader.hs
@@ -0,0 +1,34 @@
+{-# LANGUAGE LambdaCase #-}
+{- |
+Module : Tests.Lua.Reader
+Copyright : © 2022 Albert Krewinkel
+License : GPL-2.0-or-later
+Maintainer : Albert Krewinkel <[email protected]>
+
+Tests for custom Lua readers.
+-}
+module Tests.Lua.Reader (tests) where
+
+import Data.Char (chr)
+import Data.Default (Default (def))
+import Text.Pandoc.Class (runIOorExplode)
+import Text.Pandoc.Lua (readCustom)
+import Text.Pandoc.Readers (Reader (ByteStringReader, TextReader))
+import Test.Tasty (TestTree)
+import Test.Tasty.HUnit ((@?=), testCase)
+
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+
+tests :: [TestTree]
+tests =
+ [ testCase "read binary to code block" $ do
+ input <- BL.readFile "bytestring.bin"
+ doc <- runIOorExplode $
+ readCustom "bytestring-reader.lua" >>= \case
+ ByteStringReader f -> f def input
+ TextReader {} -> error "Expected a bytestring reader"
+ let bytes = mconcat $ map (B.str . T.singleton . chr) [0..255]
+ doc @?= B.doc (B.plain bytes)
+ ]
diff --git a/pandoc-lua-engine/test/bytestring-reader.lua b/pandoc-lua-engine/test/bytestring-reader.lua
new file mode 100644
index 000000000..7908479ec
--- /dev/null
+++ b/pandoc-lua-engine/test/bytestring-reader.lua
@@ -0,0 +1,7 @@
+function ByteStringReader (input, opts)
+ local chars = pandoc.List{}
+ for i = 1, #input do
+ chars:insert(utf8.char(input:byte(i,i)))
+ end
+ return pandoc.Pandoc(pandoc.Plain(pandoc.Str(table.concat(chars))))
+end
diff --git a/pandoc-lua-engine/test/test-pandoc-lua-engine.hs b/pandoc-lua-engine/test/test-pandoc-lua-engine.hs
index 035d92812..21febddb5 100644
--- a/pandoc-lua-engine/test/test-pandoc-lua-engine.hs
+++ b/pandoc-lua-engine/test/test-pandoc-lua-engine.hs
@@ -2,6 +2,7 @@ module Main (main) where
import Test.Tasty (TestTree, defaultMain, testGroup)
import qualified Tests.Lua
import qualified Tests.Lua.Module
+import qualified Tests.Lua.Reader
import qualified Tests.Lua.Writer
import System.Directory (withCurrentDirectory)
@@ -13,4 +14,5 @@ tests = testGroup "pandoc Lua engine"
[ testGroup "Lua filters" Tests.Lua.tests
, testGroup "Lua modules" Tests.Lua.Module.tests
, testGroup "Custom writers" Tests.Lua.Writer.tests
+ , testGroup "Custom readers" Tests.Lua.Reader.tests
]