App: move initial input-to-Pandoc code to internal submodule

author: Albert Krewinkel <[email protected]> 2022-09-30 23:36:02 +0200
committer: Albert Krewinkel <[email protected]> 2022-09-30 23:42:41 +0200
commit: 1148ead64cc90890953b7630229f0f5993dca0fc (patch)
tree: 07ce0eba19fbdf2a4a40ea311d48a2405153c87c /src
parent: cfc2024d37944bdefa805d5c18a2ab0c3d620bbe (diff)
2 files changed, 206 insertions, 151 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 493b58a1f..b6129422c 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -30,9 +30,7 @@ import qualified Control.Exception as E
 import Control.Monad ( (>=>), when, forM_ )
 import Control.Monad.Trans ( MonadIO(..) )
 import Control.Monad.Catch ( MonadMask )
-import Control.Monad.Except (throwError, catchError)
-import qualified Data.ByteString as BS
-import qualified Data.ByteString.Char8 as B8
+import Control.Monad.Except (throwError)
 import qualified Data.ByteString.Lazy as BL
 import Data.Maybe (fromMaybe, isJust, isNothing)
 import qualified Data.Set as Set
@@ -40,26 +38,22 @@ import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TE
-import qualified Data.Text.Encoding as TSE
 import qualified Data.Text.Encoding.Error as TE
-import qualified Data.Text.Encoding.Error as TSE
-import Network.URI (URI (..), parseURI)
 import System.Directory (doesDirectoryExist)
 import System.Exit (exitSuccess)
 import System.FilePath ( takeBaseName, takeExtension)
 import System.IO (nativeNewline, stdout)
 import qualified System.IO as IO (Newline (..))
 import Text.Pandoc
-import Text.Pandoc.Walk (walk)
 import Text.Pandoc.Builder (setMeta)
 import Text.Pandoc.MediaBag (mediaItems)
-import Text.Pandoc.MIME (getCharset, MimeType)
 import Text.Pandoc.Image (svgToPng)
 import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
 import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
                             IpynbOutput (..))
 import Text.Pandoc.App.CommandLineOptions (parseOptions, parseOptionsFromArgs,
                                            options)
+import Text.Pandoc.App.Input (InputParameters (..), readInput)
 import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings)
 import Text.Collate.Lang (Lang (..), parseLang)
 import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), Environment (..),
@@ -68,8 +62,7 @@ import Text.Pandoc.PDF (makePDF)
 import Text.Pandoc.Scripting (ScriptingEngine (..))
 import Text.Pandoc.SelfContained (makeSelfContained)
 import Text.Pandoc.Shared (eastAsianLineBreakFilter,
-         headerShift, isURI, tabFilter, uriPathToPath, filterIpynbOutput,
-         defaultUserDataDir, tshow, textToIdentifier)
+         headerShift, isURI, filterIpynbOutput, defaultUserDataDir, tshow)
 import Text.Pandoc.Sources (toSources)
 import Text.Pandoc.Writers.Shared (lookupMetaString)
 import Text.Pandoc.Readers.Markdown (yamlToMeta)
@@ -287,14 +280,6 @@ convertWithOpts' scriptingEngine istty datadir opts = do
                                           _        -> Format format) :))
                    $ []
 
-  let convertTabs = tabFilter (if optPreserveTabs opts ||
-                                    readerNameBase == "t2t" ||
-                                    readerNameBase == "man" ||
-                                    readerNameBase == "tsv"
-                                  then 0
-                                  else optTabStop opts)
-
-
   when (readerNameBase == "markdown_github" ||
         writerNameBase == "markdown_github") $
     report $ Deprecated "markdown_github" "Use gfm instead."
@@ -320,27 +305,18 @@ convertWithOpts' scriptingEngine istty datadir opts = do
 
   let filterEnv = Environment readerOpts writerOptions
 
-  inputs <- readSources sources
-
-
-  doc <- (case reader of
-           TextReader r
-             | readerNameBase == "json" ->
-                 mconcat <$>
-                    mapM (inputToText convertTabs
-                           >=> r readerOpts . (:[])) inputs
-             | optFileScope opts  ->
-                 mconcat <$> mapM
-                    (\source -> do
-                      (fp, txt) <- inputToText convertTabs source
-                      adjustLinksAndIds (readerExtensions readerOpts)
-                        (T.pack fp) (map (T.pack . fst) inputs)
-                        <$> r readerOpts [(fp, txt)])
-                    inputs
-             | otherwise -> mapM (inputToText convertTabs) inputs
-                              >>= r readerOpts
-           ByteStringReader r ->
-             mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs)
+  let inputParams = InputParameters
+        { inputReader = reader
+        , inputReaderName = readerNameBase
+        , inputReaderOptions = readerOpts
+        , inputSources = sources
+        , inputFileScope = optFileScope opts
+        , inputSpacesPerTab = if optPreserveTabs opts
+                              then Nothing
+                              else Just (optTabStop opts)
+        }
+
+  doc <- readInput inputParams
           >>= ( return . adjustMetadata (metadataFromFile <>)
             >=> return . adjustMetadata (<> optMetadata opts)
             >=> return . adjustMetadata (<> cslMetadata)
@@ -411,64 +387,6 @@ adjustMetadata f (Pandoc meta bs) = Pandoc (f meta) bs
 applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
 applyTransforms transforms d = return $ foldr ($) d transforms
 
-readSources :: PandocMonad m
-            => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))]
-readSources srcs =
-  mapM (\fp -> do t <- readSource fp
-                  return (if fp == "-" then "" else fp, t)) srcs
-
-readSource :: PandocMonad m
-           => FilePath -> m (BS.ByteString, Maybe MimeType)
-readSource "-" = (,Nothing) <$> readStdinStrict
-readSource src =
-  case parseURI src of
-    Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src)
-           | uriScheme u == "file:" ->
-               (,Nothing) <$>
-                 readFileStrict (uriPathToPath $ T.pack $ uriPath u)
-    _       -> (,Nothing) <$> readFileStrict src
-
-utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
-utf8ToText fp bs =
-  case TSE.decodeUtf8' . dropBOM $ bs of
-    Left (TSE.DecodeError _ (Just w)) ->
-      case BS.elemIndex w bs of
-        Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
-        Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
-    Left e -> throwError $ PandocAppError (tshow e)
-    Right t -> return t
- where
-   dropBOM bs' =
-     if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
-        then BS.drop 3 bs'
-        else bs'
-
-
-inputToText :: PandocMonad m
-            => (Text -> Text)
-            -> (FilePath, (BS.ByteString, Maybe MimeType))
-            -> m (FilePath, Text)
-inputToText convTabs (fp, (bs,mt)) =
-  (fp,) . convTabs . T.filter (/='\r') <$>
-  case mt >>= getCharset of
-    Just "UTF-8"      -> utf8ToText fp bs
-    Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
-    Just charset      -> throwError $ PandocUnsupportedCharsetError charset
-    Nothing           -> catchError
-                           (utf8ToText fp bs)
-                           (\case
-                              PandocUTF8DecodingError{} -> do
-                                report $ NotUTF8Encoded
-                                  (if null fp
-                                      then "input"
-                                      else fp)
-                                return $ T.pack $ B8.unpack bs
-                              e -> throwError e)
-
-inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
-                      -> BL.ByteString
-inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs
-
 writeFnBinary :: FilePath -> BL.ByteString -> IO ()
 writeFnBinary "-" = BL.putStr
 writeFnBinary f   = BL.writeFile (UTF8.encodePath f)
@@ -476,57 +394,3 @@ writeFnBinary f   = BL.writeFile (UTF8.encodePath f)
 writerFn :: IO.Newline -> FilePath -> Text -> IO ()
 writerFn eol "-" = UTF8.putStrWith eol
 writerFn eol f   = UTF8.writeFileWith eol f
-
-adjustLinksAndIds :: Extensions -> Text -> [Text] -> Pandoc -> Pandoc
-adjustLinksAndIds exts thisfile allfiles
-  | length allfiles > 1 = addDiv . walk fixInline . walk fixBlock
-  | otherwise           = id
- where
-  toIdent :: Text -> Text
-  toIdent = textToIdentifier exts . T.intercalate "__" .
-            T.split (\c -> c == '/' || c == '\\')
-
-  addDiv :: Pandoc -> Pandoc
-  addDiv (Pandoc m bs)
-    | T.null thisfile = Pandoc m bs
-    | otherwise = Pandoc m [Div (toIdent thisfile,[],[]) bs]
-
-  fixBlock :: Block -> Block
-  fixBlock (CodeBlock attr t) = CodeBlock (fixAttrs attr) t
-  fixBlock (Header lev attr ils) = Header lev (fixAttrs attr) ils
-  fixBlock (Table attr cap cols th tbs tf) =
-     Table (fixAttrs attr) cap cols th tbs tf
-  fixBlock (Div attr bs) = Div (fixAttrs attr) bs
-  fixBlock x = x
-
-  -- add thisfile as prefix of identifier
-  fixAttrs :: Attr -> Attr
-  fixAttrs (i,cs,kvs)
-    | T.null i = (i,cs,kvs)
-    | otherwise =
-        (T.intercalate "__"
-          (filter (not . T.null) [toIdent thisfile, i]),
-        cs, kvs)
-
-  -- if URL begins with file from allfiles, convert to
-  -- an internal link with the appropriate identifier
-  fixURL :: Text -> Text
-  fixURL u =
-    let (a,b) = T.break (== '#') u
-        filepart = if T.null a
-                      then toIdent thisfile
-                      else toIdent a
-        fragpart = T.dropWhile (== '#') b
-     in if T.null a || a `elem` allfiles
-           then "#" <> T.intercalate "__"
-                         (filter (not . T.null) [filepart, fragpart])
-           else u
-
-  fixInline :: Inline -> Inline
-  fixInline (Code attr t) = Code (fixAttrs attr) t
-  fixInline (Link attr ils (url,tit)) =
-    Link (fixAttrs attr) ils (fixURL url,tit)
-  fixInline (Image attr ils (url,tit)) =
-    Image (fixAttrs attr) ils (fixURL url,tit)
-  fixInline (Span attr ils) = Span (fixAttrs attr) ils
-  fixInline x = x
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs
new file mode 100644
index 000000000..68f5f7f15
--- /dev/null
+++ b/src/Text/Pandoc/App/Input.hs
@@ -0,0 +1,191 @@
+{-# LANGUAGE LambdaCase          #-}
+{-# LANGUAGE OverloadedStrings   #-}
+{-# LANGUAGE TupleSections       #-}
+{- |
+   Module      : Text.Pandoc.App.Input
+   Copyright   : © 2006-2022 John MacFarlane
+   License     : GPL-2.0-or-later
+   Maintainer  : John MacFarlane <jgm@berkeley@edu>
+
+Read from the file system into a pandoc document.
+-}
+module Text.Pandoc.App.Input
+  ( InputParameters (..)
+  , readInput
+  ) where
+
+import Control.Monad ((>=>))
+import Control.Monad.Except (throwError, catchError)
+import Data.Text (Text)
+import Network.URI (URI (..), parseURI)
+import Text.Pandoc.Class ( PandocMonad, openURL
+                         , readFileStrict, readStdinStrict, report)
+import Text.Pandoc.Definition (Pandoc (..), Attr, Block (..), Inline (..))
+import Text.Pandoc.Error (PandocError (..))
+import Text.Pandoc.Logging (LogMessage (..))
+import Text.Pandoc.MIME (getCharset, MimeType)
+import Text.Pandoc.Options (Extensions, ReaderOptions (..))
+import Text.Pandoc.Readers (Reader (..))
+import Text.Pandoc.Shared (tabFilter, textToIdentifier, tshow, uriPathToPath)
+import Text.Pandoc.Walk (walk)
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TSE
+import qualified Data.Text.Encoding.Error as TSE
+
+-- | Settings specifying how and which input should be processed.
+data InputParameters m = InputParameters
+  { inputReader         :: Reader m
+  , inputReaderName     :: Text
+  , inputReaderOptions  :: ReaderOptions
+  , inputSources        :: [FilePath]
+  , inputSpacesPerTab   :: Maybe Int
+  , inputFileScope      :: Bool
+  }
+
+-- | Read all input into a pandoc document.
+readInput :: PandocMonad m => InputParameters m -> m Pandoc
+readInput params = do
+  let sources = inputSources params
+  let readerName = inputReaderName params
+  let readerOpts = inputReaderOptions params
+  let convertTabs :: Text -> Text
+      convertTabs = tabFilter $ case inputSpacesPerTab params of
+        Nothing -> 0
+        Just ts -> if readerName `elem` ["t2t", "man", "tsv"]
+                   then 0
+                   else ts
+
+  inputs <- readSources sources
+
+  case inputReader params of
+    TextReader r
+      | readerName == "json" ->
+          mconcat <$> mapM (inputToText convertTabs >=> r readerOpts . (:[]))
+                           inputs
+      | inputFileScope params ->
+          mconcat <$> mapM
+              (\source -> do
+                  (fp, txt) <- inputToText convertTabs source
+                  adjustLinksAndIds (readerExtensions readerOpts)
+                    (T.pack fp) (map (T.pack . fst) inputs)
+                    <$> r readerOpts [(fp, txt)])
+              inputs
+      | otherwise -> mapM (inputToText convertTabs) inputs >>= r readerOpts
+    ByteStringReader r ->
+      mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs
+
+readSources :: PandocMonad m
+            => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))]
+readSources srcs =
+  mapM (\fp -> do t <- readSource fp
+                  return (if fp == "-" then "" else fp, t)) srcs
+
+-- | Read input from a resource, i.e., either a file, a URL, or stdin
+-- (@-@).
+readSource :: PandocMonad m
+           => FilePath -> m (BS.ByteString, Maybe MimeType)
+readSource "-" = (,Nothing) <$> readStdinStrict
+readSource src =
+  case parseURI src of
+    Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src)
+           | uriScheme u == "file:" ->
+               (,Nothing) <$>
+                 readFileStrict (uriPathToPath $ T.pack $ uriPath u)
+    _       -> (,Nothing) <$> readFileStrict src
+
+utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
+utf8ToText fp bs =
+  case TSE.decodeUtf8' . dropBOM $ bs of
+    Left (TSE.DecodeError _ (Just w)) ->
+      case BS.elemIndex w bs of
+        Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
+        Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
+    Left e -> throwError $ PandocAppError (tshow e)
+    Right t -> return t
+ where
+   dropBOM bs' =
+     if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
+        then BS.drop 3 bs'
+        else bs'
+
+inputToText :: PandocMonad m
+            => (Text -> Text)
+            -> (FilePath, (BS.ByteString, Maybe MimeType))
+            -> m (FilePath, Text)
+inputToText convTabs (fp, (bs,mt)) =
+  (fp,) . convTabs . T.filter (/='\r') <$>
+  case mt >>= getCharset of
+    Just "UTF-8"      -> utf8ToText fp bs
+    Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
+    Just charset      -> throwError $ PandocUnsupportedCharsetError charset
+    Nothing           -> catchError
+                           (utf8ToText fp bs)
+                           (\case
+                              PandocUTF8DecodingError{} -> do
+                                report $ NotUTF8Encoded
+                                  (if null fp
+                                      then "input"
+                                      else fp)
+                                return $ T.pack $ B8.unpack bs
+                              e -> throwError e)
+
+inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
+                      -> BL.ByteString
+inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs
+
+adjustLinksAndIds :: Extensions -> Text -> [Text] -> Pandoc -> Pandoc
+adjustLinksAndIds exts thisfile allfiles
+  | length allfiles > 1 = addDiv . walk fixInline . walk fixBlock
+  | otherwise           = id
+ where
+  toIdent :: Text -> Text
+  toIdent = textToIdentifier exts . T.intercalate "__" .
+            T.split (\c -> c == '/' || c == '\\')
+
+  addDiv :: Pandoc -> Pandoc
+  addDiv (Pandoc m bs)
+    | T.null thisfile = Pandoc m bs
+    | otherwise = Pandoc m [Div (toIdent thisfile,[],[]) bs]
+
+  fixBlock :: Block -> Block
+  fixBlock (CodeBlock attr t) = CodeBlock (fixAttrs attr) t
+  fixBlock (Header lev attr ils) = Header lev (fixAttrs attr) ils
+  fixBlock (Table attr cap cols th tbs tf) =
+     Table (fixAttrs attr) cap cols th tbs tf
+  fixBlock (Div attr bs) = Div (fixAttrs attr) bs
+  fixBlock x = x
+
+  -- add thisfile as prefix of identifier
+  fixAttrs :: Attr -> Attr
+  fixAttrs (i,cs,kvs)
+    | T.null i = (i,cs,kvs)
+    | otherwise =
+        (T.intercalate "__"
+          (filter (not . T.null) [toIdent thisfile, i]),
+        cs, kvs)
+
+  -- if URL begins with file from allfiles, convert to
+  -- an internal link with the appropriate identifier
+  fixURL :: Text -> Text
+  fixURL u =
+    let (a,b) = T.break (== '#') u
+        filepart = if T.null a
+                      then toIdent thisfile
+                      else toIdent a
+        fragpart = T.dropWhile (== '#') b
+     in if T.null a || a `elem` allfiles
+           then "#" <> T.intercalate "__"
+                         (filter (not . T.null) [filepart, fragpart])
+           else u
+
+  fixInline :: Inline -> Inline
+  fixInline (Code attr t) = Code (fixAttrs attr) t
+  fixInline (Link attr ils (url,tit)) =
+    Link (fixAttrs attr) ils (fixURL url,tit)
+  fixInline (Image attr ils (url,tit)) =
+    Image (fixAttrs attr) ils (fixURL url,tit)
+  fixInline (Span attr ils) = Span (fixAttrs attr) ils
+  fixInline x = x
author	Albert Krewinkel <[email protected]>	2022-09-30 23:36:02 +0200
committer	Albert Krewinkel <[email protected]>	2022-09-30 23:42:41 +0200
commit	1148ead64cc90890953b7630229f0f5993dca0fc (patch)
tree	07ce0eba19fbdf2a4a40ea311d48a2405153c87c /src
parent	cfc2024d37944bdefa805d5c18a2ab0c3d620bbe (diff)