diff options
| author | Albert Krewinkel <[email protected]> | 2022-09-30 23:36:02 +0200 |
|---|---|---|
| committer | Albert Krewinkel <[email protected]> | 2022-09-30 23:42:41 +0200 |
| commit | 1148ead64cc90890953b7630229f0f5993dca0fc (patch) | |
| tree | 07ce0eba19fbdf2a4a40ea311d48a2405153c87c /src | |
| parent | cfc2024d37944bdefa805d5c18a2ab0c3d620bbe (diff) | |
App: move initial input-to-Pandoc code to internal submodule
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/App.hs | 166 | ||||
| -rw-r--r-- | src/Text/Pandoc/App/Input.hs | 191 |
2 files changed, 206 insertions, 151 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index 493b58a1f..b6129422c 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -30,9 +30,7 @@ import qualified Control.Exception as E import Control.Monad ( (>=>), when, forM_ ) import Control.Monad.Trans ( MonadIO(..) ) import Control.Monad.Catch ( MonadMask ) -import Control.Monad.Except (throwError, catchError) -import qualified Data.ByteString as BS -import qualified Data.ByteString.Char8 as B8 +import Control.Monad.Except (throwError) import qualified Data.ByteString.Lazy as BL import Data.Maybe (fromMaybe, isJust, isNothing) import qualified Data.Set as Set @@ -40,26 +38,22 @@ import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TE -import qualified Data.Text.Encoding as TSE import qualified Data.Text.Encoding.Error as TE -import qualified Data.Text.Encoding.Error as TSE -import Network.URI (URI (..), parseURI) import System.Directory (doesDirectoryExist) import System.Exit (exitSuccess) import System.FilePath ( takeBaseName, takeExtension) import System.IO (nativeNewline, stdout) import qualified System.IO as IO (Newline (..)) import Text.Pandoc -import Text.Pandoc.Walk (walk) import Text.Pandoc.Builder (setMeta) import Text.Pandoc.MediaBag (mediaItems) -import Text.Pandoc.MIME (getCharset, MimeType) import Text.Pandoc.Image (svgToPng) import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths) import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts, IpynbOutput (..)) import Text.Pandoc.App.CommandLineOptions (parseOptions, parseOptionsFromArgs, options) +import Text.Pandoc.App.Input (InputParameters (..), readInput) import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings) import Text.Collate.Lang (Lang (..), parseLang) import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), Environment (..), @@ -68,8 +62,7 @@ import Text.Pandoc.PDF (makePDF) import Text.Pandoc.Scripting (ScriptingEngine (..)) import Text.Pandoc.SelfContained (makeSelfContained) import Text.Pandoc.Shared (eastAsianLineBreakFilter, - headerShift, isURI, tabFilter, uriPathToPath, filterIpynbOutput, - defaultUserDataDir, tshow, textToIdentifier) + headerShift, isURI, filterIpynbOutput, defaultUserDataDir, tshow) import Text.Pandoc.Sources (toSources) import Text.Pandoc.Writers.Shared (lookupMetaString) import Text.Pandoc.Readers.Markdown (yamlToMeta) @@ -287,14 +280,6 @@ convertWithOpts' scriptingEngine istty datadir opts = do _ -> Format format) :)) $ [] - let convertTabs = tabFilter (if optPreserveTabs opts || - readerNameBase == "t2t" || - readerNameBase == "man" || - readerNameBase == "tsv" - then 0 - else optTabStop opts) - - when (readerNameBase == "markdown_github" || writerNameBase == "markdown_github") $ report $ Deprecated "markdown_github" "Use gfm instead." @@ -320,27 +305,18 @@ convertWithOpts' scriptingEngine istty datadir opts = do let filterEnv = Environment readerOpts writerOptions - inputs <- readSources sources - - - doc <- (case reader of - TextReader r - | readerNameBase == "json" -> - mconcat <$> - mapM (inputToText convertTabs - >=> r readerOpts . (:[])) inputs - | optFileScope opts -> - mconcat <$> mapM - (\source -> do - (fp, txt) <- inputToText convertTabs source - adjustLinksAndIds (readerExtensions readerOpts) - (T.pack fp) (map (T.pack . fst) inputs) - <$> r readerOpts [(fp, txt)]) - inputs - | otherwise -> mapM (inputToText convertTabs) inputs - >>= r readerOpts - ByteStringReader r -> - mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs) + let inputParams = InputParameters + { inputReader = reader + , inputReaderName = readerNameBase + , inputReaderOptions = readerOpts + , inputSources = sources + , inputFileScope = optFileScope opts + , inputSpacesPerTab = if optPreserveTabs opts + then Nothing + else Just (optTabStop opts) + } + + doc <- readInput inputParams >>= ( return . adjustMetadata (metadataFromFile <>) >=> return . adjustMetadata (<> optMetadata opts) >=> return . adjustMetadata (<> cslMetadata) @@ -411,64 +387,6 @@ adjustMetadata f (Pandoc meta bs) = Pandoc (f meta) bs applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc applyTransforms transforms d = return $ foldr ($) d transforms -readSources :: PandocMonad m - => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))] -readSources srcs = - mapM (\fp -> do t <- readSource fp - return (if fp == "-" then "" else fp, t)) srcs - -readSource :: PandocMonad m - => FilePath -> m (BS.ByteString, Maybe MimeType) -readSource "-" = (,Nothing) <$> readStdinStrict -readSource src = - case parseURI src of - Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src) - | uriScheme u == "file:" -> - (,Nothing) <$> - readFileStrict (uriPathToPath $ T.pack $ uriPath u) - _ -> (,Nothing) <$> readFileStrict src - -utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text -utf8ToText fp bs = - case TSE.decodeUtf8' . dropBOM $ bs of - Left (TSE.DecodeError _ (Just w)) -> - case BS.elemIndex w bs of - Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w - Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w - Left e -> throwError $ PandocAppError (tshow e) - Right t -> return t - where - dropBOM bs' = - if "\xEF\xBB\xBF" `BS.isPrefixOf` bs' - then BS.drop 3 bs' - else bs' - - -inputToText :: PandocMonad m - => (Text -> Text) - -> (FilePath, (BS.ByteString, Maybe MimeType)) - -> m (FilePath, Text) -inputToText convTabs (fp, (bs,mt)) = - (fp,) . convTabs . T.filter (/='\r') <$> - case mt >>= getCharset of - Just "UTF-8" -> utf8ToText fp bs - Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs - Just charset -> throwError $ PandocUnsupportedCharsetError charset - Nothing -> catchError - (utf8ToText fp bs) - (\case - PandocUTF8DecodingError{} -> do - report $ NotUTF8Encoded - (if null fp - then "input" - else fp) - return $ T.pack $ B8.unpack bs - e -> throwError e) - -inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType)) - -> BL.ByteString -inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs - writeFnBinary :: FilePath -> BL.ByteString -> IO () writeFnBinary "-" = BL.putStr writeFnBinary f = BL.writeFile (UTF8.encodePath f) @@ -476,57 +394,3 @@ writeFnBinary f = BL.writeFile (UTF8.encodePath f) writerFn :: IO.Newline -> FilePath -> Text -> IO () writerFn eol "-" = UTF8.putStrWith eol writerFn eol f = UTF8.writeFileWith eol f - -adjustLinksAndIds :: Extensions -> Text -> [Text] -> Pandoc -> Pandoc -adjustLinksAndIds exts thisfile allfiles - | length allfiles > 1 = addDiv . walk fixInline . walk fixBlock - | otherwise = id - where - toIdent :: Text -> Text - toIdent = textToIdentifier exts . T.intercalate "__" . - T.split (\c -> c == '/' || c == '\\') - - addDiv :: Pandoc -> Pandoc - addDiv (Pandoc m bs) - | T.null thisfile = Pandoc m bs - | otherwise = Pandoc m [Div (toIdent thisfile,[],[]) bs] - - fixBlock :: Block -> Block - fixBlock (CodeBlock attr t) = CodeBlock (fixAttrs attr) t - fixBlock (Header lev attr ils) = Header lev (fixAttrs attr) ils - fixBlock (Table attr cap cols th tbs tf) = - Table (fixAttrs attr) cap cols th tbs tf - fixBlock (Div attr bs) = Div (fixAttrs attr) bs - fixBlock x = x - - -- add thisfile as prefix of identifier - fixAttrs :: Attr -> Attr - fixAttrs (i,cs,kvs) - | T.null i = (i,cs,kvs) - | otherwise = - (T.intercalate "__" - (filter (not . T.null) [toIdent thisfile, i]), - cs, kvs) - - -- if URL begins with file from allfiles, convert to - -- an internal link with the appropriate identifier - fixURL :: Text -> Text - fixURL u = - let (a,b) = T.break (== '#') u - filepart = if T.null a - then toIdent thisfile - else toIdent a - fragpart = T.dropWhile (== '#') b - in if T.null a || a `elem` allfiles - then "#" <> T.intercalate "__" - (filter (not . T.null) [filepart, fragpart]) - else u - - fixInline :: Inline -> Inline - fixInline (Code attr t) = Code (fixAttrs attr) t - fixInline (Link attr ils (url,tit)) = - Link (fixAttrs attr) ils (fixURL url,tit) - fixInline (Image attr ils (url,tit)) = - Image (fixAttrs attr) ils (fixURL url,tit) - fixInline (Span attr ils) = Span (fixAttrs attr) ils - fixInline x = x diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs new file mode 100644 index 000000000..68f5f7f15 --- /dev/null +++ b/src/Text/Pandoc/App/Input.hs @@ -0,0 +1,191 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TupleSections #-} +{- | + Module : Text.Pandoc.App.Input + Copyright : © 2006-2022 John MacFarlane + License : GPL-2.0-or-later + Maintainer : John MacFarlane <jgm@berkeley@edu> + +Read from the file system into a pandoc document. +-} +module Text.Pandoc.App.Input + ( InputParameters (..) + , readInput + ) where + +import Control.Monad ((>=>)) +import Control.Monad.Except (throwError, catchError) +import Data.Text (Text) +import Network.URI (URI (..), parseURI) +import Text.Pandoc.Class ( PandocMonad, openURL + , readFileStrict, readStdinStrict, report) +import Text.Pandoc.Definition (Pandoc (..), Attr, Block (..), Inline (..)) +import Text.Pandoc.Error (PandocError (..)) +import Text.Pandoc.Logging (LogMessage (..)) +import Text.Pandoc.MIME (getCharset, MimeType) +import Text.Pandoc.Options (Extensions, ReaderOptions (..)) +import Text.Pandoc.Readers (Reader (..)) +import Text.Pandoc.Shared (tabFilter, textToIdentifier, tshow, uriPathToPath) +import Text.Pandoc.Walk (walk) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as B8 +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Encoding as TSE +import qualified Data.Text.Encoding.Error as TSE + +-- | Settings specifying how and which input should be processed. +data InputParameters m = InputParameters + { inputReader :: Reader m + , inputReaderName :: Text + , inputReaderOptions :: ReaderOptions + , inputSources :: [FilePath] + , inputSpacesPerTab :: Maybe Int + , inputFileScope :: Bool + } + +-- | Read all input into a pandoc document. +readInput :: PandocMonad m => InputParameters m -> m Pandoc +readInput params = do + let sources = inputSources params + let readerName = inputReaderName params + let readerOpts = inputReaderOptions params + let convertTabs :: Text -> Text + convertTabs = tabFilter $ case inputSpacesPerTab params of + Nothing -> 0 + Just ts -> if readerName `elem` ["t2t", "man", "tsv"] + then 0 + else ts + + inputs <- readSources sources + + case inputReader params of + TextReader r + | readerName == "json" -> + mconcat <$> mapM (inputToText convertTabs >=> r readerOpts . (:[])) + inputs + | inputFileScope params -> + mconcat <$> mapM + (\source -> do + (fp, txt) <- inputToText convertTabs source + adjustLinksAndIds (readerExtensions readerOpts) + (T.pack fp) (map (T.pack . fst) inputs) + <$> r readerOpts [(fp, txt)]) + inputs + | otherwise -> mapM (inputToText convertTabs) inputs >>= r readerOpts + ByteStringReader r -> + mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs + +readSources :: PandocMonad m + => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))] +readSources srcs = + mapM (\fp -> do t <- readSource fp + return (if fp == "-" then "" else fp, t)) srcs + +-- | Read input from a resource, i.e., either a file, a URL, or stdin +-- (@-@). +readSource :: PandocMonad m + => FilePath -> m (BS.ByteString, Maybe MimeType) +readSource "-" = (,Nothing) <$> readStdinStrict +readSource src = + case parseURI src of + Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src) + | uriScheme u == "file:" -> + (,Nothing) <$> + readFileStrict (uriPathToPath $ T.pack $ uriPath u) + _ -> (,Nothing) <$> readFileStrict src + +utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text +utf8ToText fp bs = + case TSE.decodeUtf8' . dropBOM $ bs of + Left (TSE.DecodeError _ (Just w)) -> + case BS.elemIndex w bs of + Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w + Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w + Left e -> throwError $ PandocAppError (tshow e) + Right t -> return t + where + dropBOM bs' = + if "\xEF\xBB\xBF" `BS.isPrefixOf` bs' + then BS.drop 3 bs' + else bs' + +inputToText :: PandocMonad m + => (Text -> Text) + -> (FilePath, (BS.ByteString, Maybe MimeType)) + -> m (FilePath, Text) +inputToText convTabs (fp, (bs,mt)) = + (fp,) . convTabs . T.filter (/='\r') <$> + case mt >>= getCharset of + Just "UTF-8" -> utf8ToText fp bs + Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs + Just charset -> throwError $ PandocUnsupportedCharsetError charset + Nothing -> catchError + (utf8ToText fp bs) + (\case + PandocUTF8DecodingError{} -> do + report $ NotUTF8Encoded + (if null fp + then "input" + else fp) + return $ T.pack $ B8.unpack bs + e -> throwError e) + +inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType)) + -> BL.ByteString +inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs + +adjustLinksAndIds :: Extensions -> Text -> [Text] -> Pandoc -> Pandoc +adjustLinksAndIds exts thisfile allfiles + | length allfiles > 1 = addDiv . walk fixInline . walk fixBlock + | otherwise = id + where + toIdent :: Text -> Text + toIdent = textToIdentifier exts . T.intercalate "__" . + T.split (\c -> c == '/' || c == '\\') + + addDiv :: Pandoc -> Pandoc + addDiv (Pandoc m bs) + | T.null thisfile = Pandoc m bs + | otherwise = Pandoc m [Div (toIdent thisfile,[],[]) bs] + + fixBlock :: Block -> Block + fixBlock (CodeBlock attr t) = CodeBlock (fixAttrs attr) t + fixBlock (Header lev attr ils) = Header lev (fixAttrs attr) ils + fixBlock (Table attr cap cols th tbs tf) = + Table (fixAttrs attr) cap cols th tbs tf + fixBlock (Div attr bs) = Div (fixAttrs attr) bs + fixBlock x = x + + -- add thisfile as prefix of identifier + fixAttrs :: Attr -> Attr + fixAttrs (i,cs,kvs) + | T.null i = (i,cs,kvs) + | otherwise = + (T.intercalate "__" + (filter (not . T.null) [toIdent thisfile, i]), + cs, kvs) + + -- if URL begins with file from allfiles, convert to + -- an internal link with the appropriate identifier + fixURL :: Text -> Text + fixURL u = + let (a,b) = T.break (== '#') u + filepart = if T.null a + then toIdent thisfile + else toIdent a + fragpart = T.dropWhile (== '#') b + in if T.null a || a `elem` allfiles + then "#" <> T.intercalate "__" + (filter (not . T.null) [filepart, fragpart]) + else u + + fixInline :: Inline -> Inline + fixInline (Code attr t) = Code (fixAttrs attr) t + fixInline (Link attr ils (url,tit)) = + Link (fixAttrs attr) ils (fixURL url,tit) + fixInline (Image attr ils (url,tit)) = + Image (fixAttrs attr) ils (fixURL url,tit) + fixInline (Span attr ils) = Span (fixAttrs attr) ils + fixInline x = x |
