about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Repetitive <[email protected]> 2025-08-27 10:44:46 +0200
committer John MacFarlane <[email protected]> 2025-08-28 21:15:36 +0200
commit 729c12394cb69ecc2d8c8d6177d2cc5526e51a13 (patch)
tree ef037dc9086d875a025949bd06463a620fced01a /src
parent 1bf2564b5cd363c0b5c92b73a6912de98782a883 (diff)
fix: recognize binary signatures and fail early
Fail early when receiving binary input with recognized signature:
- zip[-based]: including OpenDocument and Microsoft formats
- PDF
- CFBF-based: old Microsoft formats including .doc and .xls
- DjVu
Diffstat (limited to 'src')
-rw-r--r-- src/Text/Pandoc/App/Input.hs 22
-rw-r--r-- src/Text/Pandoc/Error.hs 9
-rw-r--r-- src/Text/Pandoc/Format.hs 1
3 files changed, 28 insertions, 4 deletions
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs
index ecb423634..87e71da6b 100644
--- a/src/Text/Pandoc/App/Input.hs
+++ b/src/Text/Pandoc/App/Input.hs
@@ -14,7 +14,7 @@ module Text.Pandoc.App.Input
, readInput
) where
-import Control.Monad ((>=>))
+import Control.Monad ((>=>), when)
import Control.Monad.Except (throwError, catchError)
import Data.Text (Text)
import Network.URI (URI (..), parseURI)
@@ -109,15 +109,29 @@ inputToText convTabs (fp, (bs,mt)) =
(toTextM fp bs)
(\case
PandocUTF8DecodingError{} -> do
- -- TODO check for binary file signatures
- -- here and exit with an error instead
- -- of treating as latin1e..
+ when (hasKnownSignature bs) $
+ throwError $
+ PandocInputNotTextError (T.pack fp)
report $ NotUTF8Encoded
(if null fp
then "input"
else fp)
return $ T.pack $ B8.unpack bs
e -> throwError e)
+ where
+ -- "50 4B 03 04" is zip file signature
+ isZip bs' = "\x50\x4B\x03\x04" `BS.isPrefixOf` bs'
+ -- "25 50 44 46 2D" is PDF file signature
+ isPDF bs' = "\x25\x50\x44\x46\x2D" `BS.isPrefixOf` bs'
+ -- "D0 CF 11 E0 A1 B1 1A E1" is Compound File Binary Format signature used in
+ -- variety of old Microsoft formats (.doc and .xls among others)
+ isCFBF bs' = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" `BS.isPrefixOf` bs'
+ -- "41 54 26 54 46 4F 52 4D ?? ?? ?? ?? 44 4A 56" is DjVu signature
+ isDjVu bs' = case BS.stripPrefix "\x41\x54\x26\x54\x46\x4F\x52\x4D" bs' of
+ Nothing -> False
+ Just x -> BS.isPrefixOf "\x44\x4A\x56" $ BS.drop 4 x
+
+ hasKnownSignature bs' = any ($ bs') [isZip, isPDF, isCFBF, isDjVu]
inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
-> BL.ByteString
diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs
index f9c9aea8d..e782df735 100644
--- a/src/Text/Pandoc/Error.hs
+++ b/src/Text/Pandoc/Error.hs
@@ -65,6 +65,7 @@ data PandocError = PandocIOError Text IOError
| PandocUnsupportedExtensionError Text Text
| PandocCiteprocError CiteprocError
| PandocBibliographyError Text Text
+ | PandocInputNotTextError Text
deriving (Show, Typeable, Generic)
instance Exception PandocError
@@ -143,6 +144,13 @@ renderError e =
prettyCiteprocError e'
PandocBibliographyError fp msg ->
"Error reading bibliography file " <> fp <> ":\n" <> msg
+ PandocInputNotTextError fp ->
+ "Expected text as an input, but received binary data from " <>
+ (if T.null fp
+ then "stdin"
+ else "file " <> fp) <>
+ ".\nIf you intended to convert from binary format, verify that it's " <>
+ "supported and use\nexplicit -f FORMAT."
-- | Handle PandocError by exiting with an error message.
@@ -184,6 +192,7 @@ handleError (Left e) =
PandocUTF8DecodingError{} -> 92
PandocIpynbDecodingError{} -> 93
PandocUnsupportedCharsetError{} -> 94
+ PandocInputNotTextError{} -> 95
PandocCouldNotFindDataFileError{} -> 97
PandocCouldNotFindMetadataFileError{} -> 98
PandocResourceNotFound{} -> 99
diff --git a/src/Text/Pandoc/Format.hs b/src/Text/Pandoc/Format.hs
index 433592531..5b87a2531 100644
--- a/src/Text/Pandoc/Format.hs
+++ b/src/Text/Pandoc/Format.hs
@@ -181,6 +181,7 @@ formatFromFilePath x =
".ctx" -> defFlavor "context"
".db" -> defFlavor "docbook"
".dj" -> defFlavor "djot"
+ ".djvu" -> defFlavor "djvu" -- so we get an "unknown reader" error
".doc" -> defFlavor "doc" -- so we get an "unknown reader" error
".docx" -> defFlavor "docx"
".dokuwiki" -> defFlavor "dokuwiki"