aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANUAL.txt2
-rw-r--r--pandoc.cabal7
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/Xlsx.hs40
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Cells.hs63
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Parse.hs296
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Sheets.hs112
-rw-r--r--test/Tests/Helpers.hs14
-rw-r--r--test/Tests/Readers/Docx.hs14
-rw-r--r--test/Tests/Readers/Pptx.hs15
-rw-r--r--test/Tests/Readers/Xlsx.hs48
-rw-r--r--test/test-pandoc.hs4
-rw-r--r--test/xlsx-reader/basic.native381
-rw-r--r--test/xlsx-reader/basic.xlsxbin0 -> 13604 bytes
14 files changed, 969 insertions, 30 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index d394b2f6d..c3b657f72 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -283,6 +283,7 @@ header when requesting a document from a URL:
- `twiki` ([TWiki markup])
- `typst` ([typst])
- `vimwiki` ([Vimwiki])
+ - `xlsx` ([Excel spreadsheet][XLSX])
- `xml` (XML version of native AST)
- the path of a custom Lua reader, see [Custom readers and writers] below
:::
@@ -519,6 +520,7 @@ header when requesting a document from a URL:
[DokuWiki markup]: https://www.dokuwiki.org/dokuwiki
[ZimWiki markup]: https://zim-wiki.org/manual/Help/Wiki_Syntax.html
[XWiki markup]: https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/
+[XLSX]: https://en.wikipedia.org/wiki/Microsoft_Excel#File_formats
[Vimdoc]: https://vimhelp.org/helphelp.txt.html#help-writing
[TWiki markup]: https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules
[TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax
diff --git a/pandoc.cabal b/pandoc.cabal
index f648dce11..8aa2ddd1d 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -440,6 +440,8 @@ extra-source-files:
test/odt/odt/*.odt
test/odt/markdown/*.md
test/odt/native/*.native
+ test/xlsx-reader/*.xlsx
+ test/xlsx-reader/*.native
test/pod-reader.pod
test/vimdoc/*.markdown
test/vimdoc/*.vimdoc
@@ -613,6 +615,7 @@ library
Text.Pandoc.Readers.Txt2Tags,
Text.Pandoc.Readers.Docx,
Text.Pandoc.Readers.Pptx,
+ Text.Pandoc.Readers.Xlsx,
Text.Pandoc.Readers.ODT,
Text.Pandoc.Readers.EPUB,
Text.Pandoc.Readers.Muse,
@@ -726,6 +729,9 @@ library
Text.Pandoc.Readers.Pptx.Shapes,
Text.Pandoc.Readers.Pptx.Slides,
Text.Pandoc.Readers.Pptx.SmartArt,
+ Text.Pandoc.Readers.Xlsx.Parse,
+ Text.Pandoc.Readers.Xlsx.Cells,
+ Text.Pandoc.Readers.Xlsx.Sheets,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.Table,
Text.Pandoc.Readers.HTML.TagCategories,
@@ -863,6 +869,7 @@ test-suite test-pandoc
Tests.Readers.RTF
Tests.Readers.Docx
Tests.Readers.Pptx
+ Tests.Readers.Xlsx
Tests.Readers.ODT
Tests.Readers.Txt2Tags
Tests.Readers.EPUB
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 5f7b891e2..beec98587 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -27,6 +27,7 @@ module Text.Pandoc.Readers
, readers
, readDocx
, readPptx
+ , readXlsx
, readODT
, readMarkdown
, readCommonMark
@@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.Pptx
+import Text.Pandoc.Readers.Xlsx
import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
@@ -160,6 +162,7 @@ readers = [("native" , TextReader readNative)
,("tikiwiki" , TextReader readTikiWiki)
,("docx" , ByteStringReader readDocx)
,("pptx" , ByteStringReader readPptx)
+ ,("xlsx" , ByteStringReader readXlsx)
,("odt" , ByteStringReader readODT)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs
new file mode 100644
index 000000000..514dfd99e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Xlsx (readXlsx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
+import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)
+
+-- | Read XLSX file into Pandoc AST
+readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readXlsx opts bytes =
+ case toArchiveOrFail bytes of
+ Right archive ->
+ case archiveToXlsx archive of
+ Right xlsx -> do
+ let (meta, blocks) = xlsxToOutput opts xlsx
+ return $ Pandoc meta blocks
+ Left err ->
+ throwError $ PandocParseError $ "Failed to parse XLSX: " <> err
+
+ Left err ->
+ throwError $ PandocParseError $
+ "Failed to unpack XLSX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
new file mode 100644
index 000000000..ca0ad24f1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Cells
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Cell types and parsing for XLSX.
+-}
+module Text.Pandoc.Readers.Xlsx.Cells
+ ( CellRef(..)
+ , XlsxCell(..)
+ , CellValue(..)
+ , parseCellRef
+ ) where
+
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Char (ord, isAlpha)
+import Text.Read (readMaybe)
+
+-- | Cell reference (A1 notation)
+data CellRef = CellRef
+ { cellRefCol :: Int -- 1-based (A=1, B=2, ..., AA=27)
+ , cellRefRow :: Int -- 1-based
+ } deriving (Show, Eq, Ord)
+
+-- | Cell value types
+data CellValue
+ = TextValue Text
+ | NumberValue Double
+ | EmptyValue
+ deriving (Show, Eq)
+
+-- | Parsed cell
+data XlsxCell = XlsxCell
+ { cellRef :: CellRef
+ , cellValue :: CellValue
+ , cellBold :: Bool
+ , cellItalic :: Bool
+ } deriving (Show)
+
+-- | Parse cell reference (A1 → CellRef)
+parseCellRef :: Text -> Either Text CellRef
+parseCellRef ref = do
+ let (colStr, rowStr) = T.span isAlpha ref
+
+ row <- case readMaybe (T.unpack rowStr) of
+ Just r | r > 0 -> Right r
+ _ -> Left $ "Invalid row: " <> rowStr
+
+ col <- parseColumn colStr
+
+ return $ CellRef col row
+
+-- | Parse column (A=1, Z=26, AA=27, etc.)
+parseColumn :: Text -> Either Text Int
+parseColumn colStr
+ | T.null colStr = Left "Empty column"
+ | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr
diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
new file mode 100644
index 000000000..1ee350dd1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
@@ -0,0 +1,296 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Parse
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Parsing of XLSX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Xlsx.Parse
+ ( Xlsx(..)
+ , XlsxWorkbook(..)
+ , XlsxSheet(..)
+ , SheetId(..)
+ , SharedStrings
+ , Styles(..)
+ , FontInfo(..)
+ , archiveToXlsx
+ ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import Data.List (find)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import qualified Data.Vector as V
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Xlsx.Cells
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Sheet identifier
+newtype SheetId = SheetId Int deriving (Show, Eq, Ord)
+
+-- | Shared strings table (Vector for O(1) lookup)
+type SharedStrings = V.Vector Text
+
+-- | Font information
+data FontInfo = FontInfo
+ { fontBold :: Bool
+ , fontItalic :: Bool
+ , fontUnderline :: Bool
+ } deriving (Show)
+
+-- | Style information
+data Styles = Styles
+ { styleFonts :: V.Vector FontInfo
+ } deriving (Show)
+
+-- | Complete XLSX document
+data Xlsx = Xlsx
+ { xlsxWorkbook :: XlsxWorkbook
+ , xlsxSheets :: [XlsxSheet]
+ , xlsxSharedStrings :: SharedStrings
+ , xlsxStyles :: Styles
+ } deriving (Show)
+
+-- | Workbook information
+data XlsxWorkbook = XlsxWorkbook
+ { workbookSheetNames :: [(SheetId, Text, Text)] -- (id, name, relId)
+ } deriving (Show)
+
+-- | Individual worksheet
+data XlsxSheet = XlsxSheet
+ { sheetId :: SheetId
+ , sheetName :: Text
+ , sheetCells :: M.Map CellRef XlsxCell
+ } deriving (Show)
+
+-- | Parse XLSX archive
+archiveToXlsx :: Archive -> Either Text Xlsx
+archiveToXlsx archive = do
+ -- Find and parse workbook.xml
+ workbookPath <- getWorkbookXmlPath archive
+ workbookElem <- loadXMLFromArchive archive workbookPath
+ workbook <- parseWorkbook workbookElem
+ `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath)
+
+ -- Load workbook relationships
+ workbookRels <- loadRelationships archive (relsPathFor workbookPath)
+
+ -- Parse shared strings (look for sharedStrings relationship)
+ sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of
+ Just (_, target) -> do
+ let path = "xl/" ++ T.unpack target
+ el <- loadXMLFromArchive archive path
+ parseSharedStrings el
+ Nothing -> Right V.empty
+
+ -- Parse styles
+ styles <- case findRelWithTarget workbookRels "styles" of
+ Just (_, target) -> do
+ let path = "xl/" ++ T.unpack target
+ el <- loadXMLFromArchive archive path
+ parseStyles el
+ Nothing -> Right $ Styles V.empty
+
+ -- Parse worksheets
+ sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo)
+ (workbookSheetNames workbook)
+
+ return $ Xlsx workbook sheets sharedStrings styles
+
+-- | Find workbook.xml via root relationships
+getWorkbookXmlPath :: Archive -> Either Text FilePath
+getWorkbookXmlPath archive = do
+ relsEntry <- maybeToEither "Missing _rels/.rels" $
+ findEntryByPath "_rels/.rels" archive
+ relsElem <- parseXMLFromEntry relsEntry
+
+ let relElems = onlyElems $ elContent relsElem
+ case find isOfficeDocRel relElems of
+ Nothing -> Left "No workbook.xml relationship found"
+ Just rel -> do
+ target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel
+ return $ T.unpack target
+ where
+ isOfficeDocRel el =
+ case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of
+ (Just relType, Just target) ->
+ "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target
+ _ -> False
+
+-- | Parse workbook.xml
+parseWorkbook :: Element -> Either Text XlsxWorkbook
+parseWorkbook wbElem = do
+ let ns = elemToNameSpaces wbElem
+
+ -- Find sheets element (match by local name only)
+ sheets <- maybeToEither "Missing <sheets>" $
+ find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem)
+
+ let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets)
+ sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems)
+
+ return $ XlsxWorkbook sheetRefs
+
+parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text)
+parseSheetRef ns (idx, sheetElem) = do
+ let name = fromMaybe ("Sheet" <> T.pack (show idx)) $
+ findAttr (unqual "name") sheetElem
+ relId <- maybeToEither "Missing r:id" $
+ findAttrByName ns "r" "id" sheetElem
+ return (SheetId idx, name, relId)
+
+-- | Parse shared strings
+parseSharedStrings :: Element -> Either Text SharedStrings
+parseSharedStrings sstElem = do
+ let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem)
+ strings = map extractString siElems
+ return $ V.fromList strings
+ where
+ extractString siElem =
+ case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of
+ Just tElem -> strContent tElem
+ Nothing -> getAllText siElem
+
+-- | Parse styles (fonts only for MVP)
+parseStyles :: Element -> Either Text Styles
+parseStyles stylesElem = do
+ -- Parse fonts (match by local name)
+ let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem)
+ fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem
+ fonts = V.fromList $ map (parseFont mempty) fontElems
+
+ return $ Styles fonts
+
+parseFont :: NameSpaces -> Element -> FontInfo
+parseFont _ns fontElem =
+ FontInfo
+ { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem)
+ , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem)
+ , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem)
+ }
+
+-- | Parse individual worksheet
+parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet
+parseSheet archive rels sharedStrings styles (sid, name, relId) = do
+ target <- maybeToEither ("Sheet relationship not found: " <> relId) $
+ lookup relId rels
+
+ let sheetPath = "xl/" ++ T.unpack target
+ sheetElem <- loadXMLFromArchive archive sheetPath
+
+ cells <- parseSheetCells sheetElem sharedStrings styles
+
+ return $ XlsxSheet sid name cells
+
+-- | Parse sheet cells
+parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell)
+parseSheetCells sheetElem sharedStrings styles = do
+ -- Find sheetData by local name
+ case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of
+ Nothing -> return M.empty
+ Just sheetData -> do
+ let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData)
+ cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems
+ cells = mapMaybe (parseCell sharedStrings styles) cellElems
+ return $ M.fromList [(cellRef c, c) | c <- cells]
+
+-- | Parse individual cell
+parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell
+parseCell sharedStrings styles cElem = do
+ -- Get cell reference
+ refText <- findAttr (unqual "r") cElem
+ cellRefParsed <- either (const Nothing) Just $ parseCellRef refText
+
+ -- Get cell type (default to number if missing)
+ let cellType = fromMaybe "" $ findAttr (unqual "t") cElem
+ styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack
+
+ -- Get value (match by local name)
+ let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem)
+ vText = maybe "" strContent vElem
+
+ -- Parse value based on type
+ let value = if cellType == "s"
+ then
+ -- Shared string
+ case readMaybe (T.unpack vText) of
+ Just idx | idx >= 0 && idx < V.length sharedStrings ->
+ TextValue (sharedStrings V.! idx)
+ _ -> EmptyValue
+ else if T.null vText
+ then EmptyValue
+ else
+ -- Number
+ case readMaybe (T.unpack vText) of
+ Just n -> NumberValue n
+ Nothing -> TextValue vText
+
+ -- Get formatting from style
+ let (bold, italic) = case styleIdx of
+ Just idx | idx >= 0 && idx < V.length (styleFonts styles) ->
+ let font = styleFonts styles V.! idx
+ in (fontBold font, fontItalic font)
+ _ -> (False, False)
+
+ return $ XlsxCell cellRefParsed value bold italic
+
+-- Helper functions
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+ entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+ findEntryByPath path archive
+ parseXMLFromEntry entry
+
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry entry =
+ let lazyText = TL.decodeUtf8 $ fromEntry entry
+ in parseXMLElement lazyText
+
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+ case findEntryByPath relsPath archive of
+ Nothing -> Right []
+ Just entry -> do
+ relsElem <- parseXMLFromEntry entry
+ let relElems = onlyElems $ elContent relsElem
+ return $ mapMaybe extractRel relElems
+ where
+ extractRel el = do
+ relId <- findAttr (unqual "Id") el
+ target <- findAttr (unqual "Target") el
+ return (relId, target)
+
+relsPathFor :: FilePath -> FilePath
+relsPathFor path =
+ let (dir, file) = splitFileName path
+ in dir ++ "/_rels/" ++ file ++ ".rels"
+
+findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text)
+findRelWithTarget rels targetName =
+ find (\(_, target) -> targetName `T.isInfixOf` target) rels
+
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
+
+getAllText :: Element -> Text
+getAllText el =
+ let textFromContent (Text cdata) = cdData cdata
+ textFromContent (Elem e) = getAllText e
+ textFromContent _ = ""
+ texts = map textFromContent (elContent el)
+ in T.unwords $ filter (not . T.null) texts
+
+addContext :: Either Text a -> Text -> Either Text a
+addContext (Right x) _ = Right x
+addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")")
diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
new file mode 100644
index 000000000..e75484790
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Sheets
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of XLSX sheets to Pandoc AST.
+-}
+module Text.Pandoc.Readers.Xlsx.Sheets
+ ( xlsxToOutput
+ ) where
+
+import qualified Data.Map.Strict as M
+import qualified Data.Text as T
+import Data.List (sort, dropWhileEnd)
+import Data.Char (isSpace)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse
+import Text.Pandoc.Readers.Xlsx.Cells
+import qualified Text.Pandoc.Builder as B
+
+-- | Convert XLSX to Pandoc output
+xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block])
+xlsxToOutput _opts xlsx =
+ let sheets = xlsxSheets xlsx
+ sheetBlocks = concatMap sheetToBlocks sheets
+ in (mempty, sheetBlocks)
+
+-- | Convert sheet to blocks (header + table)
+sheetToBlocks :: XlsxSheet -> [Block]
+sheetToBlocks sheet =
+ let SheetId n = sheetId sheet
+ name = sheetName sheet
+ sheetIdent = "sheet-" <> T.pack (show n)
+ header = Header 2 (sheetIdent, [], []) (B.toList (B.text name))
+
+ -- Convert cells to table
+ tableBlock = case cellsToTable sheet of
+ Just tbl -> [tbl]
+ Nothing -> [] -- Empty sheet
+ in header : tableBlock
+
+-- | Convert cells to Pandoc Table
+cellsToTable :: XlsxSheet -> Maybe Block
+cellsToTable sheet
+ | M.null (sheetCells sheet) = Nothing
+ | otherwise =
+ let cells = sheetCells sheet
+ -- Get bounds
+ refs = sort $ M.keys cells
+ minCol = minimum $ map cellRefCol refs
+ maxCol = maximum $ map cellRefCol refs
+ minRow = minimum $ map cellRefRow refs
+ maxRow = maximum $ map cellRefRow refs
+
+ -- Build dense grid
+ grid = [ [ M.lookup (CellRef col row) cells
+ | col <- [minCol..maxCol]
+ ]
+ | row <- [minRow..maxRow]
+ ]
+
+ -- First row is header (simple heuristic)
+ (headerRow, bodyRows) = case grid of
+ (h:bs) -> (h, bs)
+ [] -> ([], [])
+
+ -- Filter out trailing empty rows (rows with only whitespace)
+ filteredBodyRows = dropWhileEnd isEmptyRow bodyRows
+
+ makeCell mcell = case mcell of
+ Just cell ->
+ let inlines = cellToInlines cell
+ in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines]
+ Nothing ->
+ Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []]
+
+ numCols = length headerRow
+ colSpec = replicate numCols (AlignDefault, ColWidthDefault)
+ thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow]
+ tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows]
+ tfoot = TableFoot nullAttr []
+
+ in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot
+
+-- | Check if a row contains only whitespace or empty cells
+isEmptyRow :: [Maybe XlsxCell] -> Bool
+isEmptyRow = all isEmptyCell
+ where
+ isEmptyCell Nothing = True
+ isEmptyCell (Just cell) = case cellValue cell of
+ EmptyValue -> True
+ TextValue t -> T.all isSpace t
+ NumberValue _ -> False
+
+-- | Convert cell to Pandoc inlines
+cellToInlines :: XlsxCell -> [Inline]
+cellToInlines cell =
+ let base = case cellValue cell of
+ TextValue t -> B.toList $ B.text t
+ NumberValue n -> [Str $ T.pack $ show n]
+ EmptyValue -> []
+
+ applyBold inls = if cellBold cell then [Strong inls] else inls
+ applyItalic inls = if cellItalic cell then [Emph inls] else inls
+
+ in applyItalic $ applyBold base
diff --git a/test/Tests/Helpers.hs b/test/Tests/Helpers.hs
index 3e930b14a..081611ed9 100644
--- a/test/Tests/Helpers.hs
+++ b/test/Tests/Helpers.hs
@@ -16,6 +16,7 @@ module Tests.Helpers ( test
, TestResult(..)
, setupEnvironment
, showDiff
+ , nativeDiff
, testGolden
, (=?>)
, purely
@@ -132,6 +133,19 @@ vividize (Both s _) = " " ++ s
vividize (First s) = "- " ++ s
vividize (Second s) = "+ " ++ s
+nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
+nativeDiff normPath expectedNative actualNative
+ | expectedNative == actualNative = return Nothing
+ | otherwise = Just <$> do
+ expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
+ actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
+ let dash = replicate 72 '-'
+ let diff = getDiff (lines actual) (lines expected)
+ return $ '\n' : dash ++
+ "\n--- " ++ normPath ++
+ "\n+++ " ++ "test" ++ "\n" ++
+ showDiff (1,1) diff ++ dash
+
purely :: (b -> PandocPure a) -> b -> a
purely f = either (error . show) id . runPure . f
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index 0bd70d0e2..76af649b4 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -34,20 +34,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "docx" }
testCompare :: String -> FilePath -> FilePath -> TestTree
testCompare = testCompareWithOpts defopts
-
-nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
-nativeDiff normPath expectedNative actualNative
- | expectedNative == actualNative = return Nothing
- | otherwise = Just <$> do
- expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
- actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
- let dash = replicate 72 '-'
- let diff = getDiff (lines actual) (lines expected)
- return $ '\n' : dash ++
- "\n--- " ++ normPath ++
- "\n+++ " ++ "test" ++ "\n" ++
- showDiff (1,1) diff ++ dash
-
testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
testCompareWithOpts opts testName docxFP nativeFP =
goldenTest
diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs
index 613d5b50f..3358e4111 100644
--- a/test/Tests/Readers/Pptx.hs
+++ b/test/Tests/Readers/Pptx.hs
@@ -12,10 +12,8 @@ Tests for the PPTX reader.
-}
module Tests.Readers.Pptx (tests) where
-import Data.Algorithm.Diff (getDiff)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
-import qualified Data.Text as T
import Test.Tasty
import Test.Tasty.Golden.Advanced
import Tests.Helpers
@@ -28,19 +26,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "pptx" }
testCompare :: String -> FilePath -> FilePath -> TestTree
testCompare = testCompareWithOpts defopts
-nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
-nativeDiff normPath expectedNative actualNative
- | expectedNative == actualNative = return Nothing
- | otherwise = Just <$> do
- expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
- actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
- let dash = replicate 72 '-'
- let diff = getDiff (lines actual) (lines expected)
- return $ '\n' : dash ++
- "\n--- " ++ normPath ++
- "\n+++ " ++ "test" ++ "\n" ++
- showDiff (1,1) diff ++ dash
-
testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
testCompareWithOpts opts testName pptxFP nativeFP =
goldenTest
diff --git a/test/Tests/Readers/Xlsx.hs b/test/Tests/Readers/Xlsx.hs
new file mode 100644
index 000000000..189cd0c16
--- /dev/null
+++ b/test/Tests/Readers/Xlsx.hs
@@ -0,0 +1,48 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Tests.Readers.Xlsx
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Tests for the XLSX reader.
+-}
+module Tests.Readers.Xlsx (tests) where
+
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Lazy as B
+import Test.Tasty
+import Test.Tasty.Golden.Advanced
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.UTF8 as UTF8
+
+defopts :: ReaderOptions
+defopts = def{ readerExtensions = getDefaultExtensions "xlsx" }
+
+testCompare :: String -> FilePath -> FilePath -> TestTree
+testCompare = testCompareWithOpts defopts
+
+testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
+testCompareWithOpts opts testName xlsxFP nativeFP =
+ goldenTest
+ testName
+ (do nf <- UTF8.toText <$> BS.readFile nativeFP
+ runIOorExplode (readNative def nf))
+ (do df <- B.readFile xlsxFP
+ runIOorExplode (readXlsx opts df))
+ (nativeDiff nativeFP)
+ (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a)
+ >>= BS.writeFile nativeFP . UTF8.fromText)
+
+tests :: [TestTree]
+tests = [ testGroup "basic"
+ [ testCompare
+ "sheet extraction"
+ "xlsx-reader/basic.xlsx"
+ "xlsx-reader/basic.native"
+ ]
+ ]
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index 0d04b361f..9ae97d9c0 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -13,6 +13,7 @@ import qualified Tests.Old
import qualified Tests.Readers.Creole
import qualified Tests.Readers.Docx
import qualified Tests.Readers.Pptx
+import qualified Tests.Readers.Xlsx
import qualified Tests.Readers.DokuWiki
import qualified Tests.Readers.EPUB
import qualified Tests.Readers.FB2
@@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests"
, testGroup "RTF" Tests.Readers.RTF.tests
, testGroup "Docx" Tests.Readers.Docx.tests
, testGroup "Pptx" Tests.Readers.Pptx.tests
+ , testGroup "Xlsx" Tests.Readers.Xlsx.tests
, testGroup "ODT" Tests.Readers.ODT.tests
, testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests
, testGroup "EPUB" Tests.Readers.EPUB.tests
@@ -126,4 +128,4 @@ main = do
_ -> inDirectory "test" $ do
fp <- getExecutablePath
-- putStrLn $ "Using pandoc executable at " ++ fp
- defaultMain $ tests fp \ No newline at end of file
+ defaultMain $ tests fp
diff --git a/test/xlsx-reader/basic.native b/test/xlsx-reader/basic.native
new file mode 100644
index 000000000..f69f78a41
--- /dev/null
+++ b/test/xlsx-reader/basic.native
@@ -0,0 +1,381 @@
+Pandoc
+ Meta { unMeta = fromList [] }
+ [ Header 2 ( "sheet-1" , [] , [] ) [ Str "Main" ]
+ , Table
+ ( "" , [] , [] )
+ (Caption Nothing [])
+ [ ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ ]
+ (TableHead
+ ( "" , [] , [] )
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Strong [ Str "Person" ] ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Strong [ Str "Age" ] ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Strong [ Str "Location" ] ] ]
+ ]
+ ])
+ [ TableBody
+ ( "" , [] , [] )
+ (RowHeadColumns 0)
+ []
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Anton" , Space , Str "Antich" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "23.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Switzerland" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "James" , Space , Str "Bond" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "35.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Moscow" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain
+ [ Str "Just"
+ , Space
+ , Str "a"
+ , Space
+ , Str "random"
+ , Space
+ , Str "cell"
+ ]
+ ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ ]
+ ]
+ ]
+ (TableFoot ( "" , [] , [] ) [])
+ , Header 2 ( "sheet-2" , [] , [] ) [ Str "Secondary" ]
+ , Table
+ ( "" , [] , [] )
+ (Caption Nothing [])
+ [ ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ ]
+ (TableHead
+ ( "" , [] , [] )
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain
+ [ Str "Sum"
+ , Space
+ , Str "of"
+ , Space
+ , Str "Age"
+ ]
+ ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Column" , Space , Str "Labels" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ ]
+ ])
+ [ TableBody
+ ( "" , [] , [] )
+ (RowHeadColumns 0)
+ []
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Row" , Space , Str "Labels" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Moscow" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Switzerland" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "(blank)" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Grand" , Space , Str "Total" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Anton" , Space , Str "Antich" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "23.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "23.0" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "James" , Space , Str "Bond" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "35.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "35.0" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "(blank)" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Grand" , Space , Str "Total" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "35.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "23.0" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "58.0" ] ]
+ ]
+ ]
+ ]
+ (TableFoot ( "" , [] , [] ) [])
+ ]
diff --git a/test/xlsx-reader/basic.xlsx b/test/xlsx-reader/basic.xlsx
new file mode 100644
index 000000000..55d62d56e
--- /dev/null
+++ b/test/xlsx-reader/basic.xlsx
Binary files differ