Add `xlsx` (Microsoft Excel) as an input format.

Each worksheet turns into a section containing a table. The common file `nativeDiff` has been extract from the Docx and Pptx text files and put in Tests.Helpers.
author: Anton Antich <[email protected]> 2025-11-09 14:07:38 +0100
committer: John MacFarlane <[email protected]> 2025-11-24 23:10:33 +0100
commit: 59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch)
tree: 1ddde4712ae7f8a879bd1299ae408cc596b02a89
parent: ec75b693e5618c12ddac872d48e084436f1e1b48 (diff)
14 files changed, 969 insertions, 30 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index d394b2f6d..c3b657f72 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -283,6 +283,7 @@ header when requesting a document from a URL:
     - `twiki` ([TWiki markup])
     - `typst` ([typst])
     - `vimwiki` ([Vimwiki])
+    - `xlsx` ([Excel spreadsheet][XLSX])
     - `xml` (XML version of native AST)
     - the path of a custom Lua reader, see [Custom readers and writers] below
     :::
@@ -519,6 +520,7 @@ header when requesting a document from a URL:
 [DokuWiki markup]: https://www.dokuwiki.org/dokuwiki
 [ZimWiki markup]: https://zim-wiki.org/manual/Help/Wiki_Syntax.html
 [XWiki markup]: https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/
+[XLSX]: https://en.wikipedia.org/wiki/Microsoft_Excel#File_formats
 [Vimdoc]: https://vimhelp.org/helphelp.txt.html#help-writing
 [TWiki markup]: https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules
 [TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax
diff --git a/pandoc.cabal b/pandoc.cabal
index f648dce11..8aa2ddd1d 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -440,6 +440,8 @@ extra-source-files:
                  test/odt/odt/*.odt
                  test/odt/markdown/*.md
                  test/odt/native/*.native
+                 test/xlsx-reader/*.xlsx
+                 test/xlsx-reader/*.native
                  test/pod-reader.pod
                  test/vimdoc/*.markdown
                  test/vimdoc/*.vimdoc
@@ -613,6 +615,7 @@ library
                    Text.Pandoc.Readers.Txt2Tags,
                    Text.Pandoc.Readers.Docx,
                    Text.Pandoc.Readers.Pptx,
+                   Text.Pandoc.Readers.Xlsx,
                    Text.Pandoc.Readers.ODT,
                    Text.Pandoc.Readers.EPUB,
                    Text.Pandoc.Readers.Muse,
@@ -726,6 +729,9 @@ library
                    Text.Pandoc.Readers.Pptx.Shapes,
                    Text.Pandoc.Readers.Pptx.Slides,
                    Text.Pandoc.Readers.Pptx.SmartArt,
+                   Text.Pandoc.Readers.Xlsx.Parse,
+                   Text.Pandoc.Readers.Xlsx.Cells,
+                   Text.Pandoc.Readers.Xlsx.Sheets,
                    Text.Pandoc.Readers.HTML.Parsing,
                    Text.Pandoc.Readers.HTML.Table,
                    Text.Pandoc.Readers.HTML.TagCategories,
@@ -863,6 +869,7 @@ test-suite test-pandoc
                   Tests.Readers.RTF
                   Tests.Readers.Docx
                   Tests.Readers.Pptx
+                  Tests.Readers.Xlsx
                   Tests.Readers.ODT
                   Tests.Readers.Txt2Tags
                   Tests.Readers.EPUB
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 5f7b891e2..beec98587 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -27,6 +27,7 @@ module Text.Pandoc.Readers
   , readers
   , readDocx
   , readPptx
+  , readXlsx
   , readODT
   , readMarkdown
   , readCommonMark
@@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
 import Text.Pandoc.Readers.Pptx
+import Text.Pandoc.Readers.Xlsx
 import Text.Pandoc.Readers.DokuWiki
 import Text.Pandoc.Readers.EPUB
 import Text.Pandoc.Readers.FB2
@@ -160,6 +162,7 @@ readers = [("native"       , TextReader readNative)
           ,("tikiwiki"     , TextReader readTikiWiki)
           ,("docx"         , ByteStringReader readDocx)
           ,("pptx"         , ByteStringReader readPptx)
+          ,("xlsx"         , ByteStringReader readXlsx)
           ,("odt"          , ByteStringReader readODT)
           ,("t2t"          , TextReader readTxt2Tags)
           ,("epub"         , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs
new file mode 100644
index 000000000..514dfd99e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Xlsx (readXlsx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
+import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)
+
+-- | Read XLSX file into Pandoc AST
+readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readXlsx opts bytes =
+  case toArchiveOrFail bytes of
+    Right archive ->
+      case archiveToXlsx archive of
+        Right xlsx -> do
+          let (meta, blocks) = xlsxToOutput opts xlsx
+          return $ Pandoc meta blocks
+        Left err ->
+          throwError $ PandocParseError $ "Failed to parse XLSX: " <> err
+
+    Left err ->
+      throwError $ PandocParseError $
+        "Failed to unpack XLSX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
new file mode 100644
index 000000000..ca0ad24f1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Cells
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Cell types and parsing for XLSX.
+-}
+module Text.Pandoc.Readers.Xlsx.Cells
+  ( CellRef(..)
+  , XlsxCell(..)
+  , CellValue(..)
+  , parseCellRef
+  ) where
+
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Char (ord, isAlpha)
+import Text.Read (readMaybe)
+
+-- | Cell reference (A1 notation)
+data CellRef = CellRef
+  { cellRefCol :: Int    -- 1-based (A=1, B=2, ..., AA=27)
+  , cellRefRow :: Int    -- 1-based
+  } deriving (Show, Eq, Ord)
+
+-- | Cell value types
+data CellValue
+  = TextValue Text
+  | NumberValue Double
+  | EmptyValue
+  deriving (Show, Eq)
+
+-- | Parsed cell
+data XlsxCell = XlsxCell
+  { cellRef :: CellRef
+  , cellValue :: CellValue
+  , cellBold :: Bool
+  , cellItalic :: Bool
+  } deriving (Show)
+
+-- | Parse cell reference (A1 → CellRef)
+parseCellRef :: Text -> Either Text CellRef
+parseCellRef ref = do
+  let (colStr, rowStr) = T.span isAlpha ref
+
+  row <- case readMaybe (T.unpack rowStr) of
+    Just r | r > 0 -> Right r
+    _ -> Left $ "Invalid row: " <> rowStr
+
+  col <- parseColumn colStr
+
+  return $ CellRef col row
+
+-- | Parse column (A=1, Z=26, AA=27, etc.)
+parseColumn :: Text -> Either Text Int
+parseColumn colStr
+  | T.null colStr = Left "Empty column"
+  | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr
diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
new file mode 100644
index 000000000..1ee350dd1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
@@ -0,0 +1,296 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Parse
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Parsing of XLSX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Xlsx.Parse
+  ( Xlsx(..)
+  , XlsxWorkbook(..)
+  , XlsxSheet(..)
+  , SheetId(..)
+  , SharedStrings
+  , Styles(..)
+  , FontInfo(..)
+  , archiveToXlsx
+  ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import Data.List (find)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import qualified Data.Vector as V
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Xlsx.Cells
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Sheet identifier
+newtype SheetId = SheetId Int deriving (Show, Eq, Ord)
+
+-- | Shared strings table (Vector for O(1) lookup)
+type SharedStrings = V.Vector Text
+
+-- | Font information
+data FontInfo = FontInfo
+  { fontBold :: Bool
+  , fontItalic :: Bool
+  , fontUnderline :: Bool
+  } deriving (Show)
+
+-- | Style information
+data Styles = Styles
+  { styleFonts :: V.Vector FontInfo
+  } deriving (Show)
+
+-- | Complete XLSX document
+data Xlsx = Xlsx
+  { xlsxWorkbook :: XlsxWorkbook
+  , xlsxSheets :: [XlsxSheet]
+  , xlsxSharedStrings :: SharedStrings
+  , xlsxStyles :: Styles
+  } deriving (Show)
+
+-- | Workbook information
+data XlsxWorkbook = XlsxWorkbook
+  { workbookSheetNames :: [(SheetId, Text, Text)]  -- (id, name, relId)
+  } deriving (Show)
+
+-- | Individual worksheet
+data XlsxSheet = XlsxSheet
+  { sheetId :: SheetId
+  , sheetName :: Text
+  , sheetCells :: M.Map CellRef XlsxCell
+  } deriving (Show)
+
+-- | Parse XLSX archive
+archiveToXlsx :: Archive -> Either Text Xlsx
+archiveToXlsx archive = do
+  -- Find and parse workbook.xml
+  workbookPath <- getWorkbookXmlPath archive
+  workbookElem <- loadXMLFromArchive archive workbookPath
+  workbook <- parseWorkbook workbookElem
+    `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath)
+
+  -- Load workbook relationships
+  workbookRels <- loadRelationships archive (relsPathFor workbookPath)
+
+  -- Parse shared strings (look for sharedStrings relationship)
+  sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of
+    Just (_, target) -> do
+      let path = "xl/" ++ T.unpack target
+      el <- loadXMLFromArchive archive path
+      parseSharedStrings el
+    Nothing -> Right V.empty
+
+  -- Parse styles
+  styles <- case findRelWithTarget workbookRels "styles" of
+    Just (_, target) -> do
+      let path = "xl/" ++ T.unpack target
+      el <- loadXMLFromArchive archive path
+      parseStyles el
+    Nothing -> Right $ Styles V.empty
+
+  -- Parse worksheets
+  sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo)
+                 (workbookSheetNames workbook)
+
+  return $ Xlsx workbook sheets sharedStrings styles
+
+-- | Find workbook.xml via root relationships
+getWorkbookXmlPath :: Archive -> Either Text FilePath
+getWorkbookXmlPath archive = do
+  relsEntry <- maybeToEither "Missing _rels/.rels" $
+               findEntryByPath "_rels/.rels" archive
+  relsElem <- parseXMLFromEntry relsEntry
+
+  let relElems = onlyElems $ elContent relsElem
+  case find isOfficeDocRel relElems of
+    Nothing -> Left "No workbook.xml relationship found"
+    Just rel -> do
+      target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel
+      return $ T.unpack target
+  where
+    isOfficeDocRel el =
+      case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of
+        (Just relType, Just target) ->
+          "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target
+        _ -> False
+
+-- | Parse workbook.xml
+parseWorkbook :: Element -> Either Text XlsxWorkbook
+parseWorkbook wbElem = do
+  let ns = elemToNameSpaces wbElem
+
+  -- Find sheets element (match by local name only)
+  sheets <- maybeToEither "Missing <sheets>" $
+            find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem)
+
+  let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets)
+  sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems)
+
+  return $ XlsxWorkbook sheetRefs
+
+parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text)
+parseSheetRef ns (idx, sheetElem) = do
+  let name = fromMaybe ("Sheet" <> T.pack (show idx)) $
+             findAttr (unqual "name") sheetElem
+  relId <- maybeToEither "Missing r:id" $
+           findAttrByName ns "r" "id" sheetElem
+  return (SheetId idx, name, relId)
+
+-- | Parse shared strings
+parseSharedStrings :: Element -> Either Text SharedStrings
+parseSharedStrings sstElem = do
+  let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem)
+      strings = map extractString siElems
+  return $ V.fromList strings
+  where
+    extractString siElem =
+      case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of
+        Just tElem -> strContent tElem
+        Nothing -> getAllText siElem
+
+-- | Parse styles (fonts only for MVP)
+parseStyles :: Element -> Either Text Styles
+parseStyles stylesElem = do
+  -- Parse fonts (match by local name)
+  let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem)
+      fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem
+      fonts = V.fromList $ map (parseFont mempty) fontElems
+
+  return $ Styles fonts
+
+parseFont :: NameSpaces -> Element -> FontInfo
+parseFont _ns fontElem =
+  FontInfo
+    { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem)
+    , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem)
+    , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem)
+    }
+
+-- | Parse individual worksheet
+parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet
+parseSheet archive rels sharedStrings styles (sid, name, relId) = do
+  target <- maybeToEither ("Sheet relationship not found: " <> relId) $
+            lookup relId rels
+
+  let sheetPath = "xl/" ++ T.unpack target
+  sheetElem <- loadXMLFromArchive archive sheetPath
+
+  cells <- parseSheetCells sheetElem sharedStrings styles
+
+  return $ XlsxSheet sid name cells
+
+-- | Parse sheet cells
+parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell)
+parseSheetCells sheetElem sharedStrings styles = do
+  -- Find sheetData by local name
+  case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of
+    Nothing -> return M.empty
+    Just sheetData -> do
+      let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData)
+          cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems
+          cells = mapMaybe (parseCell sharedStrings styles) cellElems
+      return $ M.fromList [(cellRef c, c) | c <- cells]
+
+-- | Parse individual cell
+parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell
+parseCell sharedStrings styles cElem = do
+  -- Get cell reference
+  refText <- findAttr (unqual "r") cElem
+  cellRefParsed <- either (const Nothing) Just $ parseCellRef refText
+
+  -- Get cell type (default to number if missing)
+  let cellType = fromMaybe "" $ findAttr (unqual "t") cElem
+      styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack
+
+  -- Get value (match by local name)
+  let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem)
+      vText = maybe "" strContent vElem
+
+  -- Parse value based on type
+  let value = if cellType == "s"
+              then
+                -- Shared string
+                case readMaybe (T.unpack vText) of
+                  Just idx | idx >= 0 && idx < V.length sharedStrings ->
+                    TextValue (sharedStrings V.! idx)
+                  _ -> EmptyValue
+              else if T.null vText
+                then EmptyValue
+                else
+                  -- Number
+                  case readMaybe (T.unpack vText) of
+                    Just n -> NumberValue n
+                    Nothing -> TextValue vText
+
+  -- Get formatting from style
+  let (bold, italic) = case styleIdx of
+        Just idx | idx >= 0 && idx < V.length (styleFonts styles) ->
+          let font = styleFonts styles V.! idx
+           in (fontBold font, fontItalic font)
+        _ -> (False, False)
+
+  return $ XlsxCell cellRefParsed value bold italic
+
+-- Helper functions
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+  entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+           findEntryByPath path archive
+  parseXMLFromEntry entry
+
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry entry =
+  let lazyText = TL.decodeUtf8 $ fromEntry entry
+   in parseXMLElement lazyText
+
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+  case findEntryByPath relsPath archive of
+    Nothing -> Right []
+    Just entry -> do
+      relsElem <- parseXMLFromEntry entry
+      let relElems = onlyElems $ elContent relsElem
+      return $ mapMaybe extractRel relElems
+  where
+    extractRel el = do
+      relId <- findAttr (unqual "Id") el
+      target <- findAttr (unqual "Target") el
+      return (relId, target)
+
+relsPathFor :: FilePath -> FilePath
+relsPathFor path =
+  let (dir, file) = splitFileName path
+   in dir ++ "/_rels/" ++ file ++ ".rels"
+
+findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text)
+findRelWithTarget rels targetName =
+  find (\(_, target) -> targetName `T.isInfixOf` target) rels
+
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
+
+getAllText :: Element -> Text
+getAllText el =
+  let textFromContent (Text cdata) = cdData cdata
+      textFromContent (Elem e) = getAllText e
+      textFromContent _ = ""
+      texts = map textFromContent (elContent el)
+   in T.unwords $ filter (not . T.null) texts
+
+addContext :: Either Text a -> Text -> Either Text a
+addContext (Right x) _ = Right x
+addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")")
diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
new file mode 100644
index 000000000..e75484790
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Sheets
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of XLSX sheets to Pandoc AST.
+-}
+module Text.Pandoc.Readers.Xlsx.Sheets
+  ( xlsxToOutput
+  ) where
+
+import qualified Data.Map.Strict as M
+import qualified Data.Text as T
+import Data.List (sort, dropWhileEnd)
+import Data.Char (isSpace)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse
+import Text.Pandoc.Readers.Xlsx.Cells
+import qualified Text.Pandoc.Builder as B
+
+-- | Convert XLSX to Pandoc output
+xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block])
+xlsxToOutput _opts xlsx =
+  let sheets = xlsxSheets xlsx
+      sheetBlocks = concatMap sheetToBlocks sheets
+   in (mempty, sheetBlocks)
+
+-- | Convert sheet to blocks (header + table)
+sheetToBlocks :: XlsxSheet -> [Block]
+sheetToBlocks sheet =
+  let SheetId n = sheetId sheet
+      name = sheetName sheet
+      sheetIdent = "sheet-" <> T.pack (show n)
+      header = Header 2 (sheetIdent, [], []) (B.toList (B.text name))
+
+      -- Convert cells to table
+      tableBlock = case cellsToTable sheet of
+        Just tbl -> [tbl]
+        Nothing -> []  -- Empty sheet
+   in header : tableBlock
+
+-- | Convert cells to Pandoc Table
+cellsToTable :: XlsxSheet -> Maybe Block
+cellsToTable sheet
+  | M.null (sheetCells sheet) = Nothing
+  | otherwise =
+      let cells = sheetCells sheet
+          -- Get bounds
+          refs = sort $ M.keys cells
+          minCol = minimum $ map cellRefCol refs
+          maxCol = maximum $ map cellRefCol refs
+          minRow = minimum $ map cellRefRow refs
+          maxRow = maximum $ map cellRefRow refs
+
+          -- Build dense grid
+          grid = [ [ M.lookup (CellRef col row) cells
+                   | col <- [minCol..maxCol]
+                   ]
+                 | row <- [minRow..maxRow]
+                 ]
+
+          -- First row is header (simple heuristic)
+          (headerRow, bodyRows) = case grid of
+            (h:bs) -> (h, bs)
+            [] -> ([], [])
+
+          -- Filter out trailing empty rows (rows with only whitespace)
+          filteredBodyRows = dropWhileEnd isEmptyRow bodyRows
+
+          makeCell mcell = case mcell of
+            Just cell ->
+              let inlines = cellToInlines cell
+               in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines]
+            Nothing ->
+              Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []]
+
+          numCols = length headerRow
+          colSpec = replicate numCols (AlignDefault, ColWidthDefault)
+          thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow]
+          tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows]
+          tfoot = TableFoot nullAttr []
+
+       in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot
+
+-- | Check if a row contains only whitespace or empty cells
+isEmptyRow :: [Maybe XlsxCell] -> Bool
+isEmptyRow = all isEmptyCell
+  where
+    isEmptyCell Nothing = True
+    isEmptyCell (Just cell) = case cellValue cell of
+      EmptyValue -> True
+      TextValue t -> T.all isSpace t
+      NumberValue _ -> False
+
+-- | Convert cell to Pandoc inlines
+cellToInlines :: XlsxCell -> [Inline]
+cellToInlines cell =
+  let base = case cellValue cell of
+        TextValue t -> B.toList $ B.text t
+        NumberValue n -> [Str $ T.pack $ show n]
+        EmptyValue -> []
+
+      applyBold inls = if cellBold cell then [Strong inls] else inls
+      applyItalic inls = if cellItalic cell then [Emph inls] else inls
+
+   in applyItalic $ applyBold base
diff --git a/test/Tests/Helpers.hs b/test/Tests/Helpers.hs
index 3e930b14a..081611ed9 100644
--- a/test/Tests/Helpers.hs
+++ b/test/Tests/Helpers.hs
@@ -16,6 +16,7 @@ module Tests.Helpers ( test
                      , TestResult(..)
                      , setupEnvironment
                      , showDiff
+                     , nativeDiff
                      , testGolden
                      , (=?>)
                      , purely
@@ -132,6 +133,19 @@ vividize (Both s _) = "  " ++ s
 vividize (First s)  = "- " ++ s
 vividize (Second s) = "+ " ++ s
 
+nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
+nativeDiff normPath expectedNative actualNative
+  | expectedNative == actualNative = return Nothing
+  | otherwise = Just <$> do
+      expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
+      actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
+      let dash = replicate 72 '-'
+      let diff = getDiff (lines actual) (lines expected)
+      return $ '\n' : dash ++
+               "\n--- " ++ normPath ++
+               "\n+++ " ++ "test" ++ "\n" ++
+               showDiff (1,1) diff ++ dash
+
 purely :: (b -> PandocPure a) -> b -> a
 purely f = either (error . show) id . runPure . f
 
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index 0bd70d0e2..76af649b4 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -34,20 +34,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "docx" }
 testCompare :: String -> FilePath -> FilePath -> TestTree
 testCompare = testCompareWithOpts defopts
 
-
-nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
-nativeDiff normPath expectedNative actualNative
-  | expectedNative == actualNative = return Nothing
-  | otherwise = Just <$> do
-      expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
-      actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
-      let dash = replicate 72 '-'
-      let diff = getDiff (lines actual) (lines expected)
-      return $ '\n' : dash ++
-               "\n--- " ++ normPath ++
-               "\n+++ " ++ "test" ++ "\n" ++
-               showDiff (1,1) diff ++ dash
-
 testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
 testCompareWithOpts opts testName docxFP nativeFP =
   goldenTest
diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs
index 613d5b50f..3358e4111 100644
--- a/test/Tests/Readers/Pptx.hs
+++ b/test/Tests/Readers/Pptx.hs
@@ -12,10 +12,8 @@ Tests for the PPTX reader.
 -}
 module Tests.Readers.Pptx (tests) where
 
-import Data.Algorithm.Diff (getDiff)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Lazy as B
-import qualified Data.Text as T
 import Test.Tasty
 import Test.Tasty.Golden.Advanced
 import Tests.Helpers
@@ -28,19 +26,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "pptx" }
 testCompare :: String -> FilePath -> FilePath -> TestTree
 testCompare = testCompareWithOpts defopts
 
-nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
-nativeDiff normPath expectedNative actualNative
-  | expectedNative == actualNative = return Nothing
-  | otherwise = Just <$> do
-      expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
-      actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
-      let dash = replicate 72 '-'
-      let diff = getDiff (lines actual) (lines expected)
-      return $ '\n' : dash ++
-               "\n--- " ++ normPath ++
-               "\n+++ " ++ "test" ++ "\n" ++
-               showDiff (1,1) diff ++ dash
-
 testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
 testCompareWithOpts opts testName pptxFP nativeFP =
   goldenTest
diff --git a/test/Tests/Readers/Xlsx.hs b/test/Tests/Readers/Xlsx.hs
new file mode 100644
index 000000000..189cd0c16
--- /dev/null
+++ b/test/Tests/Readers/Xlsx.hs
@@ -0,0 +1,48 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Tests.Readers.Xlsx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Tests for the XLSX reader.
+-}
+module Tests.Readers.Xlsx (tests) where
+
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Lazy as B
+import Test.Tasty
+import Test.Tasty.Golden.Advanced
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.UTF8 as UTF8
+
+defopts :: ReaderOptions
+defopts = def{ readerExtensions = getDefaultExtensions "xlsx" }
+
+testCompare :: String -> FilePath -> FilePath -> TestTree
+testCompare = testCompareWithOpts defopts
+
+testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
+testCompareWithOpts opts testName xlsxFP nativeFP =
+  goldenTest
+  testName
+  (do nf <- UTF8.toText <$> BS.readFile nativeFP
+      runIOorExplode (readNative def nf))
+  (do df <- B.readFile xlsxFP
+      runIOorExplode (readXlsx opts df))
+  (nativeDiff nativeFP)
+  (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a)
+            >>= BS.writeFile nativeFP . UTF8.fromText)
+
+tests :: [TestTree]
+tests = [ testGroup "basic"
+          [ testCompare
+            "sheet extraction"
+            "xlsx-reader/basic.xlsx"
+            "xlsx-reader/basic.native"
+          ]
+        ]
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index 0d04b361f..9ae97d9c0 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -13,6 +13,7 @@ import qualified Tests.Old
 import qualified Tests.Readers.Creole
 import qualified Tests.Readers.Docx
 import qualified Tests.Readers.Pptx
+import qualified Tests.Readers.Xlsx
 import qualified Tests.Readers.DokuWiki
 import qualified Tests.Readers.EPUB
 import qualified Tests.Readers.FB2
@@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests"
           , testGroup "RTF" Tests.Readers.RTF.tests
           , testGroup "Docx" Tests.Readers.Docx.tests
           , testGroup "Pptx" Tests.Readers.Pptx.tests
+          , testGroup "Xlsx" Tests.Readers.Xlsx.tests
           , testGroup "ODT" Tests.Readers.ODT.tests
           , testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests
           , testGroup "EPUB" Tests.Readers.EPUB.tests
@@ -126,4 +128,4 @@ main = do
     _ -> inDirectory "test" $ do
            fp <- getExecutablePath
            -- putStrLn $ "Using pandoc executable at " ++ fp
-           defaultMain $ tests fp
-\ No newline at end of file
+           defaultMain $ tests fp
diff --git a/test/xlsx-reader/basic.native b/test/xlsx-reader/basic.native
new file mode 100644
index 000000000..f69f78a41
--- /dev/null
+++ b/test/xlsx-reader/basic.native
@@ -0,0 +1,381 @@
+Pandoc
+  Meta { unMeta = fromList [] }
+  [ Header 2 ( "sheet-1" , [] , [] ) [ Str "Main" ]
+  , Table
+      ( "" , [] , [] )
+      (Caption Nothing [])
+      [ ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      ]
+      (TableHead
+         ( "" , [] , [] )
+         [ Row
+             ( "" , [] , [] )
+             [ Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [ Strong [ Str "Person" ] ] ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [ Strong [ Str "Age" ] ] ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [ Strong [ Str "Location" ] ] ]
+             ]
+         ])
+      [ TableBody
+          ( "" , [] , [] )
+          (RowHeadColumns 0)
+          []
+          [ Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Anton" , Space , Str "Antich" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "23.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Switzerland" ] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "James" , Space , Str "Bond" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "35.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Moscow" ] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain
+                      [ Str "Just"
+                      , Space
+                      , Str "a"
+                      , Space
+                      , Str "random"
+                      , Space
+                      , Str "cell"
+                      ]
+                  ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              ]
+          ]
+      ]
+      (TableFoot ( "" , [] , [] ) [])
+  , Header 2 ( "sheet-2" , [] , [] ) [ Str "Secondary" ]
+  , Table
+      ( "" , [] , [] )
+      (Caption Nothing [])
+      [ ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      , ( AlignDefault , ColWidthDefault )
+      ]
+      (TableHead
+         ( "" , [] , [] )
+         [ Row
+             ( "" , [] , [] )
+             [ Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain
+                     [ Str "Sum"
+                     , Space
+                     , Str "of"
+                     , Space
+                     , Str "Age"
+                     ]
+                 ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [ Str "Column" , Space , Str "Labels" ] ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [] ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [] ]
+             , Cell
+                 ( "" , [] , [] )
+                 AlignDefault
+                 (RowSpan 1)
+                 (ColSpan 1)
+                 [ Plain [] ]
+             ]
+         ])
+      [ TableBody
+          ( "" , [] , [] )
+          (RowHeadColumns 0)
+          []
+          [ Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Row" , Space , Str "Labels" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Moscow" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Switzerland" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "(blank)" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Grand" , Space , Str "Total" ] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Anton" , Space , Str "Antich" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "23.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "23.0" ] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "James" , Space , Str "Bond" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "35.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "35.0" ] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "(blank)" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              ]
+          , Row
+              ( "" , [] , [] )
+              [ Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "Grand" , Space , Str "Total" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "35.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "23.0" ] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [] ]
+              , Cell
+                  ( "" , [] , [] )
+                  AlignDefault
+                  (RowSpan 1)
+                  (ColSpan 1)
+                  [ Plain [ Str "58.0" ] ]
+              ]
+          ]
+      ]
+      (TableFoot ( "" , [] , [] ) [])
+  ]
diff --git a/test/xlsx-reader/basic.xlsx b/test/xlsx-reader/basic.xlsx
new file mode 100644
index 000000000..55d62d56e
--- /dev/null
+++ b/test/xlsx-reader/basic.xlsx
author	Anton Antich <[email protected]>	2025-11-09 14:07:38 +0100
committer	John MacFarlane <[email protected]>	2025-11-24 23:10:33 +0100
commit	59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch)
tree	1ddde4712ae7f8a879bd1299ae408cc596b02a89
parent	ec75b693e5618c12ddac872d48e084436f1e1b48 (diff)