Add `xlsx` (Microsoft Excel) as an input format.

Each worksheet turns into a section containing a table. The common file `nativeDiff` has been extract from the Docx and Pptx text files and put in Tests.Helpers.
author: Anton Antich <[email protected]> 2025-11-09 14:07:38 +0100
committer: John MacFarlane <[email protected]> 2025-11-24 23:10:33 +0100
commit: 59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch)
tree: 1ddde4712ae7f8a879bd1299ae408cc596b02a89 /src
parent: ec75b693e5618c12ddac872d48e084436f1e1b48 (diff)
5 files changed, 514 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 5f7b891e2..beec98587 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -27,6 +27,7 @@ module Text.Pandoc.Readers
   , readers
   , readDocx
   , readPptx
+  , readXlsx
   , readODT
   , readMarkdown
   , readCommonMark
@@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
 import Text.Pandoc.Readers.Pptx
+import Text.Pandoc.Readers.Xlsx
 import Text.Pandoc.Readers.DokuWiki
 import Text.Pandoc.Readers.EPUB
 import Text.Pandoc.Readers.FB2
@@ -160,6 +162,7 @@ readers = [("native"       , TextReader readNative)
           ,("tikiwiki"     , TextReader readTikiWiki)
           ,("docx"         , ByteStringReader readDocx)
           ,("pptx"         , ByteStringReader readPptx)
+          ,("xlsx"         , ByteStringReader readXlsx)
           ,("odt"          , ByteStringReader readODT)
           ,("t2t"          , TextReader readTxt2Tags)
           ,("epub"         , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs
new file mode 100644
index 000000000..514dfd99e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Xlsx (readXlsx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
+import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)
+
+-- | Read XLSX file into Pandoc AST
+readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readXlsx opts bytes =
+  case toArchiveOrFail bytes of
+    Right archive ->
+      case archiveToXlsx archive of
+        Right xlsx -> do
+          let (meta, blocks) = xlsxToOutput opts xlsx
+          return $ Pandoc meta blocks
+        Left err ->
+          throwError $ PandocParseError $ "Failed to parse XLSX: " <> err
+
+    Left err ->
+      throwError $ PandocParseError $
+        "Failed to unpack XLSX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
new file mode 100644
index 000000000..ca0ad24f1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Cells
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Cell types and parsing for XLSX.
+-}
+module Text.Pandoc.Readers.Xlsx.Cells
+  ( CellRef(..)
+  , XlsxCell(..)
+  , CellValue(..)
+  , parseCellRef
+  ) where
+
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Char (ord, isAlpha)
+import Text.Read (readMaybe)
+
+-- | Cell reference (A1 notation)
+data CellRef = CellRef
+  { cellRefCol :: Int    -- 1-based (A=1, B=2, ..., AA=27)
+  , cellRefRow :: Int    -- 1-based
+  } deriving (Show, Eq, Ord)
+
+-- | Cell value types
+data CellValue
+  = TextValue Text
+  | NumberValue Double
+  | EmptyValue
+  deriving (Show, Eq)
+
+-- | Parsed cell
+data XlsxCell = XlsxCell
+  { cellRef :: CellRef
+  , cellValue :: CellValue
+  , cellBold :: Bool
+  , cellItalic :: Bool
+  } deriving (Show)
+
+-- | Parse cell reference (A1 → CellRef)
+parseCellRef :: Text -> Either Text CellRef
+parseCellRef ref = do
+  let (colStr, rowStr) = T.span isAlpha ref
+
+  row <- case readMaybe (T.unpack rowStr) of
+    Just r | r > 0 -> Right r
+    _ -> Left $ "Invalid row: " <> rowStr
+
+  col <- parseColumn colStr
+
+  return $ CellRef col row
+
+-- | Parse column (A=1, Z=26, AA=27, etc.)
+parseColumn :: Text -> Either Text Int
+parseColumn colStr
+  | T.null colStr = Left "Empty column"
+  | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr
diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
new file mode 100644
index 000000000..1ee350dd1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
@@ -0,0 +1,296 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Parse
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Parsing of XLSX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Xlsx.Parse
+  ( Xlsx(..)
+  , XlsxWorkbook(..)
+  , XlsxSheet(..)
+  , SheetId(..)
+  , SharedStrings
+  , Styles(..)
+  , FontInfo(..)
+  , archiveToXlsx
+  ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import Data.List (find)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import qualified Data.Vector as V
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Xlsx.Cells
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Sheet identifier
+newtype SheetId = SheetId Int deriving (Show, Eq, Ord)
+
+-- | Shared strings table (Vector for O(1) lookup)
+type SharedStrings = V.Vector Text
+
+-- | Font information
+data FontInfo = FontInfo
+  { fontBold :: Bool
+  , fontItalic :: Bool
+  , fontUnderline :: Bool
+  } deriving (Show)
+
+-- | Style information
+data Styles = Styles
+  { styleFonts :: V.Vector FontInfo
+  } deriving (Show)
+
+-- | Complete XLSX document
+data Xlsx = Xlsx
+  { xlsxWorkbook :: XlsxWorkbook
+  , xlsxSheets :: [XlsxSheet]
+  , xlsxSharedStrings :: SharedStrings
+  , xlsxStyles :: Styles
+  } deriving (Show)
+
+-- | Workbook information
+data XlsxWorkbook = XlsxWorkbook
+  { workbookSheetNames :: [(SheetId, Text, Text)]  -- (id, name, relId)
+  } deriving (Show)
+
+-- | Individual worksheet
+data XlsxSheet = XlsxSheet
+  { sheetId :: SheetId
+  , sheetName :: Text
+  , sheetCells :: M.Map CellRef XlsxCell
+  } deriving (Show)
+
+-- | Parse XLSX archive
+archiveToXlsx :: Archive -> Either Text Xlsx
+archiveToXlsx archive = do
+  -- Find and parse workbook.xml
+  workbookPath <- getWorkbookXmlPath archive
+  workbookElem <- loadXMLFromArchive archive workbookPath
+  workbook <- parseWorkbook workbookElem
+    `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath)
+
+  -- Load workbook relationships
+  workbookRels <- loadRelationships archive (relsPathFor workbookPath)
+
+  -- Parse shared strings (look for sharedStrings relationship)
+  sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of
+    Just (_, target) -> do
+      let path = "xl/" ++ T.unpack target
+      el <- loadXMLFromArchive archive path
+      parseSharedStrings el
+    Nothing -> Right V.empty
+
+  -- Parse styles
+  styles <- case findRelWithTarget workbookRels "styles" of
+    Just (_, target) -> do
+      let path = "xl/" ++ T.unpack target
+      el <- loadXMLFromArchive archive path
+      parseStyles el
+    Nothing -> Right $ Styles V.empty
+
+  -- Parse worksheets
+  sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo)
+                 (workbookSheetNames workbook)
+
+  return $ Xlsx workbook sheets sharedStrings styles
+
+-- | Find workbook.xml via root relationships
+getWorkbookXmlPath :: Archive -> Either Text FilePath
+getWorkbookXmlPath archive = do
+  relsEntry <- maybeToEither "Missing _rels/.rels" $
+               findEntryByPath "_rels/.rels" archive
+  relsElem <- parseXMLFromEntry relsEntry
+
+  let relElems = onlyElems $ elContent relsElem
+  case find isOfficeDocRel relElems of
+    Nothing -> Left "No workbook.xml relationship found"
+    Just rel -> do
+      target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel
+      return $ T.unpack target
+  where
+    isOfficeDocRel el =
+      case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of
+        (Just relType, Just target) ->
+          "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target
+        _ -> False
+
+-- | Parse workbook.xml
+parseWorkbook :: Element -> Either Text XlsxWorkbook
+parseWorkbook wbElem = do
+  let ns = elemToNameSpaces wbElem
+
+  -- Find sheets element (match by local name only)
+  sheets <- maybeToEither "Missing <sheets>" $
+            find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem)
+
+  let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets)
+  sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems)
+
+  return $ XlsxWorkbook sheetRefs
+
+parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text)
+parseSheetRef ns (idx, sheetElem) = do
+  let name = fromMaybe ("Sheet" <> T.pack (show idx)) $
+             findAttr (unqual "name") sheetElem
+  relId <- maybeToEither "Missing r:id" $
+           findAttrByName ns "r" "id" sheetElem
+  return (SheetId idx, name, relId)
+
+-- | Parse shared strings
+parseSharedStrings :: Element -> Either Text SharedStrings
+parseSharedStrings sstElem = do
+  let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem)
+      strings = map extractString siElems
+  return $ V.fromList strings
+  where
+    extractString siElem =
+      case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of
+        Just tElem -> strContent tElem
+        Nothing -> getAllText siElem
+
+-- | Parse styles (fonts only for MVP)
+parseStyles :: Element -> Either Text Styles
+parseStyles stylesElem = do
+  -- Parse fonts (match by local name)
+  let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem)
+      fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem
+      fonts = V.fromList $ map (parseFont mempty) fontElems
+
+  return $ Styles fonts
+
+parseFont :: NameSpaces -> Element -> FontInfo
+parseFont _ns fontElem =
+  FontInfo
+    { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem)
+    , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem)
+    , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem)
+    }
+
+-- | Parse individual worksheet
+parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet
+parseSheet archive rels sharedStrings styles (sid, name, relId) = do
+  target <- maybeToEither ("Sheet relationship not found: " <> relId) $
+            lookup relId rels
+
+  let sheetPath = "xl/" ++ T.unpack target
+  sheetElem <- loadXMLFromArchive archive sheetPath
+
+  cells <- parseSheetCells sheetElem sharedStrings styles
+
+  return $ XlsxSheet sid name cells
+
+-- | Parse sheet cells
+parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell)
+parseSheetCells sheetElem sharedStrings styles = do
+  -- Find sheetData by local name
+  case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of
+    Nothing -> return M.empty
+    Just sheetData -> do
+      let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData)
+          cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems
+          cells = mapMaybe (parseCell sharedStrings styles) cellElems
+      return $ M.fromList [(cellRef c, c) | c <- cells]
+
+-- | Parse individual cell
+parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell
+parseCell sharedStrings styles cElem = do
+  -- Get cell reference
+  refText <- findAttr (unqual "r") cElem
+  cellRefParsed <- either (const Nothing) Just $ parseCellRef refText
+
+  -- Get cell type (default to number if missing)
+  let cellType = fromMaybe "" $ findAttr (unqual "t") cElem
+      styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack
+
+  -- Get value (match by local name)
+  let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem)
+      vText = maybe "" strContent vElem
+
+  -- Parse value based on type
+  let value = if cellType == "s"
+              then
+                -- Shared string
+                case readMaybe (T.unpack vText) of
+                  Just idx | idx >= 0 && idx < V.length sharedStrings ->
+                    TextValue (sharedStrings V.! idx)
+                  _ -> EmptyValue
+              else if T.null vText
+                then EmptyValue
+                else
+                  -- Number
+                  case readMaybe (T.unpack vText) of
+                    Just n -> NumberValue n
+                    Nothing -> TextValue vText
+
+  -- Get formatting from style
+  let (bold, italic) = case styleIdx of
+        Just idx | idx >= 0 && idx < V.length (styleFonts styles) ->
+          let font = styleFonts styles V.! idx
+           in (fontBold font, fontItalic font)
+        _ -> (False, False)
+
+  return $ XlsxCell cellRefParsed value bold italic
+
+-- Helper functions
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+  entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+           findEntryByPath path archive
+  parseXMLFromEntry entry
+
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry entry =
+  let lazyText = TL.decodeUtf8 $ fromEntry entry
+   in parseXMLElement lazyText
+
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+  case findEntryByPath relsPath archive of
+    Nothing -> Right []
+    Just entry -> do
+      relsElem <- parseXMLFromEntry entry
+      let relElems = onlyElems $ elContent relsElem
+      return $ mapMaybe extractRel relElems
+  where
+    extractRel el = do
+      relId <- findAttr (unqual "Id") el
+      target <- findAttr (unqual "Target") el
+      return (relId, target)
+
+relsPathFor :: FilePath -> FilePath
+relsPathFor path =
+  let (dir, file) = splitFileName path
+   in dir ++ "/_rels/" ++ file ++ ".rels"
+
+findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text)
+findRelWithTarget rels targetName =
+  find (\(_, target) -> targetName `T.isInfixOf` target) rels
+
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
+
+getAllText :: Element -> Text
+getAllText el =
+  let textFromContent (Text cdata) = cdData cdata
+      textFromContent (Elem e) = getAllText e
+      textFromContent _ = ""
+      texts = map textFromContent (elContent el)
+   in T.unwords $ filter (not . T.null) texts
+
+addContext :: Either Text a -> Text -> Either Text a
+addContext (Right x) _ = Right x
+addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")")
diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
new file mode 100644
index 000000000..e75484790
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Sheets
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of XLSX sheets to Pandoc AST.
+-}
+module Text.Pandoc.Readers.Xlsx.Sheets
+  ( xlsxToOutput
+  ) where
+
+import qualified Data.Map.Strict as M
+import qualified Data.Text as T
+import Data.List (sort, dropWhileEnd)
+import Data.Char (isSpace)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse
+import Text.Pandoc.Readers.Xlsx.Cells
+import qualified Text.Pandoc.Builder as B
+
+-- | Convert XLSX to Pandoc output
+xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block])
+xlsxToOutput _opts xlsx =
+  let sheets = xlsxSheets xlsx
+      sheetBlocks = concatMap sheetToBlocks sheets
+   in (mempty, sheetBlocks)
+
+-- | Convert sheet to blocks (header + table)
+sheetToBlocks :: XlsxSheet -> [Block]
+sheetToBlocks sheet =
+  let SheetId n = sheetId sheet
+      name = sheetName sheet
+      sheetIdent = "sheet-" <> T.pack (show n)
+      header = Header 2 (sheetIdent, [], []) (B.toList (B.text name))
+
+      -- Convert cells to table
+      tableBlock = case cellsToTable sheet of
+        Just tbl -> [tbl]
+        Nothing -> []  -- Empty sheet
+   in header : tableBlock
+
+-- | Convert cells to Pandoc Table
+cellsToTable :: XlsxSheet -> Maybe Block
+cellsToTable sheet
+  | M.null (sheetCells sheet) = Nothing
+  | otherwise =
+      let cells = sheetCells sheet
+          -- Get bounds
+          refs = sort $ M.keys cells
+          minCol = minimum $ map cellRefCol refs
+          maxCol = maximum $ map cellRefCol refs
+          minRow = minimum $ map cellRefRow refs
+          maxRow = maximum $ map cellRefRow refs
+
+          -- Build dense grid
+          grid = [ [ M.lookup (CellRef col row) cells
+                   | col <- [minCol..maxCol]
+                   ]
+                 | row <- [minRow..maxRow]
+                 ]
+
+          -- First row is header (simple heuristic)
+          (headerRow, bodyRows) = case grid of
+            (h:bs) -> (h, bs)
+            [] -> ([], [])
+
+          -- Filter out trailing empty rows (rows with only whitespace)
+          filteredBodyRows = dropWhileEnd isEmptyRow bodyRows
+
+          makeCell mcell = case mcell of
+            Just cell ->
+              let inlines = cellToInlines cell
+               in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines]
+            Nothing ->
+              Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []]
+
+          numCols = length headerRow
+          colSpec = replicate numCols (AlignDefault, ColWidthDefault)
+          thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow]
+          tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows]
+          tfoot = TableFoot nullAttr []
+
+       in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot
+
+-- | Check if a row contains only whitespace or empty cells
+isEmptyRow :: [Maybe XlsxCell] -> Bool
+isEmptyRow = all isEmptyCell
+  where
+    isEmptyCell Nothing = True
+    isEmptyCell (Just cell) = case cellValue cell of
+      EmptyValue -> True
+      TextValue t -> T.all isSpace t
+      NumberValue _ -> False
+
+-- | Convert cell to Pandoc inlines
+cellToInlines :: XlsxCell -> [Inline]
+cellToInlines cell =
+  let base = case cellValue cell of
+        TextValue t -> B.toList $ B.text t
+        NumberValue n -> [Str $ T.pack $ show n]
+        EmptyValue -> []
+
+      applyBold inls = if cellBold cell then [Strong inls] else inls
+      applyItalic inls = if cellItalic cell then [Emph inls] else inls
+
+   in applyItalic $ applyBold base
author	Anton Antich <[email protected]>	2025-11-09 14:07:38 +0100
committer	John MacFarlane <[email protected]>	2025-11-24 23:10:33 +0100
commit	59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch)
tree	1ddde4712ae7f8a879bd1299ae408cc596b02a89 /src
parent	ec75b693e5618c12ddac872d48e084436f1e1b48 (diff)