aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/Xlsx.hs40
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Cells.hs63
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Parse.hs296
-rw-r--r--src/Text/Pandoc/Readers/Xlsx/Sheets.hs112
5 files changed, 514 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 5f7b891e2..beec98587 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -27,6 +27,7 @@ module Text.Pandoc.Readers
, readers
, readDocx
, readPptx
+ , readXlsx
, readODT
, readMarkdown
, readCommonMark
@@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.Pptx
+import Text.Pandoc.Readers.Xlsx
import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
@@ -160,6 +162,7 @@ readers = [("native" , TextReader readNative)
,("tikiwiki" , TextReader readTikiWiki)
,("docx" , ByteStringReader readDocx)
,("pptx" , ByteStringReader readPptx)
+ ,("xlsx" , ByteStringReader readXlsx)
,("odt" , ByteStringReader readODT)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs
new file mode 100644
index 000000000..514dfd99e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Xlsx (readXlsx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
+import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)
+
+-- | Read XLSX file into Pandoc AST
+readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readXlsx opts bytes =
+ case toArchiveOrFail bytes of
+ Right archive ->
+ case archiveToXlsx archive of
+ Right xlsx -> do
+ let (meta, blocks) = xlsxToOutput opts xlsx
+ return $ Pandoc meta blocks
+ Left err ->
+ throwError $ PandocParseError $ "Failed to parse XLSX: " <> err
+
+ Left err ->
+ throwError $ PandocParseError $
+ "Failed to unpack XLSX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
new file mode 100644
index 000000000..ca0ad24f1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Cells
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Cell types and parsing for XLSX.
+-}
+module Text.Pandoc.Readers.Xlsx.Cells
+ ( CellRef(..)
+ , XlsxCell(..)
+ , CellValue(..)
+ , parseCellRef
+ ) where
+
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Char (ord, isAlpha)
+import Text.Read (readMaybe)
+
+-- | Cell reference (A1 notation)
+data CellRef = CellRef
+ { cellRefCol :: Int -- 1-based (A=1, B=2, ..., AA=27)
+ , cellRefRow :: Int -- 1-based
+ } deriving (Show, Eq, Ord)
+
+-- | Cell value types
+data CellValue
+ = TextValue Text
+ | NumberValue Double
+ | EmptyValue
+ deriving (Show, Eq)
+
+-- | Parsed cell
+data XlsxCell = XlsxCell
+ { cellRef :: CellRef
+ , cellValue :: CellValue
+ , cellBold :: Bool
+ , cellItalic :: Bool
+ } deriving (Show)
+
+-- | Parse cell reference (A1 → CellRef)
+parseCellRef :: Text -> Either Text CellRef
+parseCellRef ref = do
+ let (colStr, rowStr) = T.span isAlpha ref
+
+ row <- case readMaybe (T.unpack rowStr) of
+ Just r | r > 0 -> Right r
+ _ -> Left $ "Invalid row: " <> rowStr
+
+ col <- parseColumn colStr
+
+ return $ CellRef col row
+
+-- | Parse column (A=1, Z=26, AA=27, etc.)
+parseColumn :: Text -> Either Text Int
+parseColumn colStr
+ | T.null colStr = Left "Empty column"
+ | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr
diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
new file mode 100644
index 000000000..1ee350dd1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs
@@ -0,0 +1,296 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Parse
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Parsing of XLSX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Xlsx.Parse
+ ( Xlsx(..)
+ , XlsxWorkbook(..)
+ , XlsxSheet(..)
+ , SheetId(..)
+ , SharedStrings
+ , Styles(..)
+ , FontInfo(..)
+ , archiveToXlsx
+ ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import Data.List (find)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import qualified Data.Vector as V
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Xlsx.Cells
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Sheet identifier
+newtype SheetId = SheetId Int deriving (Show, Eq, Ord)
+
+-- | Shared strings table (Vector for O(1) lookup)
+type SharedStrings = V.Vector Text
+
+-- | Font information
+data FontInfo = FontInfo
+ { fontBold :: Bool
+ , fontItalic :: Bool
+ , fontUnderline :: Bool
+ } deriving (Show)
+
+-- | Style information
+data Styles = Styles
+ { styleFonts :: V.Vector FontInfo
+ } deriving (Show)
+
+-- | Complete XLSX document
+data Xlsx = Xlsx
+ { xlsxWorkbook :: XlsxWorkbook
+ , xlsxSheets :: [XlsxSheet]
+ , xlsxSharedStrings :: SharedStrings
+ , xlsxStyles :: Styles
+ } deriving (Show)
+
+-- | Workbook information
+data XlsxWorkbook = XlsxWorkbook
+ { workbookSheetNames :: [(SheetId, Text, Text)] -- (id, name, relId)
+ } deriving (Show)
+
+-- | Individual worksheet
+data XlsxSheet = XlsxSheet
+ { sheetId :: SheetId
+ , sheetName :: Text
+ , sheetCells :: M.Map CellRef XlsxCell
+ } deriving (Show)
+
+-- | Parse XLSX archive
+archiveToXlsx :: Archive -> Either Text Xlsx
+archiveToXlsx archive = do
+ -- Find and parse workbook.xml
+ workbookPath <- getWorkbookXmlPath archive
+ workbookElem <- loadXMLFromArchive archive workbookPath
+ workbook <- parseWorkbook workbookElem
+ `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath)
+
+ -- Load workbook relationships
+ workbookRels <- loadRelationships archive (relsPathFor workbookPath)
+
+ -- Parse shared strings (look for sharedStrings relationship)
+ sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of
+ Just (_, target) -> do
+ let path = "xl/" ++ T.unpack target
+ el <- loadXMLFromArchive archive path
+ parseSharedStrings el
+ Nothing -> Right V.empty
+
+ -- Parse styles
+ styles <- case findRelWithTarget workbookRels "styles" of
+ Just (_, target) -> do
+ let path = "xl/" ++ T.unpack target
+ el <- loadXMLFromArchive archive path
+ parseStyles el
+ Nothing -> Right $ Styles V.empty
+
+ -- Parse worksheets
+ sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo)
+ (workbookSheetNames workbook)
+
+ return $ Xlsx workbook sheets sharedStrings styles
+
+-- | Find workbook.xml via root relationships
+getWorkbookXmlPath :: Archive -> Either Text FilePath
+getWorkbookXmlPath archive = do
+ relsEntry <- maybeToEither "Missing _rels/.rels" $
+ findEntryByPath "_rels/.rels" archive
+ relsElem <- parseXMLFromEntry relsEntry
+
+ let relElems = onlyElems $ elContent relsElem
+ case find isOfficeDocRel relElems of
+ Nothing -> Left "No workbook.xml relationship found"
+ Just rel -> do
+ target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel
+ return $ T.unpack target
+ where
+ isOfficeDocRel el =
+ case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of
+ (Just relType, Just target) ->
+ "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target
+ _ -> False
+
+-- | Parse workbook.xml
+parseWorkbook :: Element -> Either Text XlsxWorkbook
+parseWorkbook wbElem = do
+ let ns = elemToNameSpaces wbElem
+
+ -- Find sheets element (match by local name only)
+ sheets <- maybeToEither "Missing <sheets>" $
+ find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem)
+
+ let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets)
+ sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems)
+
+ return $ XlsxWorkbook sheetRefs
+
+parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text)
+parseSheetRef ns (idx, sheetElem) = do
+ let name = fromMaybe ("Sheet" <> T.pack (show idx)) $
+ findAttr (unqual "name") sheetElem
+ relId <- maybeToEither "Missing r:id" $
+ findAttrByName ns "r" "id" sheetElem
+ return (SheetId idx, name, relId)
+
+-- | Parse shared strings
+parseSharedStrings :: Element -> Either Text SharedStrings
+parseSharedStrings sstElem = do
+ let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem)
+ strings = map extractString siElems
+ return $ V.fromList strings
+ where
+ extractString siElem =
+ case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of
+ Just tElem -> strContent tElem
+ Nothing -> getAllText siElem
+
+-- | Parse styles (fonts only for MVP)
+parseStyles :: Element -> Either Text Styles
+parseStyles stylesElem = do
+ -- Parse fonts (match by local name)
+ let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem)
+ fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem
+ fonts = V.fromList $ map (parseFont mempty) fontElems
+
+ return $ Styles fonts
+
+parseFont :: NameSpaces -> Element -> FontInfo
+parseFont _ns fontElem =
+ FontInfo
+ { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem)
+ , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem)
+ , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem)
+ }
+
+-- | Parse individual worksheet
+parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet
+parseSheet archive rels sharedStrings styles (sid, name, relId) = do
+ target <- maybeToEither ("Sheet relationship not found: " <> relId) $
+ lookup relId rels
+
+ let sheetPath = "xl/" ++ T.unpack target
+ sheetElem <- loadXMLFromArchive archive sheetPath
+
+ cells <- parseSheetCells sheetElem sharedStrings styles
+
+ return $ XlsxSheet sid name cells
+
+-- | Parse sheet cells
+parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell)
+parseSheetCells sheetElem sharedStrings styles = do
+ -- Find sheetData by local name
+ case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of
+ Nothing -> return M.empty
+ Just sheetData -> do
+ let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData)
+ cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems
+ cells = mapMaybe (parseCell sharedStrings styles) cellElems
+ return $ M.fromList [(cellRef c, c) | c <- cells]
+
+-- | Parse individual cell
+parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell
+parseCell sharedStrings styles cElem = do
+ -- Get cell reference
+ refText <- findAttr (unqual "r") cElem
+ cellRefParsed <- either (const Nothing) Just $ parseCellRef refText
+
+ -- Get cell type (default to number if missing)
+ let cellType = fromMaybe "" $ findAttr (unqual "t") cElem
+ styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack
+
+ -- Get value (match by local name)
+ let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem)
+ vText = maybe "" strContent vElem
+
+ -- Parse value based on type
+ let value = if cellType == "s"
+ then
+ -- Shared string
+ case readMaybe (T.unpack vText) of
+ Just idx | idx >= 0 && idx < V.length sharedStrings ->
+ TextValue (sharedStrings V.! idx)
+ _ -> EmptyValue
+ else if T.null vText
+ then EmptyValue
+ else
+ -- Number
+ case readMaybe (T.unpack vText) of
+ Just n -> NumberValue n
+ Nothing -> TextValue vText
+
+ -- Get formatting from style
+ let (bold, italic) = case styleIdx of
+ Just idx | idx >= 0 && idx < V.length (styleFonts styles) ->
+ let font = styleFonts styles V.! idx
+ in (fontBold font, fontItalic font)
+ _ -> (False, False)
+
+ return $ XlsxCell cellRefParsed value bold italic
+
+-- Helper functions
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+ entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+ findEntryByPath path archive
+ parseXMLFromEntry entry
+
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry entry =
+ let lazyText = TL.decodeUtf8 $ fromEntry entry
+ in parseXMLElement lazyText
+
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+ case findEntryByPath relsPath archive of
+ Nothing -> Right []
+ Just entry -> do
+ relsElem <- parseXMLFromEntry entry
+ let relElems = onlyElems $ elContent relsElem
+ return $ mapMaybe extractRel relElems
+ where
+ extractRel el = do
+ relId <- findAttr (unqual "Id") el
+ target <- findAttr (unqual "Target") el
+ return (relId, target)
+
+relsPathFor :: FilePath -> FilePath
+relsPathFor path =
+ let (dir, file) = splitFileName path
+ in dir ++ "/_rels/" ++ file ++ ".rels"
+
+findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text)
+findRelWithTarget rels targetName =
+ find (\(_, target) -> targetName `T.isInfixOf` target) rels
+
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
+
+getAllText :: Element -> Text
+getAllText el =
+ let textFromContent (Text cdata) = cdData cdata
+ textFromContent (Elem e) = getAllText e
+ textFromContent _ = ""
+ texts = map textFromContent (elContent el)
+ in T.unwords $ filter (not . T.null) texts
+
+addContext :: Either Text a -> Text -> Either Text a
+addContext (Right x) _ = Right x
+addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")")
diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
new file mode 100644
index 000000000..e75484790
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Xlsx.Sheets
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of XLSX sheets to Pandoc AST.
+-}
+module Text.Pandoc.Readers.Xlsx.Sheets
+ ( xlsxToOutput
+ ) where
+
+import qualified Data.Map.Strict as M
+import qualified Data.Text as T
+import Data.List (sort, dropWhileEnd)
+import Data.Char (isSpace)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse
+import Text.Pandoc.Readers.Xlsx.Cells
+import qualified Text.Pandoc.Builder as B
+
+-- | Convert XLSX to Pandoc output
+xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block])
+xlsxToOutput _opts xlsx =
+ let sheets = xlsxSheets xlsx
+ sheetBlocks = concatMap sheetToBlocks sheets
+ in (mempty, sheetBlocks)
+
+-- | Convert sheet to blocks (header + table)
+sheetToBlocks :: XlsxSheet -> [Block]
+sheetToBlocks sheet =
+ let SheetId n = sheetId sheet
+ name = sheetName sheet
+ sheetIdent = "sheet-" <> T.pack (show n)
+ header = Header 2 (sheetIdent, [], []) (B.toList (B.text name))
+
+ -- Convert cells to table
+ tableBlock = case cellsToTable sheet of
+ Just tbl -> [tbl]
+ Nothing -> [] -- Empty sheet
+ in header : tableBlock
+
+-- | Convert cells to Pandoc Table
+cellsToTable :: XlsxSheet -> Maybe Block
+cellsToTable sheet
+ | M.null (sheetCells sheet) = Nothing
+ | otherwise =
+ let cells = sheetCells sheet
+ -- Get bounds
+ refs = sort $ M.keys cells
+ minCol = minimum $ map cellRefCol refs
+ maxCol = maximum $ map cellRefCol refs
+ minRow = minimum $ map cellRefRow refs
+ maxRow = maximum $ map cellRefRow refs
+
+ -- Build dense grid
+ grid = [ [ M.lookup (CellRef col row) cells
+ | col <- [minCol..maxCol]
+ ]
+ | row <- [minRow..maxRow]
+ ]
+
+ -- First row is header (simple heuristic)
+ (headerRow, bodyRows) = case grid of
+ (h:bs) -> (h, bs)
+ [] -> ([], [])
+
+ -- Filter out trailing empty rows (rows with only whitespace)
+ filteredBodyRows = dropWhileEnd isEmptyRow bodyRows
+
+ makeCell mcell = case mcell of
+ Just cell ->
+ let inlines = cellToInlines cell
+ in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines]
+ Nothing ->
+ Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []]
+
+ numCols = length headerRow
+ colSpec = replicate numCols (AlignDefault, ColWidthDefault)
+ thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow]
+ tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows]
+ tfoot = TableFoot nullAttr []
+
+ in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot
+
+-- | Check if a row contains only whitespace or empty cells
+isEmptyRow :: [Maybe XlsxCell] -> Bool
+isEmptyRow = all isEmptyCell
+ where
+ isEmptyCell Nothing = True
+ isEmptyCell (Just cell) = case cellValue cell of
+ EmptyValue -> True
+ TextValue t -> T.all isSpace t
+ NumberValue _ -> False
+
+-- | Convert cell to Pandoc inlines
+cellToInlines :: XlsxCell -> [Inline]
+cellToInlines cell =
+ let base = case cellValue cell of
+ TextValue t -> B.toList $ B.text t
+ NumberValue n -> [Str $ T.pack $ show n]
+ EmptyValue -> []
+
+ applyBold inls = if cellBold cell then [Strong inls] else inls
+ applyItalic inls = if cellItalic cell then [Emph inls] else inls
+
+ in applyItalic $ applyBold base