diff options
| author | Anton Antich <[email protected]> | 2025-11-09 14:07:38 +0100 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-11-24 23:10:33 +0100 |
| commit | 59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch) | |
| tree | 1ddde4712ae7f8a879bd1299ae408cc596b02a89 /src | |
| parent | ec75b693e5618c12ddac872d48e084436f1e1b48 (diff) | |
Add `xlsx` (Microsoft Excel) as an input format.
Each worksheet turns into a section containing a table.
The common file `nativeDiff` has been extract from
the Docx and Pptx text files and put in Tests.Helpers.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx.hs | 40 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Cells.hs | 63 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Parse.hs | 296 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Sheets.hs | 112 |
5 files changed, 514 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 5f7b891e2..beec98587 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -27,6 +27,7 @@ module Text.Pandoc.Readers , readers , readDocx , readPptx + , readXlsx , readODT , readMarkdown , readCommonMark @@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx import Text.Pandoc.Readers.Pptx +import Text.Pandoc.Readers.Xlsx import Text.Pandoc.Readers.DokuWiki import Text.Pandoc.Readers.EPUB import Text.Pandoc.Readers.FB2 @@ -160,6 +162,7 @@ readers = [("native" , TextReader readNative) ,("tikiwiki" , TextReader readTikiWiki) ,("docx" , ByteStringReader readDocx) ,("pptx" , ByteStringReader readPptx) + ,("xlsx" , ByteStringReader readXlsx) ,("odt" , ByteStringReader readODT) ,("t2t" , TextReader readTxt2Tags) ,("epub" , ByteStringReader readEPUB) diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs new file mode 100644 index 000000000..514dfd99e --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx.hs @@ -0,0 +1,40 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Xlsx (readXlsx) where + +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Codec.Archive.Zip (toArchiveOrFail) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition (Pandoc(..)) +import Text.Pandoc.Error (PandocError(..)) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx) +import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput) + +-- | Read XLSX file into Pandoc AST +readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc +readXlsx opts bytes = + case toArchiveOrFail bytes of + Right archive -> + case archiveToXlsx archive of + Right xlsx -> do + let (meta, blocks) = xlsxToOutput opts xlsx + return $ Pandoc meta blocks + Left err -> + throwError $ PandocParseError $ "Failed to parse XLSX: " <> err + + Left err -> + throwError $ PandocParseError $ + "Failed to unpack XLSX archive: " <> T.pack err diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs new file mode 100644 index 000000000..ca0ad24f1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs @@ -0,0 +1,63 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Cells + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Cell types and parsing for XLSX. +-} +module Text.Pandoc.Readers.Xlsx.Cells + ( CellRef(..) + , XlsxCell(..) + , CellValue(..) + , parseCellRef + ) where + +import qualified Data.Text as T +import Data.Text (Text) +import Data.Char (ord, isAlpha) +import Text.Read (readMaybe) + +-- | Cell reference (A1 notation) +data CellRef = CellRef + { cellRefCol :: Int -- 1-based (A=1, B=2, ..., AA=27) + , cellRefRow :: Int -- 1-based + } deriving (Show, Eq, Ord) + +-- | Cell value types +data CellValue + = TextValue Text + | NumberValue Double + | EmptyValue + deriving (Show, Eq) + +-- | Parsed cell +data XlsxCell = XlsxCell + { cellRef :: CellRef + , cellValue :: CellValue + , cellBold :: Bool + , cellItalic :: Bool + } deriving (Show) + +-- | Parse cell reference (A1 → CellRef) +parseCellRef :: Text -> Either Text CellRef +parseCellRef ref = do + let (colStr, rowStr) = T.span isAlpha ref + + row <- case readMaybe (T.unpack rowStr) of + Just r | r > 0 -> Right r + _ -> Left $ "Invalid row: " <> rowStr + + col <- parseColumn colStr + + return $ CellRef col row + +-- | Parse column (A=1, Z=26, AA=27, etc.) +parseColumn :: Text -> Either Text Int +parseColumn colStr + | T.null colStr = Left "Empty column" + | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs new file mode 100644 index 000000000..1ee350dd1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs @@ -0,0 +1,296 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Parse + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Parsing of XLSX archive to intermediate representation. +-} +module Text.Pandoc.Readers.Xlsx.Parse + ( Xlsx(..) + , XlsxWorkbook(..) + , XlsxSheet(..) + , SheetId(..) + , SharedStrings + , Styles(..) + , FontInfo(..) + , archiveToXlsx + ) where + +import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry) +import Data.List (find) +import qualified Data.Map.Strict as M +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Text as T +import qualified Data.Text.Lazy.Encoding as TL +import Data.Text (Text) +import qualified Data.Vector as V +import System.FilePath (splitFileName) +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.Readers.Xlsx.Cells +import Text.Pandoc.XML.Light +import Text.Read (readMaybe) + +-- | Sheet identifier +newtype SheetId = SheetId Int deriving (Show, Eq, Ord) + +-- | Shared strings table (Vector for O(1) lookup) +type SharedStrings = V.Vector Text + +-- | Font information +data FontInfo = FontInfo + { fontBold :: Bool + , fontItalic :: Bool + , fontUnderline :: Bool + } deriving (Show) + +-- | Style information +data Styles = Styles + { styleFonts :: V.Vector FontInfo + } deriving (Show) + +-- | Complete XLSX document +data Xlsx = Xlsx + { xlsxWorkbook :: XlsxWorkbook + , xlsxSheets :: [XlsxSheet] + , xlsxSharedStrings :: SharedStrings + , xlsxStyles :: Styles + } deriving (Show) + +-- | Workbook information +data XlsxWorkbook = XlsxWorkbook + { workbookSheetNames :: [(SheetId, Text, Text)] -- (id, name, relId) + } deriving (Show) + +-- | Individual worksheet +data XlsxSheet = XlsxSheet + { sheetId :: SheetId + , sheetName :: Text + , sheetCells :: M.Map CellRef XlsxCell + } deriving (Show) + +-- | Parse XLSX archive +archiveToXlsx :: Archive -> Either Text Xlsx +archiveToXlsx archive = do + -- Find and parse workbook.xml + workbookPath <- getWorkbookXmlPath archive + workbookElem <- loadXMLFromArchive archive workbookPath + workbook <- parseWorkbook workbookElem + `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath) + + -- Load workbook relationships + workbookRels <- loadRelationships archive (relsPathFor workbookPath) + + -- Parse shared strings (look for sharedStrings relationship) + sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of + Just (_, target) -> do + let path = "xl/" ++ T.unpack target + el <- loadXMLFromArchive archive path + parseSharedStrings el + Nothing -> Right V.empty + + -- Parse styles + styles <- case findRelWithTarget workbookRels "styles" of + Just (_, target) -> do + let path = "xl/" ++ T.unpack target + el <- loadXMLFromArchive archive path + parseStyles el + Nothing -> Right $ Styles V.empty + + -- Parse worksheets + sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo) + (workbookSheetNames workbook) + + return $ Xlsx workbook sheets sharedStrings styles + +-- | Find workbook.xml via root relationships +getWorkbookXmlPath :: Archive -> Either Text FilePath +getWorkbookXmlPath archive = do + relsEntry <- maybeToEither "Missing _rels/.rels" $ + findEntryByPath "_rels/.rels" archive + relsElem <- parseXMLFromEntry relsEntry + + let relElems = onlyElems $ elContent relsElem + case find isOfficeDocRel relElems of + Nothing -> Left "No workbook.xml relationship found" + Just rel -> do + target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel + return $ T.unpack target + where + isOfficeDocRel el = + case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of + (Just relType, Just target) -> + "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target + _ -> False + +-- | Parse workbook.xml +parseWorkbook :: Element -> Either Text XlsxWorkbook +parseWorkbook wbElem = do + let ns = elemToNameSpaces wbElem + + -- Find sheets element (match by local name only) + sheets <- maybeToEither "Missing <sheets>" $ + find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem) + + let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets) + sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems) + + return $ XlsxWorkbook sheetRefs + +parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text) +parseSheetRef ns (idx, sheetElem) = do + let name = fromMaybe ("Sheet" <> T.pack (show idx)) $ + findAttr (unqual "name") sheetElem + relId <- maybeToEither "Missing r:id" $ + findAttrByName ns "r" "id" sheetElem + return (SheetId idx, name, relId) + +-- | Parse shared strings +parseSharedStrings :: Element -> Either Text SharedStrings +parseSharedStrings sstElem = do + let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem) + strings = map extractString siElems + return $ V.fromList strings + where + extractString siElem = + case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of + Just tElem -> strContent tElem + Nothing -> getAllText siElem + +-- | Parse styles (fonts only for MVP) +parseStyles :: Element -> Either Text Styles +parseStyles stylesElem = do + -- Parse fonts (match by local name) + let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem) + fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem + fonts = V.fromList $ map (parseFont mempty) fontElems + + return $ Styles fonts + +parseFont :: NameSpaces -> Element -> FontInfo +parseFont _ns fontElem = + FontInfo + { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem) + , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem) + , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem) + } + +-- | Parse individual worksheet +parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet +parseSheet archive rels sharedStrings styles (sid, name, relId) = do + target <- maybeToEither ("Sheet relationship not found: " <> relId) $ + lookup relId rels + + let sheetPath = "xl/" ++ T.unpack target + sheetElem <- loadXMLFromArchive archive sheetPath + + cells <- parseSheetCells sheetElem sharedStrings styles + + return $ XlsxSheet sid name cells + +-- | Parse sheet cells +parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell) +parseSheetCells sheetElem sharedStrings styles = do + -- Find sheetData by local name + case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of + Nothing -> return M.empty + Just sheetData -> do + let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData) + cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems + cells = mapMaybe (parseCell sharedStrings styles) cellElems + return $ M.fromList [(cellRef c, c) | c <- cells] + +-- | Parse individual cell +parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell +parseCell sharedStrings styles cElem = do + -- Get cell reference + refText <- findAttr (unqual "r") cElem + cellRefParsed <- either (const Nothing) Just $ parseCellRef refText + + -- Get cell type (default to number if missing) + let cellType = fromMaybe "" $ findAttr (unqual "t") cElem + styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack + + -- Get value (match by local name) + let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem) + vText = maybe "" strContent vElem + + -- Parse value based on type + let value = if cellType == "s" + then + -- Shared string + case readMaybe (T.unpack vText) of + Just idx | idx >= 0 && idx < V.length sharedStrings -> + TextValue (sharedStrings V.! idx) + _ -> EmptyValue + else if T.null vText + then EmptyValue + else + -- Number + case readMaybe (T.unpack vText) of + Just n -> NumberValue n + Nothing -> TextValue vText + + -- Get formatting from style + let (bold, italic) = case styleIdx of + Just idx | idx >= 0 && idx < V.length (styleFonts styles) -> + let font = styleFonts styles V.! idx + in (fontBold font, fontItalic font) + _ -> (False, False) + + return $ XlsxCell cellRefParsed value bold italic + +-- Helper functions +loadXMLFromArchive :: Archive -> FilePath -> Either Text Element +loadXMLFromArchive archive path = do + entry <- maybeToEither ("Entry not found: " <> T.pack path) $ + findEntryByPath path archive + parseXMLFromEntry entry + +parseXMLFromEntry :: Entry -> Either Text Element +parseXMLFromEntry entry = + let lazyText = TL.decodeUtf8 $ fromEntry entry + in parseXMLElement lazyText + +loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)] +loadRelationships archive relsPath = + case findEntryByPath relsPath archive of + Nothing -> Right [] + Just entry -> do + relsElem <- parseXMLFromEntry entry + let relElems = onlyElems $ elContent relsElem + return $ mapMaybe extractRel relElems + where + extractRel el = do + relId <- findAttr (unqual "Id") el + target <- findAttr (unqual "Target") el + return (relId, target) + +relsPathFor :: FilePath -> FilePath +relsPathFor path = + let (dir, file) = splitFileName path + in dir ++ "/_rels/" ++ file ++ ".rels" + +findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text) +findRelWithTarget rels targetName = + find (\(_, target) -> targetName `T.isInfixOf` target) rels + +maybeToEither :: Text -> Maybe a -> Either Text a +maybeToEither err Nothing = Left err +maybeToEither _ (Just x) = Right x + +getAllText :: Element -> Text +getAllText el = + let textFromContent (Text cdata) = cdData cdata + textFromContent (Elem e) = getAllText e + textFromContent _ = "" + texts = map textFromContent (elContent el) + in T.unwords $ filter (not . T.null) texts + +addContext :: Either Text a -> Text -> Either Text a +addContext (Right x) _ = Right x +addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")") diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs new file mode 100644 index 000000000..e75484790 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs @@ -0,0 +1,112 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Sheets + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Conversion of XLSX sheets to Pandoc AST. +-} +module Text.Pandoc.Readers.Xlsx.Sheets + ( xlsxToOutput + ) where + +import qualified Data.Map.Strict as M +import qualified Data.Text as T +import Data.List (sort, dropWhileEnd) +import Data.Char (isSpace) +import Text.Pandoc.Definition +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.Xlsx.Parse +import Text.Pandoc.Readers.Xlsx.Cells +import qualified Text.Pandoc.Builder as B + +-- | Convert XLSX to Pandoc output +xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block]) +xlsxToOutput _opts xlsx = + let sheets = xlsxSheets xlsx + sheetBlocks = concatMap sheetToBlocks sheets + in (mempty, sheetBlocks) + +-- | Convert sheet to blocks (header + table) +sheetToBlocks :: XlsxSheet -> [Block] +sheetToBlocks sheet = + let SheetId n = sheetId sheet + name = sheetName sheet + sheetIdent = "sheet-" <> T.pack (show n) + header = Header 2 (sheetIdent, [], []) (B.toList (B.text name)) + + -- Convert cells to table + tableBlock = case cellsToTable sheet of + Just tbl -> [tbl] + Nothing -> [] -- Empty sheet + in header : tableBlock + +-- | Convert cells to Pandoc Table +cellsToTable :: XlsxSheet -> Maybe Block +cellsToTable sheet + | M.null (sheetCells sheet) = Nothing + | otherwise = + let cells = sheetCells sheet + -- Get bounds + refs = sort $ M.keys cells + minCol = minimum $ map cellRefCol refs + maxCol = maximum $ map cellRefCol refs + minRow = minimum $ map cellRefRow refs + maxRow = maximum $ map cellRefRow refs + + -- Build dense grid + grid = [ [ M.lookup (CellRef col row) cells + | col <- [minCol..maxCol] + ] + | row <- [minRow..maxRow] + ] + + -- First row is header (simple heuristic) + (headerRow, bodyRows) = case grid of + (h:bs) -> (h, bs) + [] -> ([], []) + + -- Filter out trailing empty rows (rows with only whitespace) + filteredBodyRows = dropWhileEnd isEmptyRow bodyRows + + makeCell mcell = case mcell of + Just cell -> + let inlines = cellToInlines cell + in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines] + Nothing -> + Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []] + + numCols = length headerRow + colSpec = replicate numCols (AlignDefault, ColWidthDefault) + thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow] + tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows] + tfoot = TableFoot nullAttr [] + + in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot + +-- | Check if a row contains only whitespace or empty cells +isEmptyRow :: [Maybe XlsxCell] -> Bool +isEmptyRow = all isEmptyCell + where + isEmptyCell Nothing = True + isEmptyCell (Just cell) = case cellValue cell of + EmptyValue -> True + TextValue t -> T.all isSpace t + NumberValue _ -> False + +-- | Convert cell to Pandoc inlines +cellToInlines :: XlsxCell -> [Inline] +cellToInlines cell = + let base = case cellValue cell of + TextValue t -> B.toList $ B.text t + NumberValue n -> [Str $ T.pack $ show n] + EmptyValue -> [] + + applyBold inls = if cellBold cell then [Strong inls] else inls + applyItalic inls = if cellItalic cell then [Emph inls] else inls + + in applyItalic $ applyBold base |
