diff options
| author | Anton Antich <[email protected]> | 2025-11-09 14:07:38 +0100 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-11-24 23:10:33 +0100 |
| commit | 59b8b3ed4b6998fad87b3a29ce59ba7a5cf40c46 (patch) | |
| tree | 1ddde4712ae7f8a879bd1299ae408cc596b02a89 | |
| parent | ec75b693e5618c12ddac872d48e084436f1e1b48 (diff) | |
Add `xlsx` (Microsoft Excel) as an input format.
Each worksheet turns into a section containing a table.
The common file `nativeDiff` has been extract from
the Docx and Pptx text files and put in Tests.Helpers.
| -rw-r--r-- | MANUAL.txt | 2 | ||||
| -rw-r--r-- | pandoc.cabal | 7 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx.hs | 40 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Cells.hs | 63 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Parse.hs | 296 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Xlsx/Sheets.hs | 112 | ||||
| -rw-r--r-- | test/Tests/Helpers.hs | 14 | ||||
| -rw-r--r-- | test/Tests/Readers/Docx.hs | 14 | ||||
| -rw-r--r-- | test/Tests/Readers/Pptx.hs | 15 | ||||
| -rw-r--r-- | test/Tests/Readers/Xlsx.hs | 48 | ||||
| -rw-r--r-- | test/test-pandoc.hs | 4 | ||||
| -rw-r--r-- | test/xlsx-reader/basic.native | 381 | ||||
| -rw-r--r-- | test/xlsx-reader/basic.xlsx | bin | 0 -> 13604 bytes |
14 files changed, 969 insertions, 30 deletions
diff --git a/MANUAL.txt b/MANUAL.txt index d394b2f6d..c3b657f72 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -283,6 +283,7 @@ header when requesting a document from a URL: - `twiki` ([TWiki markup]) - `typst` ([typst]) - `vimwiki` ([Vimwiki]) + - `xlsx` ([Excel spreadsheet][XLSX]) - `xml` (XML version of native AST) - the path of a custom Lua reader, see [Custom readers and writers] below ::: @@ -519,6 +520,7 @@ header when requesting a document from a URL: [DokuWiki markup]: https://www.dokuwiki.org/dokuwiki [ZimWiki markup]: https://zim-wiki.org/manual/Help/Wiki_Syntax.html [XWiki markup]: https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/ +[XLSX]: https://en.wikipedia.org/wiki/Microsoft_Excel#File_formats [Vimdoc]: https://vimhelp.org/helphelp.txt.html#help-writing [TWiki markup]: https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules [TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax diff --git a/pandoc.cabal b/pandoc.cabal index f648dce11..8aa2ddd1d 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -440,6 +440,8 @@ extra-source-files: test/odt/odt/*.odt test/odt/markdown/*.md test/odt/native/*.native + test/xlsx-reader/*.xlsx + test/xlsx-reader/*.native test/pod-reader.pod test/vimdoc/*.markdown test/vimdoc/*.vimdoc @@ -613,6 +615,7 @@ library Text.Pandoc.Readers.Txt2Tags, Text.Pandoc.Readers.Docx, Text.Pandoc.Readers.Pptx, + Text.Pandoc.Readers.Xlsx, Text.Pandoc.Readers.ODT, Text.Pandoc.Readers.EPUB, Text.Pandoc.Readers.Muse, @@ -726,6 +729,9 @@ library Text.Pandoc.Readers.Pptx.Shapes, Text.Pandoc.Readers.Pptx.Slides, Text.Pandoc.Readers.Pptx.SmartArt, + Text.Pandoc.Readers.Xlsx.Parse, + Text.Pandoc.Readers.Xlsx.Cells, + Text.Pandoc.Readers.Xlsx.Sheets, Text.Pandoc.Readers.HTML.Parsing, Text.Pandoc.Readers.HTML.Table, Text.Pandoc.Readers.HTML.TagCategories, @@ -863,6 +869,7 @@ test-suite test-pandoc Tests.Readers.RTF Tests.Readers.Docx Tests.Readers.Pptx + Tests.Readers.Xlsx Tests.Readers.ODT Tests.Readers.Txt2Tags Tests.Readers.EPUB diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 5f7b891e2..beec98587 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -27,6 +27,7 @@ module Text.Pandoc.Readers , readers , readDocx , readPptx + , readXlsx , readODT , readMarkdown , readCommonMark @@ -89,6 +90,7 @@ import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx import Text.Pandoc.Readers.Pptx +import Text.Pandoc.Readers.Xlsx import Text.Pandoc.Readers.DokuWiki import Text.Pandoc.Readers.EPUB import Text.Pandoc.Readers.FB2 @@ -160,6 +162,7 @@ readers = [("native" , TextReader readNative) ,("tikiwiki" , TextReader readTikiWiki) ,("docx" , ByteStringReader readDocx) ,("pptx" , ByteStringReader readPptx) + ,("xlsx" , ByteStringReader readXlsx) ,("odt" , ByteStringReader readODT) ,("t2t" , TextReader readTxt2Tags) ,("epub" , ByteStringReader readEPUB) diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs new file mode 100644 index 000000000..514dfd99e --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx.hs @@ -0,0 +1,40 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Xlsx (readXlsx) where + +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Codec.Archive.Zip (toArchiveOrFail) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition (Pandoc(..)) +import Text.Pandoc.Error (PandocError(..)) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx) +import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput) + +-- | Read XLSX file into Pandoc AST +readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc +readXlsx opts bytes = + case toArchiveOrFail bytes of + Right archive -> + case archiveToXlsx archive of + Right xlsx -> do + let (meta, blocks) = xlsxToOutput opts xlsx + return $ Pandoc meta blocks + Left err -> + throwError $ PandocParseError $ "Failed to parse XLSX: " <> err + + Left err -> + throwError $ PandocParseError $ + "Failed to unpack XLSX archive: " <> T.pack err diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs new file mode 100644 index 000000000..ca0ad24f1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Cells.hs @@ -0,0 +1,63 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Cells + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Cell types and parsing for XLSX. +-} +module Text.Pandoc.Readers.Xlsx.Cells + ( CellRef(..) + , XlsxCell(..) + , CellValue(..) + , parseCellRef + ) where + +import qualified Data.Text as T +import Data.Text (Text) +import Data.Char (ord, isAlpha) +import Text.Read (readMaybe) + +-- | Cell reference (A1 notation) +data CellRef = CellRef + { cellRefCol :: Int -- 1-based (A=1, B=2, ..., AA=27) + , cellRefRow :: Int -- 1-based + } deriving (Show, Eq, Ord) + +-- | Cell value types +data CellValue + = TextValue Text + | NumberValue Double + | EmptyValue + deriving (Show, Eq) + +-- | Parsed cell +data XlsxCell = XlsxCell + { cellRef :: CellRef + , cellValue :: CellValue + , cellBold :: Bool + , cellItalic :: Bool + } deriving (Show) + +-- | Parse cell reference (A1 → CellRef) +parseCellRef :: Text -> Either Text CellRef +parseCellRef ref = do + let (colStr, rowStr) = T.span isAlpha ref + + row <- case readMaybe (T.unpack rowStr) of + Just r | r > 0 -> Right r + _ -> Left $ "Invalid row: " <> rowStr + + col <- parseColumn colStr + + return $ CellRef col row + +-- | Parse column (A=1, Z=26, AA=27, etc.) +parseColumn :: Text -> Either Text Int +parseColumn colStr + | T.null colStr = Left "Empty column" + | otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr diff --git a/src/Text/Pandoc/Readers/Xlsx/Parse.hs b/src/Text/Pandoc/Readers/Xlsx/Parse.hs new file mode 100644 index 000000000..1ee350dd1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Parse.hs @@ -0,0 +1,296 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Parse + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Parsing of XLSX archive to intermediate representation. +-} +module Text.Pandoc.Readers.Xlsx.Parse + ( Xlsx(..) + , XlsxWorkbook(..) + , XlsxSheet(..) + , SheetId(..) + , SharedStrings + , Styles(..) + , FontInfo(..) + , archiveToXlsx + ) where + +import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry) +import Data.List (find) +import qualified Data.Map.Strict as M +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Text as T +import qualified Data.Text.Lazy.Encoding as TL +import Data.Text (Text) +import qualified Data.Vector as V +import System.FilePath (splitFileName) +import Text.Pandoc.Readers.OOXML.Shared +import Text.Pandoc.Readers.Xlsx.Cells +import Text.Pandoc.XML.Light +import Text.Read (readMaybe) + +-- | Sheet identifier +newtype SheetId = SheetId Int deriving (Show, Eq, Ord) + +-- | Shared strings table (Vector for O(1) lookup) +type SharedStrings = V.Vector Text + +-- | Font information +data FontInfo = FontInfo + { fontBold :: Bool + , fontItalic :: Bool + , fontUnderline :: Bool + } deriving (Show) + +-- | Style information +data Styles = Styles + { styleFonts :: V.Vector FontInfo + } deriving (Show) + +-- | Complete XLSX document +data Xlsx = Xlsx + { xlsxWorkbook :: XlsxWorkbook + , xlsxSheets :: [XlsxSheet] + , xlsxSharedStrings :: SharedStrings + , xlsxStyles :: Styles + } deriving (Show) + +-- | Workbook information +data XlsxWorkbook = XlsxWorkbook + { workbookSheetNames :: [(SheetId, Text, Text)] -- (id, name, relId) + } deriving (Show) + +-- | Individual worksheet +data XlsxSheet = XlsxSheet + { sheetId :: SheetId + , sheetName :: Text + , sheetCells :: M.Map CellRef XlsxCell + } deriving (Show) + +-- | Parse XLSX archive +archiveToXlsx :: Archive -> Either Text Xlsx +archiveToXlsx archive = do + -- Find and parse workbook.xml + workbookPath <- getWorkbookXmlPath archive + workbookElem <- loadXMLFromArchive archive workbookPath + workbook <- parseWorkbook workbookElem + `addContext` ("Parsing workbook.xml from: " <> T.pack workbookPath) + + -- Load workbook relationships + workbookRels <- loadRelationships archive (relsPathFor workbookPath) + + -- Parse shared strings (look for sharedStrings relationship) + sharedStrings <- case findRelWithTarget workbookRels "sharedStrings" of + Just (_, target) -> do + let path = "xl/" ++ T.unpack target + el <- loadXMLFromArchive archive path + parseSharedStrings el + Nothing -> Right V.empty + + -- Parse styles + styles <- case findRelWithTarget workbookRels "styles" of + Just (_, target) -> do + let path = "xl/" ++ T.unpack target + el <- loadXMLFromArchive archive path + parseStyles el + Nothing -> Right $ Styles V.empty + + -- Parse worksheets + sheets <- mapM (\sheetInfo -> parseSheet archive workbookRels sharedStrings styles sheetInfo) + (workbookSheetNames workbook) + + return $ Xlsx workbook sheets sharedStrings styles + +-- | Find workbook.xml via root relationships +getWorkbookXmlPath :: Archive -> Either Text FilePath +getWorkbookXmlPath archive = do + relsEntry <- maybeToEither "Missing _rels/.rels" $ + findEntryByPath "_rels/.rels" archive + relsElem <- parseXMLFromEntry relsEntry + + let relElems = onlyElems $ elContent relsElem + case find isOfficeDocRel relElems of + Nothing -> Left "No workbook.xml relationship found" + Just rel -> do + target <- maybeToEither "Missing Target" $ findAttr (unqual "Target") rel + return $ T.unpack target + where + isOfficeDocRel el = + case (findAttr (unqual "Type") el, findAttr (unqual "Target") el) of + (Just relType, Just target) -> + "officeDocument" `T.isInfixOf` relType && "workbook" `T.isInfixOf` target + _ -> False + +-- | Parse workbook.xml +parseWorkbook :: Element -> Either Text XlsxWorkbook +parseWorkbook wbElem = do + let ns = elemToNameSpaces wbElem + + -- Find sheets element (match by local name only) + sheets <- maybeToEither "Missing <sheets>" $ + find (\e -> qName (elName e) == "sheets") (onlyElems $ elContent wbElem) + + let sheetElems = filter (\e -> qName (elName e) == "sheet") (onlyElems $ elContent sheets) + sheetRefs <- mapM (parseSheetRef ns) (zip [1..] sheetElems) + + return $ XlsxWorkbook sheetRefs + +parseSheetRef :: NameSpaces -> (Int, Element) -> Either Text (SheetId, Text, Text) +parseSheetRef ns (idx, sheetElem) = do + let name = fromMaybe ("Sheet" <> T.pack (show idx)) $ + findAttr (unqual "name") sheetElem + relId <- maybeToEither "Missing r:id" $ + findAttrByName ns "r" "id" sheetElem + return (SheetId idx, name, relId) + +-- | Parse shared strings +parseSharedStrings :: Element -> Either Text SharedStrings +parseSharedStrings sstElem = do + let siElems = filter (\e -> qName (elName e) == "si") (onlyElems $ elContent sstElem) + strings = map extractString siElems + return $ V.fromList strings + where + extractString siElem = + case find (\e -> qName (elName e) == "t") (onlyElems $ elContent siElem) of + Just tElem -> strContent tElem + Nothing -> getAllText siElem + +-- | Parse styles (fonts only for MVP) +parseStyles :: Element -> Either Text Styles +parseStyles stylesElem = do + -- Parse fonts (match by local name) + let fontsElem = find (\e -> qName (elName e) == "fonts") (onlyElems $ elContent stylesElem) + fontElems = maybe [] (\fe -> filter (\e -> qName (elName e) == "font") (onlyElems $ elContent fe)) fontsElem + fonts = V.fromList $ map (parseFont mempty) fontElems + + return $ Styles fonts + +parseFont :: NameSpaces -> Element -> FontInfo +parseFont _ns fontElem = + FontInfo + { fontBold = any (\e -> qName (elName e) == "b") (onlyElems $ elContent fontElem) + , fontItalic = any (\e -> qName (elName e) == "i") (onlyElems $ elContent fontElem) + , fontUnderline = any (\e -> qName (elName e) == "u") (onlyElems $ elContent fontElem) + } + +-- | Parse individual worksheet +parseSheet :: Archive -> [(Text, Text)] -> SharedStrings -> Styles -> (SheetId, Text, Text) -> Either Text XlsxSheet +parseSheet archive rels sharedStrings styles (sid, name, relId) = do + target <- maybeToEither ("Sheet relationship not found: " <> relId) $ + lookup relId rels + + let sheetPath = "xl/" ++ T.unpack target + sheetElem <- loadXMLFromArchive archive sheetPath + + cells <- parseSheetCells sheetElem sharedStrings styles + + return $ XlsxSheet sid name cells + +-- | Parse sheet cells +parseSheetCells :: Element -> SharedStrings -> Styles -> Either Text (M.Map CellRef XlsxCell) +parseSheetCells sheetElem sharedStrings styles = do + -- Find sheetData by local name + case find (\e -> qName (elName e) == "sheetData") (onlyElems $ elContent sheetElem) of + Nothing -> return M.empty + Just sheetData -> do + let rowElems = filter (\e -> qName (elName e) == "row") (onlyElems $ elContent sheetData) + cellElems = concatMap (\r -> filter (\e -> qName (elName e) == "c") (onlyElems $ elContent r)) rowElems + cells = mapMaybe (parseCell sharedStrings styles) cellElems + return $ M.fromList [(cellRef c, c) | c <- cells] + +-- | Parse individual cell +parseCell :: SharedStrings -> Styles -> Element -> Maybe XlsxCell +parseCell sharedStrings styles cElem = do + -- Get cell reference + refText <- findAttr (unqual "r") cElem + cellRefParsed <- either (const Nothing) Just $ parseCellRef refText + + -- Get cell type (default to number if missing) + let cellType = fromMaybe "" $ findAttr (unqual "t") cElem + styleIdx = findAttr (unqual "s") cElem >>= readMaybe . T.unpack + + -- Get value (match by local name) + let vElem = find (\e -> qName (elName e) == "v") (onlyElems $ elContent cElem) + vText = maybe "" strContent vElem + + -- Parse value based on type + let value = if cellType == "s" + then + -- Shared string + case readMaybe (T.unpack vText) of + Just idx | idx >= 0 && idx < V.length sharedStrings -> + TextValue (sharedStrings V.! idx) + _ -> EmptyValue + else if T.null vText + then EmptyValue + else + -- Number + case readMaybe (T.unpack vText) of + Just n -> NumberValue n + Nothing -> TextValue vText + + -- Get formatting from style + let (bold, italic) = case styleIdx of + Just idx | idx >= 0 && idx < V.length (styleFonts styles) -> + let font = styleFonts styles V.! idx + in (fontBold font, fontItalic font) + _ -> (False, False) + + return $ XlsxCell cellRefParsed value bold italic + +-- Helper functions +loadXMLFromArchive :: Archive -> FilePath -> Either Text Element +loadXMLFromArchive archive path = do + entry <- maybeToEither ("Entry not found: " <> T.pack path) $ + findEntryByPath path archive + parseXMLFromEntry entry + +parseXMLFromEntry :: Entry -> Either Text Element +parseXMLFromEntry entry = + let lazyText = TL.decodeUtf8 $ fromEntry entry + in parseXMLElement lazyText + +loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)] +loadRelationships archive relsPath = + case findEntryByPath relsPath archive of + Nothing -> Right [] + Just entry -> do + relsElem <- parseXMLFromEntry entry + let relElems = onlyElems $ elContent relsElem + return $ mapMaybe extractRel relElems + where + extractRel el = do + relId <- findAttr (unqual "Id") el + target <- findAttr (unqual "Target") el + return (relId, target) + +relsPathFor :: FilePath -> FilePath +relsPathFor path = + let (dir, file) = splitFileName path + in dir ++ "/_rels/" ++ file ++ ".rels" + +findRelWithTarget :: [(Text, Text)] -> Text -> Maybe (Text, Text) +findRelWithTarget rels targetName = + find (\(_, target) -> targetName `T.isInfixOf` target) rels + +maybeToEither :: Text -> Maybe a -> Either Text a +maybeToEither err Nothing = Left err +maybeToEither _ (Just x) = Right x + +getAllText :: Element -> Text +getAllText el = + let textFromContent (Text cdata) = cdData cdata + textFromContent (Elem e) = getAllText e + textFromContent _ = "" + texts = map textFromContent (elContent el) + in T.unwords $ filter (not . T.null) texts + +addContext :: Either Text a -> Text -> Either Text a +addContext (Right x) _ = Right x +addContext (Left err) ctx = Left (err <> " (context: " <> ctx <> ")") diff --git a/src/Text/Pandoc/Readers/Xlsx/Sheets.hs b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs new file mode 100644 index 000000000..e75484790 --- /dev/null +++ b/src/Text/Pandoc/Readers/Xlsx/Sheets.hs @@ -0,0 +1,112 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Xlsx.Sheets + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Conversion of XLSX sheets to Pandoc AST. +-} +module Text.Pandoc.Readers.Xlsx.Sheets + ( xlsxToOutput + ) where + +import qualified Data.Map.Strict as M +import qualified Data.Text as T +import Data.List (sort, dropWhileEnd) +import Data.Char (isSpace) +import Text.Pandoc.Definition +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Readers.Xlsx.Parse +import Text.Pandoc.Readers.Xlsx.Cells +import qualified Text.Pandoc.Builder as B + +-- | Convert XLSX to Pandoc output +xlsxToOutput :: ReaderOptions -> Xlsx -> (Meta, [Block]) +xlsxToOutput _opts xlsx = + let sheets = xlsxSheets xlsx + sheetBlocks = concatMap sheetToBlocks sheets + in (mempty, sheetBlocks) + +-- | Convert sheet to blocks (header + table) +sheetToBlocks :: XlsxSheet -> [Block] +sheetToBlocks sheet = + let SheetId n = sheetId sheet + name = sheetName sheet + sheetIdent = "sheet-" <> T.pack (show n) + header = Header 2 (sheetIdent, [], []) (B.toList (B.text name)) + + -- Convert cells to table + tableBlock = case cellsToTable sheet of + Just tbl -> [tbl] + Nothing -> [] -- Empty sheet + in header : tableBlock + +-- | Convert cells to Pandoc Table +cellsToTable :: XlsxSheet -> Maybe Block +cellsToTable sheet + | M.null (sheetCells sheet) = Nothing + | otherwise = + let cells = sheetCells sheet + -- Get bounds + refs = sort $ M.keys cells + minCol = minimum $ map cellRefCol refs + maxCol = maximum $ map cellRefCol refs + minRow = minimum $ map cellRefRow refs + maxRow = maximum $ map cellRefRow refs + + -- Build dense grid + grid = [ [ M.lookup (CellRef col row) cells + | col <- [minCol..maxCol] + ] + | row <- [minRow..maxRow] + ] + + -- First row is header (simple heuristic) + (headerRow, bodyRows) = case grid of + (h:bs) -> (h, bs) + [] -> ([], []) + + -- Filter out trailing empty rows (rows with only whitespace) + filteredBodyRows = dropWhileEnd isEmptyRow bodyRows + + makeCell mcell = case mcell of + Just cell -> + let inlines = cellToInlines cell + in Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain inlines] + Nothing -> + Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain []] + + numCols = length headerRow + colSpec = replicate numCols (AlignDefault, ColWidthDefault) + thead = TableHead nullAttr [Row nullAttr $ map makeCell headerRow] + tbody = [TableBody nullAttr 0 [] $ map (Row nullAttr . map makeCell) filteredBodyRows] + tfoot = TableFoot nullAttr [] + + in Just $ Table nullAttr (Caption Nothing []) colSpec thead tbody tfoot + +-- | Check if a row contains only whitespace or empty cells +isEmptyRow :: [Maybe XlsxCell] -> Bool +isEmptyRow = all isEmptyCell + where + isEmptyCell Nothing = True + isEmptyCell (Just cell) = case cellValue cell of + EmptyValue -> True + TextValue t -> T.all isSpace t + NumberValue _ -> False + +-- | Convert cell to Pandoc inlines +cellToInlines :: XlsxCell -> [Inline] +cellToInlines cell = + let base = case cellValue cell of + TextValue t -> B.toList $ B.text t + NumberValue n -> [Str $ T.pack $ show n] + EmptyValue -> [] + + applyBold inls = if cellBold cell then [Strong inls] else inls + applyItalic inls = if cellItalic cell then [Emph inls] else inls + + in applyItalic $ applyBold base diff --git a/test/Tests/Helpers.hs b/test/Tests/Helpers.hs index 3e930b14a..081611ed9 100644 --- a/test/Tests/Helpers.hs +++ b/test/Tests/Helpers.hs @@ -16,6 +16,7 @@ module Tests.Helpers ( test , TestResult(..) , setupEnvironment , showDiff + , nativeDiff , testGolden , (=?>) , purely @@ -132,6 +133,19 @@ vividize (Both s _) = " " ++ s vividize (First s) = "- " ++ s vividize (Second s) = "+ " ++ s +nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String) +nativeDiff normPath expectedNative actualNative + | expectedNative == actualNative = return Nothing + | otherwise = Just <$> do + expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative) + actual <- T.unpack <$> runIOorExplode (writeNative def actualNative) + let dash = replicate 72 '-' + let diff = getDiff (lines actual) (lines expected) + return $ '\n' : dash ++ + "\n--- " ++ normPath ++ + "\n+++ " ++ "test" ++ "\n" ++ + showDiff (1,1) diff ++ dash + purely :: (b -> PandocPure a) -> b -> a purely f = either (error . show) id . runPure . f diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 0bd70d0e2..76af649b4 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -34,20 +34,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "docx" } testCompare :: String -> FilePath -> FilePath -> TestTree testCompare = testCompareWithOpts defopts - -nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String) -nativeDiff normPath expectedNative actualNative - | expectedNative == actualNative = return Nothing - | otherwise = Just <$> do - expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative) - actual <- T.unpack <$> runIOorExplode (writeNative def actualNative) - let dash = replicate 72 '-' - let diff = getDiff (lines actual) (lines expected) - return $ '\n' : dash ++ - "\n--- " ++ normPath ++ - "\n+++ " ++ "test" ++ "\n" ++ - showDiff (1,1) diff ++ dash - testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree testCompareWithOpts opts testName docxFP nativeFP = goldenTest diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs index 613d5b50f..3358e4111 100644 --- a/test/Tests/Readers/Pptx.hs +++ b/test/Tests/Readers/Pptx.hs @@ -12,10 +12,8 @@ Tests for the PPTX reader. -} module Tests.Readers.Pptx (tests) where -import Data.Algorithm.Diff (getDiff) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B -import qualified Data.Text as T import Test.Tasty import Test.Tasty.Golden.Advanced import Tests.Helpers @@ -28,19 +26,6 @@ defopts = def{ readerExtensions = getDefaultExtensions "pptx" } testCompare :: String -> FilePath -> FilePath -> TestTree testCompare = testCompareWithOpts defopts -nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String) -nativeDiff normPath expectedNative actualNative - | expectedNative == actualNative = return Nothing - | otherwise = Just <$> do - expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative) - actual <- T.unpack <$> runIOorExplode (writeNative def actualNative) - let dash = replicate 72 '-' - let diff = getDiff (lines actual) (lines expected) - return $ '\n' : dash ++ - "\n--- " ++ normPath ++ - "\n+++ " ++ "test" ++ "\n" ++ - showDiff (1,1) diff ++ dash - testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree testCompareWithOpts opts testName pptxFP nativeFP = goldenTest diff --git a/test/Tests/Readers/Xlsx.hs b/test/Tests/Readers/Xlsx.hs new file mode 100644 index 000000000..189cd0c16 --- /dev/null +++ b/test/Tests/Readers/Xlsx.hs @@ -0,0 +1,48 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Xlsx + Copyright : © 2025 Anton Antic + License : GNU GPL, version 2 or above + + Maintainer : Anton Antic <[email protected]> + Stability : alpha + Portability : portable + +Tests for the XLSX reader. +-} +module Tests.Readers.Xlsx (tests) where + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Test.Tasty +import Test.Tasty.Golden.Advanced +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.UTF8 as UTF8 + +defopts :: ReaderOptions +defopts = def{ readerExtensions = getDefaultExtensions "xlsx" } + +testCompare :: String -> FilePath -> FilePath -> TestTree +testCompare = testCompareWithOpts defopts + +testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree +testCompareWithOpts opts testName xlsxFP nativeFP = + goldenTest + testName + (do nf <- UTF8.toText <$> BS.readFile nativeFP + runIOorExplode (readNative def nf)) + (do df <- B.readFile xlsxFP + runIOorExplode (readXlsx opts df)) + (nativeDiff nativeFP) + (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a) + >>= BS.writeFile nativeFP . UTF8.fromText) + +tests :: [TestTree] +tests = [ testGroup "basic" + [ testCompare + "sheet extraction" + "xlsx-reader/basic.xlsx" + "xlsx-reader/basic.native" + ] + ] diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index 0d04b361f..9ae97d9c0 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -13,6 +13,7 @@ import qualified Tests.Old import qualified Tests.Readers.Creole import qualified Tests.Readers.Docx import qualified Tests.Readers.Pptx +import qualified Tests.Readers.Xlsx import qualified Tests.Readers.DokuWiki import qualified Tests.Readers.EPUB import qualified Tests.Readers.FB2 @@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "RTF" Tests.Readers.RTF.tests , testGroup "Docx" Tests.Readers.Docx.tests , testGroup "Pptx" Tests.Readers.Pptx.tests + , testGroup "Xlsx" Tests.Readers.Xlsx.tests , testGroup "ODT" Tests.Readers.ODT.tests , testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests , testGroup "EPUB" Tests.Readers.EPUB.tests @@ -126,4 +128,4 @@ main = do _ -> inDirectory "test" $ do fp <- getExecutablePath -- putStrLn $ "Using pandoc executable at " ++ fp - defaultMain $ tests fp
\ No newline at end of file + defaultMain $ tests fp diff --git a/test/xlsx-reader/basic.native b/test/xlsx-reader/basic.native new file mode 100644 index 000000000..f69f78a41 --- /dev/null +++ b/test/xlsx-reader/basic.native @@ -0,0 +1,381 @@ +Pandoc + Meta { unMeta = fromList [] } + [ Header 2 ( "sheet-1" , [] , [] ) [ Str "Main" ] + , Table + ( "" , [] , [] ) + (Caption Nothing []) + [ ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + ] + (TableHead + ( "" , [] , [] ) + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Strong [ Str "Person" ] ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Strong [ Str "Age" ] ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Strong [ Str "Location" ] ] ] + ] + ]) + [ TableBody + ( "" , [] , [] ) + (RowHeadColumns 0) + [] + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Anton" , Space , Str "Antich" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Switzerland" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "James" , Space , Str "Bond" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "35.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Moscow" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain + [ Str "Just" + , Space + , Str "a" + , Space + , Str "random" + , Space + , Str "cell" + ] + ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + ] + ] + ] + (TableFoot ( "" , [] , [] ) []) + , Header 2 ( "sheet-2" , [] , [] ) [ Str "Secondary" ] + , Table + ( "" , [] , [] ) + (Caption Nothing []) + [ ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + ] + (TableHead + ( "" , [] , [] ) + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain + [ Str "Sum" + , Space + , Str "of" + , Space + , Str "Age" + ] + ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Column" , Space , Str "Labels" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + ] + ]) + [ TableBody + ( "" , [] , [] ) + (RowHeadColumns 0) + [] + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Row" , Space , Str "Labels" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Moscow" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Switzerland" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "(blank)" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Grand" , Space , Str "Total" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Anton" , Space , Str "Antich" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23.0" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "James" , Space , Str "Bond" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "35.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "35.0" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "(blank)" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Grand" , Space , Str "Total" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "35.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23.0" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "58.0" ] ] + ] + ] + ] + (TableFoot ( "" , [] , [] ) []) + ] diff --git a/test/xlsx-reader/basic.xlsx b/test/xlsx-reader/basic.xlsx Binary files differnew file mode 100644 index 000000000..55d62d56e --- /dev/null +++ b/test/xlsx-reader/basic.xlsx |
