diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 231 |
1 files changed, 112 insertions, 119 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index e03ac6a97..1c2892d6a 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,5 @@ -{-# LANGUAGE NoImplicitPrelude #-} -{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} @@ -35,8 +35,7 @@ import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) -import Data.List (isPrefixOf) -import Data.List.Split (wordsBy, splitWhen) +import Data.List.Split (splitWhen) import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) @@ -62,8 +61,8 @@ import Text.Pandoc.Options ( extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, - extractSpaces, htmlSpanLikeElements, - onlySimpleTableCells, safeRead, underlineSpan) + extractSpaces, htmlSpanLikeElements, elemText, splitTextBy, + onlySimpleTableCells, safeRead, underlineSpan, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) @@ -93,14 +92,14 @@ readHtml opts inp = do "source" tags case result of Right doc -> return doc - Left err -> throwError $ PandocParseError $ getError err + Left err -> throwError $ PandocParseError $ T.pack $ getError err replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState return $ walk (replaceNotes' (noteTable st)) bs -replaceNotes' :: [(String, Blocks)] -> Inline -> Inline +replaceNotes' :: [(Text, Blocks)] -> Inline -> Inline replaceNotes' noteTbl (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) $ lookup ref noteTbl replaceNotes' _ x = x @@ -108,9 +107,9 @@ replaceNotes' _ x = x data HTMLState = HTMLState { parserState :: ParserState, - noteTable :: [(String, Blocks)], + noteTable :: [(Text, Blocks)], baseHref :: Maybe URI, - identifiers :: Set.Set String, + identifiers :: Set.Set Text, logMessages :: [LogMessage], macros :: M.Map Text Macro } @@ -134,7 +133,7 @@ pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAny for_ (lookup "lang" attr) $ - updateState . B.setMeta "lang" . B.text . T.unpack + updateState . B.setMeta "lang" . B.text pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks @@ -146,11 +145,11 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) setTitle t = mempty <$ updateState (B.setMeta "title" t) pMetaTag = do mt <- pSatisfy (matchTagOpen "meta" []) - let name = T.unpack $ fromAttrib "name" mt - if null name + let name = fromAttrib "name" mt + if T.null name then return mempty else do - let content = T.unpack $ fromAttrib "content" mt + let content = fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ @@ -187,13 +186,13 @@ block = do , pFigure , pRawHtmlBlock ] - trace (take 60 $ show $ B.toList res) + trace (T.take 60 $ tshow $ B.toList res) return res -namespaces :: PandocMonad m => [(String, TagParser m Inlines)] +namespaces :: PandocMonad m => [(Text, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] -mathMLNamespace :: String +mathMLNamespace :: Text mathMLNamespace = "http://www.w3.org/1998/Math/MathML" eSwitch :: (PandocMonad m, Monoid a) @@ -233,7 +232,7 @@ eFootnote = try $ do content <- pInTags tag block addNote ident content -addNote :: PandocMonad m => String -> Blocks -> TagParser m () +addNote :: PandocMonad m => Text -> Blocks -> TagParser m () addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) eNoteref :: PandocMonad m => TagParser m Inlines @@ -245,8 +244,8 @@ eNoteref = try $ do -> (lookup "type" as <|> lookup "epub:type" as) == Just "noteref" _ -> False) - ident <- case T.unpack <$> lookup "href" attr of - Just ('#':rest) -> return rest + ident <- case lookup "href" attr >>= T.uncons of + Just ('#', rest) -> return rest _ -> mzero _ <- manyTill pAny (pSatisfy (\case TagClose t -> t == tag @@ -287,7 +286,7 @@ pListItem nonItem = do maybe id addId (lookup "id" attr) <$> pInTags "li" block <* skipMany nonItem -parseListStyleType :: String -> ListNumberStyle +parseListStyleType :: Text -> ListNumberStyle parseListStyleType "lower-roman" = LowerRoman parseListStyleType "upper-roman" = UpperRoman parseListStyleType "lower-alpha" = LowerAlpha @@ -295,7 +294,7 @@ parseListStyleType "upper-alpha" = UpperAlpha parseListStyleType "decimal" = Decimal parseListStyleType _ = DefaultStyle -parseTypeAttr :: String -> ListNumberStyle +parseTypeAttr :: Text -> ListNumberStyle parseTypeAttr "i" = LowerRoman parseTypeAttr "I" = UpperRoman parseTypeAttr "a" = LowerAlpha @@ -404,20 +403,19 @@ pDiv = try $ do pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do - raw <- T.unpack <$> - (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea" - <|> pRawTag) + raw <- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea" + <|> pRawTag) exts <- getOption readerExtensions - if extensionEnabled Ext_raw_html exts && not (null raw) + if extensionEnabled Ext_raw_html exts && not (T.null raw) then return $ B.rawBlock "html" raw else ignore raw -ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a +ignore :: (Monoid a, PandocMonad m) => Text -> TagParser m a ignore raw = do pos <- getPosition -- raw can be null for tags like <!DOCTYPE>; see paRawTag -- in this case we don't want a warning: - unless (null raw) $ + unless (T.null raw) $ logMessage $ SkippedContent raw pos return mempty @@ -438,7 +436,7 @@ eSection = try $ do headerLevel :: Text -> TagParser m Int headerLevel tagtype = - case safeRead (T.unpack (T.drop 1 tagtype)) of + case safeRead (T.drop 1 tagtype) of Just level -> -- try (do -- guardEnabled Ext_epub_html_exts @@ -468,7 +466,7 @@ pHeader = try $ do level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr - let classes = maybe [] words $ lookup "class" attr + let classes = maybe [] T.words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] attr'' <- registerHeader (ident, classes, keyvals) contents return $ if bodyTitle @@ -529,14 +527,14 @@ pCol = try $ do optional $ pSatisfy (matchTagClose "col") skipMany pBlank let width = case lookup "width" attribs of - Nothing -> case lookup "style" attribs of - Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> - fromMaybe 0.0 $ safeRead (filter - (`notElem` (" \t\r\n%'\";" :: [Char])) xs) - _ -> 0.0 - Just x | not (null x) && last x == '%' -> - fromMaybe 0.0 $ safeRead (init x) - _ -> 0.0 + Nothing -> case lookup "style" attribs of + Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> + fromMaybe 0.0 $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> 0.0 + Just (T.unsnoc -> Just (xs, '%')) -> + fromMaybe 0.0 $ safeRead xs + _ -> 0.0 if width > 0.0 then return $ width / 100.0 else return 0.0 @@ -562,7 +560,7 @@ pCell celltype = try $ do let extractAlign' [] = "" extractAlign' ("text-align":x:_) = x extractAlign' (_:xs) = extractAlign' xs - let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':']) + let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:") let align = case maybeFromAttrib "align" tag `mplus` (extractAlign <$> maybeFromAttrib "style" tag) of Just "left" -> AlignLeft @@ -610,7 +608,7 @@ pFigure = try $ do let caption = fromMaybe mempty mbcap case B.toList <$> mbimg of Just [Image attr _ (url, tit)] -> - return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption + return $ B.para $ B.imageWith attr url ("fig:" <> tit) caption _ -> mzero pCodeBlock :: PandocMonad m => TagParser m Blocks @@ -618,21 +616,21 @@ pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) let attr = toStringAttr attr' contents <- manyTill pAny (pCloses "pre" <|> eof) - let rawText = concatMap tagToString contents + let rawText = T.concat $ map tagToText contents -- drop leading newline if any - let result' = case rawText of - '\n':xs -> xs - _ -> rawText + let result' = case T.uncons rawText of + Just ('\n', xs) -> xs + _ -> rawText -- drop trailing newline if any - let result = case reverse result' of - '\n':_ -> init result' - _ -> result' + let result = case T.unsnoc result' of + Just (result'', '\n') -> result'' + _ -> result' return $ B.codeBlockWith (mkAttr attr) result -tagToString :: Tag Text -> String -tagToString (TagText s) = T.unpack s -tagToString (TagOpen "br" _) = "\n" -tagToString _ = "" +tagToText :: Tag Text -> Text +tagToText (TagText s) = s +tagToText (TagOpen "br" _) = "\n" +tagToText _ = "" inline :: PandocMonad m => TagParser m Inlines inline = choice @@ -667,7 +665,7 @@ pLocation = do pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSat f = do pos <- getPosition - token show (const pos) (\x -> if f x then Just x else Nothing) + token tshow (const pos) (\x -> if f x then Just x else Nothing) pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSatisfy f = try $ optional pLocation >> pSat f @@ -688,10 +686,10 @@ pQ = choice $ map try [citedQuote, normalQuote] where citedQuote = do tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst)) - url <- canonicalizeUrl $ T.unpack $ fromAttrib "cite" tag - let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $ + url <- canonicalizeUrl $ fromAttrib "cite" tag + let uid = fromMaybe (fromAttrib "name" tag) $ maybeFromAttrib "id" tag - let cls = words $ T.unpack $ fromAttrib "class" tag + let cls = T.words $ fromAttrib "class" tag makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)]) normalQuote = do @@ -729,7 +727,7 @@ pSpanLike = TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True) let (ids, cs, kvs) = mkAttr . toStringAttr $ attrs content <- mconcat <$> manyTill inline (pCloses tagName <|> eof) - return $ B.spanWith (ids, T.unpack tagName : cs, kvs) content + return $ B.spanWith (ids, tagName : cs, kvs) content pSmall :: PandocMonad m => TagParser m Inlines pSmall = pInlinesInTags "small" (B.spanWith ("",["small"],[])) @@ -753,19 +751,18 @@ pLineBreak = do -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag Text -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = - T.unpack <$> lookup (T.pack name) attrs +maybeFromAttrib :: Text -> Tag Text -> Maybe Text +maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs maybeFromAttrib _ _ = Nothing pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) - let title = T.unpack $ fromAttrib "title" tag + let title = fromAttrib "title" tag -- take id from id attribute if present, otherwise name - let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $ + let uid = fromMaybe (fromAttrib "name" tag) $ maybeFromAttrib "id" tag - let cls = words $ T.unpack $ fromAttrib "class" tag + let cls = T.words $ fromAttrib "class" tag lab <- mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of @@ -778,34 +775,33 @@ pLink = try $ do pImage :: PandocMonad m => TagParser m Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") - url <- canonicalizeUrl $ T.unpack $ fromAttrib "src" tag - let title = T.unpack $ fromAttrib "title" tag - let alt = T.unpack $ fromAttrib "alt" tag - let uid = T.unpack $ fromAttrib "id" tag - let cls = words $ T.unpack $ fromAttrib "class" tag + url <- canonicalizeUrl $ fromAttrib "src" tag + let title = fromAttrib "title" tag + let alt = fromAttrib "alt" tag + let uid = fromAttrib "id" tag + let cls = T.words $ fromAttrib "class" tag let getAtt k = case fromAttrib k tag of "" -> [] - v -> [(T.unpack k, T.unpack v)] + v -> [(k, v)] let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) -pCodeWithClass :: PandocMonad m => [(T.Text,String)] -> TagParser m Inlines -pCodeWithClass elemToClass = try $ do +pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines +pCodeWithClass elemToClass = try $ do let tagTest = flip elem . fmap fst $ elemToClass TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True) result <- manyTill pAny (pCloses open) let (ids,cs,kvs) = mkAttr . toStringAttr $ attr' cs' = maybe cs (:cs) . lookup open $ elemToClass return . B.codeWith (ids,cs',kvs) . - unwords . lines . T.unpack . innerText $ result + T.unwords . T.lines . innerText $ result pCode :: PandocMonad m => TagParser m Inlines pCode = try $ do (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) let attr = toStringAttr attr' result <- manyTill pAny (pCloses open) - return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $ - innerText result + return $ B.codeWith (mkAttr attr) $ T.unwords $ T.lines $ innerText result pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do @@ -817,7 +813,7 @@ pSpan = try $ do where styleAttr = fromMaybe "" $ lookup "style" attr fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr classes = fromMaybe [] $ - words <$> lookup "class" attr + T.words <$> lookup "class" attr let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) return $ tag contents @@ -829,18 +825,17 @@ pRawHtmlInline = do then pSatisfy (not . isBlockTag) else pSatisfy isInlineTag exts <- getOption readerExtensions - let raw = T.unpack $ renderTags' [result] + let raw = renderTags' [result] if extensionEnabled Ext_raw_html exts then return $ B.rawInline "html" raw else ignore raw -mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath :: Text -> Either Text Text mathMLToTeXMath s = writeTeX <$> readMathML s -toStringAttr :: [(Text, Text)] -> [(String, String)] +toStringAttr :: [(Text, Text)] -> [(Text, Text)] toStringAttr = map go - where go (x,y) = (T.unpack (fromMaybe x $ T.stripPrefix "data-" x), - T.unpack y) + where go (x,y) = (fromMaybe x $ T.stripPrefix "data-" x, y) pScriptMath :: PandocMonad m => TagParser m Inlines pScriptMath = try $ do @@ -849,8 +844,7 @@ pScriptMath = try $ do Just x | "math/tex" `T.isPrefixOf` x -> return $ "display" `T.isSuffixOf` x _ -> mzero - contents <- T.unpack . innerText <$> - manyTill pAny (pSatisfy (matchTagClose "script")) + contents <- innerText <$> manyTill pAny (pSatisfy (matchTagClose "script")) return $ (if isdisplay then B.displayMath else B.math) contents pMath :: PandocMonad m => Bool -> TagParser m Inlines @@ -862,11 +856,11 @@ pMath inCase = try $ do unless inCase $ guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) contents <- manyTill pAny (pSatisfy (matchTagClose "math")) - case mathMLToTeXMath (T.unpack $ renderTags $ + case mathMLToTeXMath (renderTags $ [open] <> contents <> [TagClose "math"]) of Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ - T.unpack $ innerText contents - Right [] -> return mempty + innerText contents + Right "" -> return mempty Right x -> return $ case lookup "display" attr of Just "block" -> B.displayMath x _ -> B.math x @@ -925,7 +919,7 @@ pTagText = try $ do parsed <- lift $ lift $ flip runReaderT qu $ runParserT (many pTagContents) st "text" str case parsed of - Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'" + Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'" Right result -> return $ mconcat result pBlank :: PandocMonad m => TagParser m () @@ -954,11 +948,11 @@ pRawTeX = do guardEnabled Ext_raw_tex inp <- getInput st <- getState - res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" (T.unpack inp) + res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" inp case res of Left _ -> mzero Right (contents, raw) -> do - _ <- count (length raw) anyChar + _ <- count (T.length raw) anyChar return $ B.rawInline "tex" contents pStr :: PandocMonad m => InlinesParser m Inlines @@ -966,7 +960,7 @@ pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) updateLastStrPos - return $ B.str result + return $ B.str $ T.pack result isSpecial :: Char -> Bool isSpecial '"' = True @@ -982,7 +976,7 @@ isSpecial '\8221' = True isSpecial _ = False pSymbol :: PandocMonad m => InlinesParser m Inlines -pSymbol = satisfy isSpecial >>= return . B.str . (:[]) +pSymbol = satisfy isSpecial >>= return . B.str . T.singleton isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML @@ -1019,7 +1013,7 @@ pBad = do '\158' -> '\382' '\159' -> '\376' _ -> '?' - return $ B.str [c'] + return $ B.str $ T.singleton c' pSpace :: PandocMonad m => InlinesParser m Inlines pSpace = many1 (satisfy isSpace) >>= \xs -> @@ -1156,8 +1150,8 @@ _ `closes` _ = False -- | Matches a stretch of HTML in balanced tags. htmlInBalanced :: Monad m - => (Tag String -> Bool) - -> ParserT String st m String + => (Tag Text -> Bool) + -> ParserT Text st m Text htmlInBalanced f = try $ do lookAhead (char '<') inp <- getInput @@ -1174,21 +1168,21 @@ htmlInBalanced f = try $ do (TagClose _ : TagPosition er ec : _) -> do let ls = er - sr let cs = ec - sc - lscontents <- unlines <$> count ls anyLine + lscontents <- T.unlines <$> count ls anyLine cscontents <- count cs anyChar closetag <- do x <- many (satisfy (/='>')) char '>' return (x <> ">") - return (lscontents <> cscontents <> closetag) + return $ lscontents <> T.pack cscontents <> T.pack closetag _ -> mzero _ -> mzero -htmlInBalanced' :: String - -> [Tag String] - -> [Tag String] +htmlInBalanced' :: Text + -> [Tag Text] + -> [Tag Text] htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts - where go :: Int -> [Tag String] -> Maybe [Tag String] + where go :: Int -> [Tag Text] -> Maybe [Tag Text] go n (t@(TagOpen tn' _):rest) | tn' == tagname = (t :) <$> go (n + 1) rest go 1 (t@(TagClose tn'):_) | tn' == tagname = @@ -1204,8 +1198,8 @@ hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: (HasReaderOptions st, Monad m) - => (Tag String -> Bool) - -> ParserT [Char] st m (Tag String, String) + => (Tag Text -> Bool) + -> ParserT Text st m (Tag Text, Text) htmlTag f = try $ do lookAhead (char '<') startpos <- getPosition @@ -1213,7 +1207,7 @@ htmlTag f = try $ do let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = False , optTagPosition = True } - (inp ++ " ") -- add space to ensure that + (inp <> " ") -- add space to ensure that -- we get a TagPosition after the tag (next, ln, col) <- case ts of (TagPosition{} : next : TagPosition ln col : _) @@ -1225,13 +1219,12 @@ htmlTag f = try $ do -- so we exclude . even though it's a valid character -- in XML element names let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_' - let isName s = case s of - [] -> False - (c:cs) -> isLetter c && all isNameChar cs - let isPI s = case s of - ('?':_) -> True -- processing instruction - _ -> False - + let isName s = case T.uncons s of + Nothing -> False + Just (c, cs) -> isLetter c && T.all isNameChar cs + let isPI s = case T.uncons s of + Just ('?', _) -> True -- processing instruction + _ -> False let endpos = if ln == 1 then setSourceColumn startpos (sourceColumn startpos + (col - 1)) @@ -1247,18 +1240,18 @@ htmlTag f = try $ do -- basic sanity check, since the parser is very forgiving -- and finds tags in stuff like x<y) guard $ isName tagname || isPI tagname - guard $ not $ null tagname + guard $ not $ T.null tagname -- <https://example.org> should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] - guard $ last tagname /= ':' + guard $ T.last tagname /= ':' char '<' rendered <- manyTill anyChar endAngle - return (next, "<" ++ rendered ++ ">") + return (next, T.pack $ "<" ++ rendered ++ ">") case next of TagComment s - | "<!--" `isPrefixOf` inp -> do + | "<!--" `T.isPrefixOf` inp -> do string "<!--" - count (length s) anyChar + count (T.length s) anyChar string "-->" stripComments <- getOption readerStripComments if stripComments @@ -1272,12 +1265,12 @@ htmlTag f = try $ do handleTag tagname _ -> mzero -mkAttr :: [(String, String)] -> Attr +mkAttr :: [(Text, Text)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes + attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr - epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr -- Strip namespace prefixes stripPrefixes :: [Tag Text] -> [Tag Text] @@ -1304,11 +1297,11 @@ isSpace _ = False -- Utilities -- | Adjusts a url according to the document's base URL. -canonicalizeUrl :: PandocMonad m => String -> TagParser m String +canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text canonicalizeUrl url = do mbBaseHref <- baseHref <$> getState - return $ case (parseURIReference url, mbBaseHref) of - (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + return $ case (parseURIReference (T.unpack url), mbBaseHref) of + (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url |
