aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs231
1 files changed, 112 insertions, 119 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index e03ac6a97..1c2892d6a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,5 @@
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
@@ -35,8 +35,7 @@ import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
import Data.Char (isAlphaNum, isLetter)
import Data.Default (Default (..), def)
import Data.Foldable (for_)
-import Data.List (isPrefixOf)
-import Data.List.Split (wordsBy, splitWhen)
+import Data.List.Split (splitWhen)
import qualified Data.Map as M
import Data.Maybe (fromMaybe, isJust, isNothing)
import Data.Monoid (First (..))
@@ -62,8 +61,8 @@ import Text.Pandoc.Options (
extensionEnabled)
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
- extractSpaces, htmlSpanLikeElements,
- onlySimpleTableCells, safeRead, underlineSpan)
+ extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
+ onlySimpleTableCells, safeRead, underlineSpan, tshow)
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
@@ -93,14 +92,14 @@ readHtml opts inp = do
"source" tags
case result of
Right doc -> return doc
- Left err -> throwError $ PandocParseError $ getError err
+ Left err -> throwError $ PandocParseError $ T.pack $ getError err
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
return $ walk (replaceNotes' (noteTable st)) bs
-replaceNotes' :: [(String, Blocks)] -> Inline -> Inline
+replaceNotes' :: [(Text, Blocks)] -> Inline -> Inline
replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
replaceNotes' _ x = x
@@ -108,9 +107,9 @@ replaceNotes' _ x = x
data HTMLState =
HTMLState
{ parserState :: ParserState,
- noteTable :: [(String, Blocks)],
+ noteTable :: [(Text, Blocks)],
baseHref :: Maybe URI,
- identifiers :: Set.Set String,
+ identifiers :: Set.Set Text,
logMessages :: [LogMessage],
macros :: M.Map Text Macro
}
@@ -134,7 +133,7 @@ pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
for_ (lookup "lang" attr) $
- updateState . B.setMeta "lang" . B.text . T.unpack
+ updateState . B.setMeta "lang" . B.text
pInTags "html" block
pBody :: PandocMonad m => TagParser m Blocks
@@ -146,11 +145,11 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny)
setTitle t = mempty <$ updateState (B.setMeta "title" t)
pMetaTag = do
mt <- pSatisfy (matchTagOpen "meta" [])
- let name = T.unpack $ fromAttrib "name" mt
- if null name
+ let name = fromAttrib "name" mt
+ if T.null name
then return mempty
else do
- let content = T.unpack $ fromAttrib "content" mt
+ let content = fromAttrib "content" mt
updateState $ \s ->
let ps = parserState s in
s{ parserState = ps{
@@ -187,13 +186,13 @@ block = do
, pFigure
, pRawHtmlBlock
]
- trace (take 60 $ show $ B.toList res)
+ trace (T.take 60 $ tshow $ B.toList res)
return res
-namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
+namespaces :: PandocMonad m => [(Text, TagParser m Inlines)]
namespaces = [(mathMLNamespace, pMath True)]
-mathMLNamespace :: String
+mathMLNamespace :: Text
mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
eSwitch :: (PandocMonad m, Monoid a)
@@ -233,7 +232,7 @@ eFootnote = try $ do
content <- pInTags tag block
addNote ident content
-addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
+addNote :: PandocMonad m => Text -> Blocks -> TagParser m ()
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s})
eNoteref :: PandocMonad m => TagParser m Inlines
@@ -245,8 +244,8 @@ eNoteref = try $ do
-> (lookup "type" as <|> lookup "epub:type" as)
== Just "noteref"
_ -> False)
- ident <- case T.unpack <$> lookup "href" attr of
- Just ('#':rest) -> return rest
+ ident <- case lookup "href" attr >>= T.uncons of
+ Just ('#', rest) -> return rest
_ -> mzero
_ <- manyTill pAny (pSatisfy (\case
TagClose t -> t == tag
@@ -287,7 +286,7 @@ pListItem nonItem = do
maybe id addId (lookup "id" attr) <$>
pInTags "li" block <* skipMany nonItem
-parseListStyleType :: String -> ListNumberStyle
+parseListStyleType :: Text -> ListNumberStyle
parseListStyleType "lower-roman" = LowerRoman
parseListStyleType "upper-roman" = UpperRoman
parseListStyleType "lower-alpha" = LowerAlpha
@@ -295,7 +294,7 @@ parseListStyleType "upper-alpha" = UpperAlpha
parseListStyleType "decimal" = Decimal
parseListStyleType _ = DefaultStyle
-parseTypeAttr :: String -> ListNumberStyle
+parseTypeAttr :: Text -> ListNumberStyle
parseTypeAttr "i" = LowerRoman
parseTypeAttr "I" = UpperRoman
parseTypeAttr "a" = LowerAlpha
@@ -404,20 +403,19 @@ pDiv = try $ do
pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
pRawHtmlBlock = do
- raw <- T.unpack <$>
- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
- <|> pRawTag)
+ raw <- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
+ <|> pRawTag)
exts <- getOption readerExtensions
- if extensionEnabled Ext_raw_html exts && not (null raw)
+ if extensionEnabled Ext_raw_html exts && not (T.null raw)
then return $ B.rawBlock "html" raw
else ignore raw
-ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
+ignore :: (Monoid a, PandocMonad m) => Text -> TagParser m a
ignore raw = do
pos <- getPosition
-- raw can be null for tags like <!DOCTYPE>; see paRawTag
-- in this case we don't want a warning:
- unless (null raw) $
+ unless (T.null raw) $
logMessage $ SkippedContent raw pos
return mempty
@@ -438,7 +436,7 @@ eSection = try $ do
headerLevel :: Text -> TagParser m Int
headerLevel tagtype =
- case safeRead (T.unpack (T.drop 1 tagtype)) of
+ case safeRead (T.drop 1 tagtype) of
Just level ->
-- try (do
-- guardEnabled Ext_epub_html_exts
@@ -468,7 +466,7 @@ pHeader = try $ do
level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
- let classes = maybe [] words $ lookup "class" attr
+ let classes = maybe [] T.words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
attr'' <- registerHeader (ident, classes, keyvals) contents
return $ if bodyTitle
@@ -529,14 +527,14 @@ pCol = try $ do
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
let width = case lookup "width" attribs of
- Nothing -> case lookup "style" attribs of
- Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
- fromMaybe 0.0 $ safeRead (filter
- (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
- _ -> 0.0
- Just x | not (null x) && last x == '%' ->
- fromMaybe 0.0 $ safeRead (init x)
- _ -> 0.0
+ Nothing -> case lookup "style" attribs of
+ Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+ fromMaybe 0.0 $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> 0.0
+ Just (T.unsnoc -> Just (xs, '%')) ->
+ fromMaybe 0.0 $ safeRead xs
+ _ -> 0.0
if width > 0.0
then return $ width / 100.0
else return 0.0
@@ -562,7 +560,7 @@ pCell celltype = try $ do
let extractAlign' [] = ""
extractAlign' ("text-align":x:_) = x
extractAlign' (_:xs) = extractAlign' xs
- let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
+ let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:")
let align = case maybeFromAttrib "align" tag `mplus`
(extractAlign <$> maybeFromAttrib "style" tag) of
Just "left" -> AlignLeft
@@ -610,7 +608,7 @@ pFigure = try $ do
let caption = fromMaybe mempty mbcap
case B.toList <$> mbimg of
Just [Image attr _ (url, tit)] ->
- return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
+ return $ B.para $ B.imageWith attr url ("fig:" <> tit) caption
_ -> mzero
pCodeBlock :: PandocMonad m => TagParser m Blocks
@@ -618,21 +616,21 @@ pCodeBlock = try $ do
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
let attr = toStringAttr attr'
contents <- manyTill pAny (pCloses "pre" <|> eof)
- let rawText = concatMap tagToString contents
+ let rawText = T.concat $ map tagToText contents
-- drop leading newline if any
- let result' = case rawText of
- '\n':xs -> xs
- _ -> rawText
+ let result' = case T.uncons rawText of
+ Just ('\n', xs) -> xs
+ _ -> rawText
-- drop trailing newline if any
- let result = case reverse result' of
- '\n':_ -> init result'
- _ -> result'
+ let result = case T.unsnoc result' of
+ Just (result'', '\n') -> result''
+ _ -> result'
return $ B.codeBlockWith (mkAttr attr) result
-tagToString :: Tag Text -> String
-tagToString (TagText s) = T.unpack s
-tagToString (TagOpen "br" _) = "\n"
-tagToString _ = ""
+tagToText :: Tag Text -> Text
+tagToText (TagText s) = s
+tagToText (TagOpen "br" _) = "\n"
+tagToText _ = ""
inline :: PandocMonad m => TagParser m Inlines
inline = choice
@@ -667,7 +665,7 @@ pLocation = do
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
- token show (const pos) (\x -> if f x then Just x else Nothing)
+ token tshow (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
@@ -688,10 +686,10 @@ pQ = choice $ map try [citedQuote, normalQuote]
where citedQuote = do
tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst))
- url <- canonicalizeUrl $ T.unpack $ fromAttrib "cite" tag
- let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+ url <- canonicalizeUrl $ fromAttrib "cite" tag
+ let uid = fromMaybe (fromAttrib "name" tag) $
maybeFromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ let cls = T.words $ fromAttrib "class" tag
makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)])
normalQuote = do
@@ -729,7 +727,7 @@ pSpanLike =
TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True)
let (ids, cs, kvs) = mkAttr . toStringAttr $ attrs
content <- mconcat <$> manyTill inline (pCloses tagName <|> eof)
- return $ B.spanWith (ids, T.unpack tagName : cs, kvs) content
+ return $ B.spanWith (ids, tagName : cs, kvs) content
pSmall :: PandocMonad m => TagParser m Inlines
pSmall = pInlinesInTags "small" (B.spanWith ("",["small"],[]))
@@ -753,19 +751,18 @@ pLineBreak = do
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: String -> Tag Text -> Maybe String
-maybeFromAttrib name (TagOpen _ attrs) =
- T.unpack <$> lookup (T.pack name) attrs
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
maybeFromAttrib _ _ = Nothing
pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
- let title = T.unpack $ fromAttrib "title" tag
+ let title = fromAttrib "title" tag
-- take id from id attribute if present, otherwise name
- let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+ let uid = fromMaybe (fromAttrib "name" tag) $
maybeFromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ let cls = T.words $ fromAttrib "class" tag
lab <- mconcat <$> manyTill inline (pCloses "a")
-- check for href; if href, then a link, otherwise a span
case maybeFromAttrib "href" tag of
@@ -778,34 +775,33 @@ pLink = try $ do
pImage :: PandocMonad m => TagParser m Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
- url <- canonicalizeUrl $ T.unpack $ fromAttrib "src" tag
- let title = T.unpack $ fromAttrib "title" tag
- let alt = T.unpack $ fromAttrib "alt" tag
- let uid = T.unpack $ fromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ url <- canonicalizeUrl $ fromAttrib "src" tag
+ let title = fromAttrib "title" tag
+ let alt = fromAttrib "alt" tag
+ let uid = fromAttrib "id" tag
+ let cls = T.words $ fromAttrib "class" tag
let getAtt k = case fromAttrib k tag of
"" -> []
- v -> [(T.unpack k, T.unpack v)]
+ v -> [(k, v)]
let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"]
return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
-pCodeWithClass :: PandocMonad m => [(T.Text,String)] -> TagParser m Inlines
-pCodeWithClass elemToClass = try $ do
+pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines
+pCodeWithClass elemToClass = try $ do
let tagTest = flip elem . fmap fst $ elemToClass
TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True)
result <- manyTill pAny (pCloses open)
let (ids,cs,kvs) = mkAttr . toStringAttr $ attr'
cs' = maybe cs (:cs) . lookup open $ elemToClass
return . B.codeWith (ids,cs',kvs) .
- unwords . lines . T.unpack . innerText $ result
+ T.unwords . T.lines . innerText $ result
pCode :: PandocMonad m => TagParser m Inlines
pCode = try $ do
(TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
let attr = toStringAttr attr'
result <- manyTill pAny (pCloses open)
- return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $
- innerText result
+ return $ B.codeWith (mkAttr attr) $ T.unwords $ T.lines $ innerText result
pSpan :: PandocMonad m => TagParser m Inlines
pSpan = try $ do
@@ -817,7 +813,7 @@ pSpan = try $ do
where styleAttr = fromMaybe "" $ lookup "style" attr
fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
classes = fromMaybe [] $
- words <$> lookup "class" attr
+ T.words <$> lookup "class" attr
let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
return $ tag contents
@@ -829,18 +825,17 @@ pRawHtmlInline = do
then pSatisfy (not . isBlockTag)
else pSatisfy isInlineTag
exts <- getOption readerExtensions
- let raw = T.unpack $ renderTags' [result]
+ let raw = renderTags' [result]
if extensionEnabled Ext_raw_html exts
then return $ B.rawInline "html" raw
else ignore raw
-mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath :: Text -> Either Text Text
mathMLToTeXMath s = writeTeX <$> readMathML s
-toStringAttr :: [(Text, Text)] -> [(String, String)]
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go
- where go (x,y) = (T.unpack (fromMaybe x $ T.stripPrefix "data-" x),
- T.unpack y)
+ where go (x,y) = (fromMaybe x $ T.stripPrefix "data-" x, y)
pScriptMath :: PandocMonad m => TagParser m Inlines
pScriptMath = try $ do
@@ -849,8 +844,7 @@ pScriptMath = try $ do
Just x | "math/tex" `T.isPrefixOf` x
-> return $ "display" `T.isSuffixOf` x
_ -> mzero
- contents <- T.unpack . innerText <$>
- manyTill pAny (pSatisfy (matchTagClose "script"))
+ contents <- innerText <$> manyTill pAny (pSatisfy (matchTagClose "script"))
return $ (if isdisplay then B.displayMath else B.math) contents
pMath :: PandocMonad m => Bool -> TagParser m Inlines
@@ -862,11 +856,11 @@ pMath inCase = try $ do
unless inCase $
guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
contents <- manyTill pAny (pSatisfy (matchTagClose "math"))
- case mathMLToTeXMath (T.unpack $ renderTags $
+ case mathMLToTeXMath (renderTags $
[open] <> contents <> [TagClose "math"]) of
Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
- T.unpack $ innerText contents
- Right [] -> return mempty
+ innerText contents
+ Right "" -> return mempty
Right x -> return $ case lookup "display" attr of
Just "block" -> B.displayMath x
_ -> B.math x
@@ -925,7 +919,7 @@ pTagText = try $ do
parsed <- lift $ lift $
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
case parsed of
- Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
+ Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
Right result -> return $ mconcat result
pBlank :: PandocMonad m => TagParser m ()
@@ -954,11 +948,11 @@ pRawTeX = do
guardEnabled Ext_raw_tex
inp <- getInput
st <- getState
- res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" (T.unpack inp)
+ res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" inp
case res of
Left _ -> mzero
Right (contents, raw) -> do
- _ <- count (length raw) anyChar
+ _ <- count (T.length raw) anyChar
return $ B.rawInline "tex" contents
pStr :: PandocMonad m => InlinesParser m Inlines
@@ -966,7 +960,7 @@ pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
updateLastStrPos
- return $ B.str result
+ return $ B.str $ T.pack result
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -982,7 +976,7 @@ isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: PandocMonad m => InlinesParser m Inlines
-pSymbol = satisfy isSpecial >>= return . B.str . (:[])
+pSymbol = satisfy isSpecial >>= return . B.str . T.singleton
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
@@ -1019,7 +1013,7 @@ pBad = do
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
- return $ B.str [c']
+ return $ B.str $ T.singleton c'
pSpace :: PandocMonad m => InlinesParser m Inlines
pSpace = many1 (satisfy isSpace) >>= \xs ->
@@ -1156,8 +1150,8 @@ _ `closes` _ = False
-- | Matches a stretch of HTML in balanced tags.
htmlInBalanced :: Monad m
- => (Tag String -> Bool)
- -> ParserT String st m String
+ => (Tag Text -> Bool)
+ -> ParserT Text st m Text
htmlInBalanced f = try $ do
lookAhead (char '<')
inp <- getInput
@@ -1174,21 +1168,21 @@ htmlInBalanced f = try $ do
(TagClose _ : TagPosition er ec : _) -> do
let ls = er - sr
let cs = ec - sc
- lscontents <- unlines <$> count ls anyLine
+ lscontents <- T.unlines <$> count ls anyLine
cscontents <- count cs anyChar
closetag <- do
x <- many (satisfy (/='>'))
char '>'
return (x <> ">")
- return (lscontents <> cscontents <> closetag)
+ return $ lscontents <> T.pack cscontents <> T.pack closetag
_ -> mzero
_ -> mzero
-htmlInBalanced' :: String
- -> [Tag String]
- -> [Tag String]
+htmlInBalanced' :: Text
+ -> [Tag Text]
+ -> [Tag Text]
htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
- where go :: Int -> [Tag String] -> Maybe [Tag String]
+ where go :: Int -> [Tag Text] -> Maybe [Tag Text]
go n (t@(TagOpen tn' _):rest) | tn' == tagname =
(t :) <$> go (n + 1) rest
go 1 (t@(TagClose tn'):_) | tn' == tagname =
@@ -1204,8 +1198,8 @@ hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: (HasReaderOptions st, Monad m)
- => (Tag String -> Bool)
- -> ParserT [Char] st m (Tag String, String)
+ => (Tag Text -> Bool)
+ -> ParserT Text st m (Tag Text, Text)
htmlTag f = try $ do
lookAhead (char '<')
startpos <- getPosition
@@ -1213,7 +1207,7 @@ htmlTag f = try $ do
let ts = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = False
, optTagPosition = True }
- (inp ++ " ") -- add space to ensure that
+ (inp <> " ") -- add space to ensure that
-- we get a TagPosition after the tag
(next, ln, col) <- case ts of
(TagPosition{} : next : TagPosition ln col : _)
@@ -1225,13 +1219,12 @@ htmlTag f = try $ do
-- so we exclude . even though it's a valid character
-- in XML element names
let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
- let isName s = case s of
- [] -> False
- (c:cs) -> isLetter c && all isNameChar cs
- let isPI s = case s of
- ('?':_) -> True -- processing instruction
- _ -> False
-
+ let isName s = case T.uncons s of
+ Nothing -> False
+ Just (c, cs) -> isLetter c && T.all isNameChar cs
+ let isPI s = case T.uncons s of
+ Just ('?', _) -> True -- processing instruction
+ _ -> False
let endpos = if ln == 1
then setSourceColumn startpos
(sourceColumn startpos + (col - 1))
@@ -1247,18 +1240,18 @@ htmlTag f = try $ do
-- basic sanity check, since the parser is very forgiving
-- and finds tags in stuff like x<y)
guard $ isName tagname || isPI tagname
- guard $ not $ null tagname
+ guard $ not $ T.null tagname
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
- guard $ last tagname /= ':'
+ guard $ T.last tagname /= ':'
char '<'
rendered <- manyTill anyChar endAngle
- return (next, "<" ++ rendered ++ ">")
+ return (next, T.pack $ "<" ++ rendered ++ ">")
case next of
TagComment s
- | "<!--" `isPrefixOf` inp -> do
+ | "<!--" `T.isPrefixOf` inp -> do
string "<!--"
- count (length s) anyChar
+ count (T.length s) anyChar
string "-->"
stripComments <- getOption readerStripComments
if stripComments
@@ -1272,12 +1265,12 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
-mkAttr :: [(String, String)] -> Attr
+mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+ attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
- epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+ epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
@@ -1304,11 +1297,11 @@ isSpace _ = False
-- Utilities
-- | Adjusts a url according to the document's base URL.
-canonicalizeUrl :: PandocMonad m => String -> TagParser m String
+canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
canonicalizeUrl url = do
mbBaseHref <- baseHref <$> getState
- return $ case (parseURIReference url, mbBaseHref) of
- (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ return $ case (parseURIReference (T.unpack url), mbBaseHref) of
+ (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url