diff options
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
| -rw-r--r-- | src/Text/Pandoc/Shared.hs | 293 |
1 files changed, 175 insertions, 118 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 797a0a0b0..926116e23 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -8,6 +8,7 @@ {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Shared Copyright : Copyright (C) 2006-2019 John MacFarlane @@ -22,14 +23,20 @@ Utility functions and definitions used by the various Pandoc modules. module Text.Pandoc.Shared ( -- * List processing splitBy, + splitTextBy, splitByIndices, splitStringByIndices, + splitTextByIndices, substitute, ordNub, -- * Text processing ToString (..), + ToText (..), + tshow, backslashEscapes, escapeStringUsing, + elemText, + notElemText, stripTrailingNewlines, trim, triml, @@ -37,6 +44,7 @@ module Text.Pandoc.Shared ( trimMath, stripFirstAndLast, camelCaseToHyphenated, + camelCaseStrToHyphenated, toRomanNumeral, escapeURI, tabFilter, @@ -90,6 +98,7 @@ module Text.Pandoc.Shared ( defaultBlocksSeparator, -- * Safe read safeRead, + safeStrRead, -- * User data directory defaultUserDataDirs, -- * Version @@ -133,8 +142,8 @@ import Text.DocLayout (charWidth) import Text.Pandoc.Walk -- | Version number of pandoc library. -pandocVersion :: String -pandocVersion = showVersion version +pandocVersion :: T.Text +pandocVersion = T.pack $ showVersion version -- -- List processing @@ -148,6 +157,13 @@ splitBy isSep lst = rest' = dropWhile isSep rest in first:splitBy isSep rest' +splitTextBy :: (Char -> Bool) -> T.Text -> [T.Text] +splitTextBy isSep t + | T.null t = [] + | otherwise = let (first, rest) = T.break isSep t + rest' = T.dropWhile isSep rest + in first : splitTextBy isSep rest' + splitByIndices :: [Int] -> [a] -> [[a]] splitByIndices [] lst = [lst] splitByIndices (x:xs) lst = first:splitByIndices (map (\y -> y - x) xs) rest @@ -160,6 +176,9 @@ splitStringByIndices (x:xs) lst = let (first, rest) = splitAt' x lst in first : splitStringByIndices (map (\y -> y - x) xs) rest +splitTextByIndices :: [Int] -> T.Text -> [T.Text] +splitTextByIndices ns = fmap T.pack . splitStringByIndices ns . T.unpack + splitAt' :: Int -> [Char] -> ([Char],[Char]) splitAt' _ [] = ([],[]) splitAt' n xs | n <= 0 = ([],xs) @@ -195,89 +214,115 @@ instance ToString String where instance ToString T.Text where toString = T.unpack +class ToText a where + toText :: a -> T.Text + +instance ToText String where + toText = T.pack + +instance ToText T.Text where + toText = id + +tshow :: Show a => a -> T.Text +tshow = T.pack . show + -- | Returns an association list of backslash escapes for the -- designated characters. backslashEscapes :: [Char] -- ^ list of special characters to escape - -> [(Char, String)] -backslashEscapes = map (\ch -> (ch, ['\\',ch])) + -> [(Char, T.Text)] +backslashEscapes = map (\ch -> (ch, T.pack ['\\',ch])) -- | Escape a string of characters, using an association list of -- characters and strings. -escapeStringUsing :: [(Char, String)] -> String -> String -escapeStringUsing _ [] = "" -escapeStringUsing escapeTable (x:xs) = - case lookup x escapeTable of - Just str -> str ++ rest - Nothing -> x:rest - where rest = escapeStringUsing escapeTable xs +escapeStringUsing :: [(Char, T.Text)] -> T.Text -> T.Text +escapeStringUsing tbl = T.concatMap $ \c -> fromMaybe (T.singleton c) $ lookup c tbl + +-- | @True@ exactly when the @Char@ appears in the @Text@. +elemText :: Char -> T.Text -> Bool +elemText c = T.any (== c) + +-- | @True@ exactly when the @Char@ does not appear in the @Text@. +notElemText :: Char -> T.Text -> Bool +notElemText c = T.all (/= c) -- | Strip trailing newlines from string. -stripTrailingNewlines :: String -> String -stripTrailingNewlines = reverse . dropWhile (== '\n') . reverse +stripTrailingNewlines :: T.Text -> T.Text +stripTrailingNewlines = T.dropWhileEnd (== '\n') -- | Remove leading and trailing space (including newlines) from string. -trim :: String -> String -trim = triml . trimr +trim :: T.Text -> T.Text +trim = T.dropAround (`elemText` " \r\n\t") -- | Remove leading space (including newlines) from string. -triml :: String -> String -triml = dropWhile (`elem` " \r\n\t") +triml :: T.Text -> T.Text +triml = T.dropWhile (`elemText` " \r\n\t") -- | Remove trailing space (including newlines) from string. -trimr :: String -> String -trimr = reverse . triml . reverse +trimr :: T.Text -> T.Text +trimr = T.dropWhileEnd (`elemText` " \r\n\t") -- | Trim leading space and trailing space unless after \. -trimMath :: String -> String -trimMath = triml . reverse . stripspace . reverse +trimMath :: T.Text -> T.Text +trimMath = triml . T.reverse . stripBeginSpace . T.reverse -- no Text.spanEnd where - stripspace (c1:c2:cs) - | c1 `elem` [' ','\t','\n','\r'] - , c2 /= '\\' = stripspace (c2:cs) - stripspace cs = cs + stripBeginSpace t + | T.null pref = t + | Just ('\\', _) <- T.uncons suff = T.cons (T.last pref) suff + | otherwise = suff + where + (pref, suff) = T.span (`elemText` " \t\n\r") t -- | Strip leading and trailing characters from string -stripFirstAndLast :: String -> String -stripFirstAndLast str = - drop 1 $ take (length str - 1) str +stripFirstAndLast :: T.Text -> T.Text +stripFirstAndLast t = case T.uncons t of + Just (_, t') -> case T.unsnoc t' of + Just (t'', _) -> t'' + _ -> t' + _ -> "" -- | Change CamelCase word to hyphenated lowercase (e.g., camel-case). -camelCaseToHyphenated :: String -> String -camelCaseToHyphenated [] = "" -camelCaseToHyphenated (a:b:rest) +camelCaseToHyphenated :: T.Text -> T.Text +camelCaseToHyphenated = T.pack . camelCaseStrToHyphenated . T.unpack + +-- This may not work as expected on general Unicode, if it contains +-- letters with a longer lower case form than upper case. I don't know +-- what the camel case practices of affected scripts are, though. +camelCaseStrToHyphenated :: String -> String +camelCaseStrToHyphenated [] = "" +camelCaseStrToHyphenated (a:b:rest) | isLower a - , isUpper b = a:'-':toLower b:camelCaseToHyphenated rest + , isUpper b = a:'-':toLower b:camelCaseStrToHyphenated rest -- handle ABCDef = abc-def -camelCaseToHyphenated (a:b:c:rest) +camelCaseStrToHyphenated (a:b:c:rest) | isUpper a , isUpper b - , isLower c = toLower a:'-':toLower b:camelCaseToHyphenated (c:rest) -camelCaseToHyphenated (a:rest) = toLower a:camelCaseToHyphenated rest + , isLower c = toLower a:'-':toLower b:camelCaseStrToHyphenated (c:rest) +camelCaseStrToHyphenated (a:rest) = toLower a:camelCaseStrToHyphenated rest -- | Convert number < 4000 to uppercase roman numeral. -toRomanNumeral :: Int -> String +toRomanNumeral :: Int -> T.Text toRomanNumeral x | x >= 4000 || x < 0 = "?" - | x >= 1000 = "M" ++ toRomanNumeral (x - 1000) - | x >= 900 = "CM" ++ toRomanNumeral (x - 900) - | x >= 500 = "D" ++ toRomanNumeral (x - 500) - | x >= 400 = "CD" ++ toRomanNumeral (x - 400) - | x >= 100 = "C" ++ toRomanNumeral (x - 100) - | x >= 90 = "XC" ++ toRomanNumeral (x - 90) - | x >= 50 = "L" ++ toRomanNumeral (x - 50) - | x >= 40 = "XL" ++ toRomanNumeral (x - 40) - | x >= 10 = "X" ++ toRomanNumeral (x - 10) + | x >= 1000 = "M" <> toRomanNumeral (x - 1000) + | x >= 900 = "CM" <> toRomanNumeral (x - 900) + | x >= 500 = "D" <> toRomanNumeral (x - 500) + | x >= 400 = "CD" <> toRomanNumeral (x - 400) + | x >= 100 = "C" <> toRomanNumeral (x - 100) + | x >= 90 = "XC" <> toRomanNumeral (x - 90) + | x >= 50 = "L" <> toRomanNumeral (x - 50) + | x >= 40 = "XL" <> toRomanNumeral (x - 40) + | x >= 10 = "X" <> toRomanNumeral (x - 10) | x == 9 = "IX" - | x >= 5 = "V" ++ toRomanNumeral (x - 5) + | x >= 5 = "V" <> toRomanNumeral (x - 5) | x == 4 = "IV" - | x >= 1 = "I" ++ toRomanNumeral (x - 1) + | x >= 1 = "I" <> toRomanNumeral (x - 1) | otherwise = "" -- | Escape whitespace and some punctuation characters in URI. -escapeURI :: String -> String -escapeURI = escapeURIString (not . needsEscaping) - where needsEscaping c = isSpace c || c `elem` - ['<','>','|','"','{','}','[',']','^', '`'] +escapeURI :: T.Text -> T.Text +escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack + where needsEscaping c = isSpace c || c `elemText` "<>|\"{}[]^`" + -- | Convert tabs to spaces. Tabs will be preserved if tab stop is set to 0. tabFilter :: Int -- ^ Tab stop @@ -304,8 +349,11 @@ crFilter = T.filter (/= '\r') -- | Parse a date and convert (if possible) to "YYYY-MM-DD" format. We -- limit years to the range 1601-9999 (ISO 8601 accepts greater than -- or equal to 1583, but MS Word only accepts dates starting 1601). -normalizeDate :: String -> Maybe String -normalizeDate s = fmap (formatTime defaultTimeLocale "%F") +normalizeDate :: T.Text -> Maybe T.Text +normalizeDate = fmap T.pack . normalizeDate' . T.unpack + +normalizeDate' :: String -> Maybe String +normalizeDate' s = fmap (formatTime defaultTimeLocale "%F") (msum $ map (\fs -> parsetimeWith fs s >>= rejectBadYear) formats :: Maybe Day) where rejectBadYear day = case toGregorian day of (y, _, _) | y >= 1601 && y <= 9999 -> Just day @@ -321,26 +369,26 @@ normalizeDate s = fmap (formatTime defaultTimeLocale "%F") -- | Generate infinite lazy list of markers for an ordered list, -- depending on list attributes. -orderedListMarkers :: (Int, ListNumberStyle, ListNumberDelim) -> [String] +orderedListMarkers :: (Int, ListNumberStyle, ListNumberDelim) -> [T.Text] orderedListMarkers (start, numstyle, numdelim) = - let singleton c = [c] - nums = case numstyle of - DefaultStyle -> map show [start..] - Example -> map show [start..] - Decimal -> map show [start..] + let nums = case numstyle of + DefaultStyle -> map tshow [start..] + Example -> map tshow [start..] + Decimal -> map tshow [start..] UpperAlpha -> drop (start - 1) $ cycle $ - map singleton ['A'..'Z'] + map T.singleton ['A'..'Z'] LowerAlpha -> drop (start - 1) $ cycle $ - map singleton ['a'..'z'] + map T.singleton ['a'..'z'] UpperRoman -> map toRomanNumeral [start..] - LowerRoman -> map (map toLower . toRomanNumeral) [start..] + LowerRoman -> map (T.toLower . toRomanNumeral) [start..] inDelim str = case numdelim of - DefaultDelim -> str ++ "." - Period -> str ++ "." - OneParen -> str ++ ")" - TwoParens -> "(" ++ str ++ ")" + DefaultDelim -> str <> "." + Period -> str <> "." + OneParen -> str <> ")" + TwoParens -> "(" <> str <> ")" in map inDelim nums + -- | Extract the leading and trailing spaces from inside an inline element -- and place them outside the element. SoftBreaks count as Spaces for -- these purposes. @@ -387,15 +435,16 @@ deQuote x = x -- | Convert pandoc structure to a string with formatting removed. -- Footnotes are skipped (since we don't want their contents in link -- labels). -stringify :: Walkable Inline a => a -> String +stringify :: Walkable Inline a => a -> T.Text stringify = query go . walk (deNote . deQuote) - where go :: Inline -> [Char] + where go :: Inline -> T.Text go Space = " " go SoftBreak = " " go (Str x) = x go (Code _ x) = x go (Math _ x) = x - go (RawInline (Format "html") ('<':'b':'r':_)) = " " -- see #2105 + go (RawInline (Format "html") (T.unpack -> ('<':'b':'r':_))) + = " " -- see #2105 go LineBreak = " " go _ = "" @@ -407,7 +456,7 @@ stringify = query go . walk (deNote . deQuote) capitalize :: Walkable Inline a => a -> a capitalize = walk go where go :: Inline -> Inline - go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go (Str s) = Str $ T.toUpper s go x = x -- | Change final list item from @Para@ to @Plain@ if the list contains @@ -463,7 +512,7 @@ isPara _ = False -- | Convert Pandoc inline list to plain text identifier. HTML -- identifiers must start with a letter, and may contain only -- letters, digits, and the characters _-. -inlineListToIdentifier :: Extensions -> [Inline] -> String +inlineListToIdentifier :: Extensions -> [Inline] -> T.Text inlineListToIdentifier exts = dropNonLetter . filterAscii . toIdent . stringify . walk unEmojify where @@ -476,23 +525,23 @@ inlineListToIdentifier exts = unEmoji x = x dropNonLetter | extensionEnabled Ext_gfm_auto_identifiers exts = id - | otherwise = dropWhile (not . isAlpha) + | otherwise = T.dropWhile (not . isAlpha) filterAscii | extensionEnabled Ext_ascii_identifiers exts - = mapMaybe toAsciiChar + = T.pack . mapMaybe toAsciiChar . T.unpack | otherwise = id toIdent | extensionEnabled Ext_gfm_auto_identifiers exts = - filterPunct . spaceToDash . map toLower - | otherwise = intercalate "-" . words . filterPunct . map toLower - filterPunct = filter (\c -> isSpace c || isAlphaNum c || isAllowedPunct c) + filterPunct . spaceToDash . T.toLower + | otherwise = T.intercalate "-" . T.words . filterPunct . T.toLower + filterPunct = T.filter (\c -> isSpace c || isAlphaNum c || isAllowedPunct c) isAllowedPunct c | extensionEnabled Ext_gfm_auto_identifiers exts = c == '-' || c == '_' || generalCategory c `elem` [NonSpacingMark, SpacingCombiningMark, EnclosingMark, ConnectorPunctuation] | otherwise = c == '_' || c == '-' || c == '.' - spaceToDash = map (\c -> if isSpace c then '-' else c) + spaceToDash = T.map (\c -> if isSpace c then '-' else c) -- | Put a list of Pandoc blocks into a hierarchical structure: @@ -529,7 +578,7 @@ makeSections numbering mbBaseLevel bs = -- don't touch number if already present case lookup "number" kvs of Nothing | numbering -> - ("number", intercalate "." (map show newnum)) : kvs + ("number", T.intercalate "." (map tshow newnum)) : kvs _ -> kvs) return $ Div divattr (Header level' attr title' : sectionContents') : rest' @@ -542,7 +591,7 @@ makeSections numbering mbBaseLevel bs = let inner' = case inner of (Div (dident',dclasses',dkvs') zs@(Header{}:zs') : ws) - | null dident -> + | T.null dident -> Div (dident',dclasses' ++ dclasses,dkvs' ++ dkvs) zs : ws | otherwise -> -- keep id on header so we don't lose anchor Div (dident,dclasses ++ dclasses',dkvs ++ dkvs') @@ -564,7 +613,7 @@ headerLtEq _ _ = False -- | Generate a unique identifier from a list of inlines. -- Second argument is a list of already used identifiers. -uniqueIdent :: Extensions -> [Inline] -> Set.Set String -> String +uniqueIdent :: Extensions -> [Inline] -> Set.Set T.Text -> T.Text uniqueIdent exts title' usedIdents = if baseIdent `Set.member` usedIdents then case find (\x -> not $ numIdent x `Set.member` usedIdents) @@ -577,7 +626,7 @@ uniqueIdent exts title' usedIdents = baseIdent = case inlineListToIdentifier exts title' of "" -> "section" x -> x - numIdent n = baseIdent ++ "-" ++ show n + numIdent n = baseIdent <> "-" <> tshow n -- | True if block is a Header block. isHeaderBlock :: Block -> Bool @@ -664,7 +713,7 @@ handleTaskListItem handleInlines exts bls = -- | Set a field of a 'Meta' object. If the field already has a value, -- convert it into a list with the new value appended to the old value(s). addMetaField :: ToMetaValue a - => String + => T.Text -> a -> Meta -> Meta @@ -686,12 +735,16 @@ makeMeta title authors date = -- | Remove soft breaks between East Asian characters. eastAsianLineBreakFilter :: Pandoc -> Pandoc eastAsianLineBreakFilter = bottomUp go - where go (x:SoftBreak:y:zs) = - case (stringify x, stringify y) of - (xs@(_:_), c:_) - | charWidth (last xs) == 2 && charWidth c == 2 -> x:y:zs - _ -> x:SoftBreak:y:zs - go xs = xs + where go (x:SoftBreak:y:zs) + | Just (_, b) <- T.unsnoc $ stringify x + , Just (c, _) <- T.uncons $ stringify y + , charWidth b == 2 + , charWidth c == 2 + = x:y:zs + | otherwise + = x:SoftBreak:y:zs + go xs + = xs -- | Builder for underline. -- This probably belongs in Builder.hs in pandoc-types. @@ -702,27 +755,28 @@ underlineSpan = B.spanWith ("", ["underline"], []) -- | Set of HTML elements that are represented as Span with a class equal as -- the element tag itself. htmlSpanLikeElements :: Set.Set T.Text -htmlSpanLikeElements = Set.fromList [T.pack "kbd", T.pack "mark", T.pack "dfn"] +htmlSpanLikeElements = Set.fromList ["kbd", "mark", "dfn"] -- | Returns the first sentence in a list of inlines, and the rest. breakSentence :: [Inline] -> ([Inline], [Inline]) breakSentence [] = ([],[]) breakSentence xs = - let isSentenceEndInline (Str ys@(_:_)) | last ys == '.' = True - isSentenceEndInline (Str ys@(_:_)) | last ys == '?' = True - isSentenceEndInline LineBreak = True - isSentenceEndInline _ = False + let isSentenceEndInline (Str ys) + | Just (_, c) <- T.unsnoc ys = c == '.' || c == '?' + isSentenceEndInline LineBreak = True + isSentenceEndInline _ = False (as, bs) = break isSentenceEndInline xs in case bs of - [] -> (as, []) - [c] -> (as ++ [c], []) - (c:Space:cs) -> (as ++ [c], cs) - (c:SoftBreak:cs) -> (as ++ [c], cs) - (Str ".":Str (')':ys):cs) -> (as ++ [Str ".", Str (')':ys)], cs) - (x@(Str ('.':')':_)):cs) -> (as ++ [x], cs) - (LineBreak:x@(Str ('.':_)):cs) -> (as ++[LineBreak], x:cs) - (c:cs) -> (as ++ [c] ++ ds, es) - where (ds, es) = breakSentence cs + [] -> (as, []) + [c] -> (as ++ [c], []) + (c:Space:cs) -> (as ++ [c], cs) + (c:SoftBreak:cs) -> (as ++ [c], cs) + (Str ".":Str s@(T.uncons -> Just (')',_)):cs) + -> (as ++ [Str ".", Str s], cs) + (x@(Str (T.stripPrefix ".)" -> Just _)):cs) -> (as ++ [x], cs) + (LineBreak:x@(Str (T.uncons -> Just ('.',_))):cs) -> (as ++[LineBreak], x:cs) + (c:cs) -> (as ++ [c] ++ ds, es) + where (ds, es) = breakSentence cs -- | Split a list of inlines into sentences. splitSentences :: [Inline] -> [[Inline]] @@ -763,10 +817,11 @@ filterIpynbOutput mode = walk go removeANSI (CodeBlock attr code) = CodeBlock attr (removeANSIEscapes code) removeANSI x = x - removeANSIEscapes [] = [] - removeANSIEscapes ('\x1b':'[':cs) = - removeANSIEscapes (drop 1 $ dropWhile (/='m') cs) - removeANSIEscapes (c:cs) = c : removeANSIEscapes cs + removeANSIEscapes t + | Just cs <- T.stripPrefix "\x1b[" t = + removeANSIEscapes $ T.drop 1 $ T.dropWhile (/='m') cs + | Just (c, cs) <- T.uncons t = T.cons c $ removeANSIEscapes cs + | otherwise = "" go x = x -- @@ -774,12 +829,12 @@ filterIpynbOutput mode = walk go -- -- | Render HTML tags. -renderTags' :: [Tag String] -> String +renderTags' :: [Tag T.Text] -> T.Text renderTags' = renderTagsOptions renderOptions{ optMinimize = matchTags ["hr", "br", "img", "meta", "link"] , optRawTag = matchTags ["script", "style"] } - where matchTags tags = flip elem tags . map toLower + where matchTags tags = flip elem tags . T.toLower -- -- File handling @@ -826,8 +881,8 @@ collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories -- Convert the path part of a file: URI to a regular path. -- On windows, @/c:/foo@ should be @c:/foo@. -- On linux, @/foo@ should be @/foo@. -uriPathToPath :: String -> FilePath -uriPathToPath path = +uriPathToPath :: T.Text -> FilePath +uriPathToPath (T.unpack -> path) = #ifdef _WINDOWS case path of '/':ps -> ps @@ -853,7 +908,7 @@ filteredFilesFromArchive zf f = -- | Schemes from http://www.iana.org/assignments/uri-schemes.html plus -- the unofficial schemes doi, javascript, isbn, pmid. -schemes :: Set.Set String +schemes :: Set.Set T.Text schemes = Set.fromList -- Official IANA schemes [ "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra", "afp", "afs" @@ -905,11 +960,11 @@ schemes = Set.fromList -- | Check if the string is a valid URL with a IANA or frequently used but -- unofficial scheme (see @schemes@). -isURI :: String -> Bool -isURI = maybe False hasKnownScheme . parseURI +isURI :: T.Text -> Bool +isURI = maybe False hasKnownScheme . parseURI . T.unpack where - hasKnownScheme = (`Set.member` schemes) . map toLower . - filter (/= ':') . uriScheme + hasKnownScheme = (`Set.member` schemes) . T.toLower . + T.filter (/= ':') . T.pack . uriScheme --- --- Squash blocks into inlines @@ -962,12 +1017,14 @@ defaultBlocksSeparator = -- Safe read -- -safeRead :: (MonadPlus m, Read a) => String -> m a -safeRead s = case reads s of +safeRead :: (MonadPlus m, Read a) => T.Text -> m a +safeRead = safeStrRead . T.unpack + +safeStrRead :: (MonadPlus m, Read a) => String -> m a +safeStrRead s = case reads s of (d,x):_ | all isSpace x -> return d _ -> mzero - -- -- User data directory -- |
