about summary refs log tree commit diff
diff options
context:
space:
mode:
author John MacFarlane <[email protected]> 2023-07-05 23:00:59 -0700
committer John MacFarlane <[email protected]> 2023-07-05 23:12:28 -0700
commit c693a8107f16ed7304bb8310da897e9303f7cfad (patch)
tree b9250f0a0ac7417c2d459a9a035567bec5f7aede
parent d3e485fcb0841240df7991d85a764a5b8e701985 (diff)
MediaWiki reader: revise treatment of "link trail."
Previously we only included ASCII letters. That is correct for English but not for, e.g., Spanish (see comment in #8525). A safer approach is to include all letters except those in the CJK unified ideograph ranges.
-rw-r--r-- src/Text/Pandoc/Readers/MediaWiki.hs 22
1 files changed, 20 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index eafaa8aa4..9f3aa24a6 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -19,7 +19,7 @@ module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
import Control.Monad
import Control.Monad.Except (throwError)
-import Data.Char (isAscii, isDigit, isLetter, isSpace)
+import Data.Char (isDigit, isLetter, isSpace)
import qualified Data.Foldable as F
import Data.List (intersperse)
import Data.Maybe (fromMaybe, maybeToList)
@@ -664,7 +664,8 @@ internalLink = try $ do
-- [[Help:Contents|] -> "Contents"
<|> return (B.text $ T.drop 1 $ T.dropWhile (/=':') pagename) )
sym "]]"
- linktrail <- B.text <$> manyChar (satisfy (\c -> isAscii c && isLetter c))
+ -- see #8525:
+ linktrail <- B.text <$> manyChar (satisfy (\c -> isLetter c && not (isCJK c)))
let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
if "Category:" `T.isPrefixOf` pagename
then do
@@ -672,6 +673,23 @@ internalLink = try $ do
return mempty
else return link
+isCJK :: Char -> Bool
+isCJK c =
+ (c >= '\x3400' && c <= '\x4DBF') ||
+ (c >= '\x4E00' && c <= '\x9FFF') ||
+ (c >= '\x20000' && c <= '\x2A6DF') ||
+ (c >= '\x2A700' && c <= '\x2B73F') ||
+ (c >= '\x2B740' && c <= '\x2B81F') ||
+ (c >= '\x2B820' && c <= '\x2CEAF') ||
+ (c >= '\x2CEB0' && c <= '\x2EBEF') ||
+ (c >= '\x30000' && c <= '\x3134F') ||
+ (c >= '\x31350' && c <= '\x323AF') ||
+ (c >= '\xF900' && c <= '\xFAFF') ||
+ (c >= '\x2F800' && c <= '\x2FA1F') ||
+ (c >= '\x2F00' && c <= '\x2FDF') ||
+ (c >= '\x2E80' && c <= '\x2EFF') ||
+ (c >= '\x3000' && c <= '\x303F')
+
externalLink :: PandocMonad m => MWParser m Inlines
externalLink = try $ do
char '['