aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2024-03-10 11:14:41 -0700
committerJohn MacFarlane <[email protected]>2024-03-10 11:14:41 -0700
commit812f82ab098a6896b6752fdfe6dc12cbf8502271 (patch)
treeed1fd351aef33831ed594d6e04265b014670b07e
parent626ffd74b4cb0038eb7d4ee638486f156753da1c (diff)
LaTeX reader: improve tokenization of `@`.
Make tokenization sensitive to `\makeatletter`/`\makeatother`. Previously we just always treated `@` as a letter. This led to bad results, e.g. with the sequence `\@`. E.g., `a\@ b` would parse as "ab" and `a\@b` as "a". Closes #9555.
-rw-r--r--src/Text/Pandoc/Readers/LaTeX/Parsing.hs67
-rw-r--r--test/command/9555.md12
2 files changed, 51 insertions, 28 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
index 388c6fa09..f60a5dd35 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
@@ -354,41 +354,41 @@ getInputTokens = do
Sources ((_,t):rest) -> tokenizeSources $ Sources ((pos,t):rest)
tokenize :: SourcePos -> Text -> [Tok]
-tokenize = totoks
+tokenize = totoks False
where
- totoks pos t =
+ totoks atIsLetter pos t =
case T.uncons t of
Nothing -> []
Just (c, rest)
| c == '\n' ->
Tok pos Newline "\n"
- : totoks (setSourceColumn (incSourceLine pos 1) 1) rest
+ : totoks atIsLetter (setSourceColumn (incSourceLine pos 1) 1) rest
| isSpaceOrTab c ->
let (sps, rest') = T.span isSpaceOrTab t
in Tok pos Spaces sps
- : totoks (incSourceColumn pos (T.length sps))
+ : totoks atIsLetter (incSourceColumn pos (T.length sps))
rest'
| isAlphaNum c ->
let (ws, rest') = T.span isAlphaNum t
in Tok pos Word ws
- : totoks (incSourceColumn pos (T.length ws)) rest'
+ : totoks atIsLetter (incSourceColumn pos (T.length ws)) rest'
| c == '%' ->
let (cs, rest') = T.break (== '\n') rest
in Tok pos Comment ("%" <> cs)
- : totoks (incSourceColumn pos (1 + T.length cs)) rest'
+ : totoks atIsLetter (incSourceColumn pos (1 + T.length cs)) rest'
| c == '\\' ->
case T.uncons rest of
Nothing -> [Tok pos (CtrlSeq " ") "\\"]
Just (d, rest')
- | isLetterOrAt d ->
- -- \makeatletter is common in macro defs;
- -- ideally we should make tokenization sensitive
- -- to \makeatletter and \makeatother, but this is
- -- probably best for now
- let (ws, rest'') = T.span isLetterOrAt rest
+ | isLetter' atIsLetter d ->
+ let (ws, rest'') = T.span (isLetter' atIsLetter) rest
(ss, rest''') = T.span isSpaceOrTab rest''
+ atIsLetter' = case ws of
+ "makeatletter" -> True
+ "makeatother" -> False
+ _ -> atIsLetter
in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
- : totoks (incSourceColumn pos
+ : totoks atIsLetter' (incSourceColumn pos
(1 + T.length ws + T.length ss)) rest'''
| isSpaceOrTab d || d == '\n' ->
let (w1, r1) = T.span isSpaceOrTab rest
@@ -401,15 +401,15 @@ tokenize = totoks
in case T.uncons r3 of
Just ('\n', _) ->
Tok pos (CtrlSeq " ") ("\\" <> w1)
- : totoks (incSourceColumn pos (T.length ws))
- r1
+ : totoks atIsLetter
+ (incSourceColumn pos (T.length ws)) r1
_ ->
Tok pos (CtrlSeq " ") ws
- : totoks (incSourceColumn pos (T.length ws))
- r3
+ : totoks atIsLetter
+ (incSourceColumn pos (T.length ws)) r3
| otherwise ->
Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
- : totoks (incSourceColumn pos 2) rest'
+ : totoks atIsLetter (incSourceColumn pos 2) rest'
| c == '#' ->
case T.uncons rest of
Just ('#', t3) ->
@@ -417,18 +417,20 @@ tokenize = totoks
in case safeRead t1 of
Just i ->
Tok pos (DeferredArg i) ("##" <> t1)
- : totoks (incSourceColumn pos (2 + T.length t1)) t2
+ : totoks atIsLetter
+ (incSourceColumn pos (2 + T.length t1)) t2
Nothing -> Tok pos Symbol "#"
: Tok (incSourceColumn pos 1) Symbol "#"
- : totoks (incSourceColumn pos 1) t3
+ : totoks atIsLetter (incSourceColumn pos 1) t3
_ ->
let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
in case safeRead t1 of
Just i ->
Tok pos (Arg i) ("#" <> t1)
- : totoks (incSourceColumn pos (1 + T.length t1)) t2
+ : totoks atIsLetter
+ (incSourceColumn pos (1 + T.length t1)) t2
Nothing -> Tok pos Symbol "#"
- : totoks (incSourceColumn pos 1) rest
+ : totoks atIsLetter (incSourceColumn pos 1) rest
| c == '^' ->
case T.uncons rest of
Just ('^', rest') ->
@@ -438,26 +440,35 @@ tokenize = totoks
case T.uncons rest'' of
Just (e, rest''') | isLowerHex e ->
Tok pos Esc2 (T.pack ['^','^',d,e])
- : totoks (incSourceColumn pos 4) rest'''
+ : totoks atIsLetter
+ (incSourceColumn pos 4) rest'''
_ ->
Tok pos Esc1 (T.pack ['^','^',d])
- : totoks (incSourceColumn pos 3) rest''
+ : totoks atIsLetter
+ (incSourceColumn pos 3) rest''
| d < '\128' ->
Tok pos Esc1 (T.pack ['^','^',d])
- : totoks (incSourceColumn pos 3) rest''
+ : totoks atIsLetter
+ (incSourceColumn pos 3) rest''
_ -> Tok pos Symbol "^" :
Tok (incSourceColumn pos 1) Symbol "^" :
- totoks (incSourceColumn pos 2) rest'
+ totoks atIsLetter (incSourceColumn pos 2) rest'
_ -> Tok pos Symbol "^"
- : totoks (incSourceColumn pos 1) rest
+ : totoks atIsLetter (incSourceColumn pos 1) rest
| otherwise ->
- Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
+ Tok pos Symbol (T.singleton c) :
+ totoks atIsLetter (incSourceColumn pos 1) rest
isSpaceOrTab :: Char -> Bool
isSpaceOrTab ' ' = True
isSpaceOrTab '\t' = True
isSpaceOrTab _ = False
+-- First parameter is True if @ is letter
+isLetter' :: Bool -> Char -> Bool
+isLetter' True '@' = True
+isLetter' _ c = isLetter c
+
isLetterOrAt :: Char -> Bool
isLetterOrAt '@' = True
isLetterOrAt c = isLetter c
diff --git a/test/command/9555.md b/test/command/9555.md
new file mode 100644
index 000000000..0631f71af
--- /dev/null
+++ b/test/command/9555.md
@@ -0,0 +1,12 @@
+```
+% pandoc -t native -f latex
+a\@ b\@c
+
+\makeatletter
+a\@ b\@c
+\makeatother
+a\@ b\@c
+[ Para [ Str "a" , Space , Str "bc" ]
+, Para [ Str "ab" , SoftBreak , Str "a" , Space , Str "bc" ]
+]
+```