diff options
| author | John MacFarlane <[email protected]> | 2024-03-10 11:14:41 -0700 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2024-03-10 11:14:41 -0700 |
| commit | 812f82ab098a6896b6752fdfe6dc12cbf8502271 (patch) | |
| tree | ed1fd351aef33831ed594d6e04265b014670b07e | |
| parent | 626ffd74b4cb0038eb7d4ee638486f156753da1c (diff) | |
LaTeX reader: improve tokenization of `@`.
Make tokenization sensitive to `\makeatletter`/`\makeatother`.
Previously we just always treated `@` as a letter. This led
to bad results, e.g. with the sequence `\@`. E.g.,
`a\@ b` would parse as "ab" and `a\@b` as "a".
Closes #9555.
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Parsing.hs | 67 | ||||
| -rw-r--r-- | test/command/9555.md | 12 |
2 files changed, 51 insertions, 28 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index 388c6fa09..f60a5dd35 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -354,41 +354,41 @@ getInputTokens = do Sources ((_,t):rest) -> tokenizeSources $ Sources ((pos,t):rest) tokenize :: SourcePos -> Text -> [Tok] -tokenize = totoks +tokenize = totoks False where - totoks pos t = + totoks atIsLetter pos t = case T.uncons t of Nothing -> [] Just (c, rest) | c == '\n' -> Tok pos Newline "\n" - : totoks (setSourceColumn (incSourceLine pos 1) 1) rest + : totoks atIsLetter (setSourceColumn (incSourceLine pos 1) 1) rest | isSpaceOrTab c -> let (sps, rest') = T.span isSpaceOrTab t in Tok pos Spaces sps - : totoks (incSourceColumn pos (T.length sps)) + : totoks atIsLetter (incSourceColumn pos (T.length sps)) rest' | isAlphaNum c -> let (ws, rest') = T.span isAlphaNum t in Tok pos Word ws - : totoks (incSourceColumn pos (T.length ws)) rest' + : totoks atIsLetter (incSourceColumn pos (T.length ws)) rest' | c == '%' -> let (cs, rest') = T.break (== '\n') rest in Tok pos Comment ("%" <> cs) - : totoks (incSourceColumn pos (1 + T.length cs)) rest' + : totoks atIsLetter (incSourceColumn pos (1 + T.length cs)) rest' | c == '\\' -> case T.uncons rest of Nothing -> [Tok pos (CtrlSeq " ") "\\"] Just (d, rest') - | isLetterOrAt d -> - -- \makeatletter is common in macro defs; - -- ideally we should make tokenization sensitive - -- to \makeatletter and \makeatother, but this is - -- probably best for now - let (ws, rest'') = T.span isLetterOrAt rest + | isLetter' atIsLetter d -> + let (ws, rest'') = T.span (isLetter' atIsLetter) rest (ss, rest''') = T.span isSpaceOrTab rest'' + atIsLetter' = case ws of + "makeatletter" -> True + "makeatother" -> False + _ -> atIsLetter in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss) - : totoks (incSourceColumn pos + : totoks atIsLetter' (incSourceColumn pos (1 + T.length ws + T.length ss)) rest''' | isSpaceOrTab d || d == '\n' -> let (w1, r1) = T.span isSpaceOrTab rest @@ -401,15 +401,15 @@ tokenize = totoks in case T.uncons r3 of Just ('\n', _) -> Tok pos (CtrlSeq " ") ("\\" <> w1) - : totoks (incSourceColumn pos (T.length ws)) - r1 + : totoks atIsLetter + (incSourceColumn pos (T.length ws)) r1 _ -> Tok pos (CtrlSeq " ") ws - : totoks (incSourceColumn pos (T.length ws)) - r3 + : totoks atIsLetter + (incSourceColumn pos (T.length ws)) r3 | otherwise -> Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d]) - : totoks (incSourceColumn pos 2) rest' + : totoks atIsLetter (incSourceColumn pos 2) rest' | c == '#' -> case T.uncons rest of Just ('#', t3) -> @@ -417,18 +417,20 @@ tokenize = totoks in case safeRead t1 of Just i -> Tok pos (DeferredArg i) ("##" <> t1) - : totoks (incSourceColumn pos (2 + T.length t1)) t2 + : totoks atIsLetter + (incSourceColumn pos (2 + T.length t1)) t2 Nothing -> Tok pos Symbol "#" : Tok (incSourceColumn pos 1) Symbol "#" - : totoks (incSourceColumn pos 1) t3 + : totoks atIsLetter (incSourceColumn pos 1) t3 _ -> let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest in case safeRead t1 of Just i -> Tok pos (Arg i) ("#" <> t1) - : totoks (incSourceColumn pos (1 + T.length t1)) t2 + : totoks atIsLetter + (incSourceColumn pos (1 + T.length t1)) t2 Nothing -> Tok pos Symbol "#" - : totoks (incSourceColumn pos 1) rest + : totoks atIsLetter (incSourceColumn pos 1) rest | c == '^' -> case T.uncons rest of Just ('^', rest') -> @@ -438,26 +440,35 @@ tokenize = totoks case T.uncons rest'' of Just (e, rest''') | isLowerHex e -> Tok pos Esc2 (T.pack ['^','^',d,e]) - : totoks (incSourceColumn pos 4) rest''' + : totoks atIsLetter + (incSourceColumn pos 4) rest''' _ -> Tok pos Esc1 (T.pack ['^','^',d]) - : totoks (incSourceColumn pos 3) rest'' + : totoks atIsLetter + (incSourceColumn pos 3) rest'' | d < '\128' -> Tok pos Esc1 (T.pack ['^','^',d]) - : totoks (incSourceColumn pos 3) rest'' + : totoks atIsLetter + (incSourceColumn pos 3) rest'' _ -> Tok pos Symbol "^" : Tok (incSourceColumn pos 1) Symbol "^" : - totoks (incSourceColumn pos 2) rest' + totoks atIsLetter (incSourceColumn pos 2) rest' _ -> Tok pos Symbol "^" - : totoks (incSourceColumn pos 1) rest + : totoks atIsLetter (incSourceColumn pos 1) rest | otherwise -> - Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest + Tok pos Symbol (T.singleton c) : + totoks atIsLetter (incSourceColumn pos 1) rest isSpaceOrTab :: Char -> Bool isSpaceOrTab ' ' = True isSpaceOrTab '\t' = True isSpaceOrTab _ = False +-- First parameter is True if @ is letter +isLetter' :: Bool -> Char -> Bool +isLetter' True '@' = True +isLetter' _ c = isLetter c + isLetterOrAt :: Char -> Bool isLetterOrAt '@' = True isLetterOrAt c = isLetter c diff --git a/test/command/9555.md b/test/command/9555.md new file mode 100644 index 000000000..0631f71af --- /dev/null +++ b/test/command/9555.md @@ -0,0 +1,12 @@ +``` +% pandoc -t native -f latex +a\@ b\@c + +\makeatletter +a\@ b\@c +\makeatother +a\@ b\@c +[ Para [ Str "a" , Space , Str "bc" ] +, Para [ Str "ab" , SoftBreak , Str "a" , Space , Str "bc" ] +] +``` |
