aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2022-01-30 23:51:47 -0800
committerJohn MacFarlane <[email protected]>2022-02-04 10:03:52 -0800
commit34897031f4df5980cd529f82bc7827d4cb468dd0 (patch)
tree5117821bdd84147776902b1ac6eb901f7cf6cdb7
parente07c0e74ceaadc3974d94801b4c69e8754afc85b (diff)
Add endnote XML reader.
New input format: endnotexml New reader module: Text.Pandoc.Readers.EndNote, exporting `readEndNoteXML` and `readEndNoteXMLReferences`. [API change] This reader is still a bit rudimentary, but it should get be good enough to be helpful.
-rw-r--r--MANUAL.txt2
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Citeproc/BibTeX.hs3
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/EndNote.hs201
5 files changed, 209 insertions, 1 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index f56702899..db8e97840 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -235,6 +235,7 @@ header when requesting a document from a URL:
- `docbook` ([DocBook])
- `docx` ([Word docx])
- `dokuwiki` ([DokuWiki markup])
+ - `endnotexml` ([EndNote XML bibliography])
- `epub` ([EPUB])
- `fb2` ([FictionBook2] e-book)
- `gfm` ([GitHub-Flavored Markdown]),
@@ -504,6 +505,7 @@ header when requesting a document from a URL:
[BibTeX]: https://ctan.org/pkg/bibtex
[BibLaTeX]: https://ctan.org/pkg/biblatex
[Markua]: https://leanpub.com/markua/read
+[EndNote XML bibliography]: https://support.clarivate.com/Endnote/s/article/EndNote-XML-Document-Type-Definition
## Reader options {.options}
diff --git a/pandoc.cabal b/pandoc.cabal
index b4c3bdbe4..4ce391dde 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -545,6 +545,7 @@ library
Text.Pandoc.Readers.CommonMark,
Text.Pandoc.Readers.Creole,
Text.Pandoc.Readers.BibTeX,
+ Text.Pandoc.Readers.EndNote,
Text.Pandoc.Readers.CslJson,
Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.Vimwiki,
diff --git a/src/Text/Pandoc/Citeproc/BibTeX.hs b/src/Text/Pandoc/Citeproc/BibTeX.hs
index a8e5622ed..9d6b0b47e 100644
--- a/src/Text/Pandoc/Citeproc/BibTeX.hs
+++ b/src/Text/Pandoc/Citeproc/BibTeX.hs
@@ -19,6 +19,7 @@ module Text.Pandoc.Citeproc.BibTeX
( Variant(..)
, readBibtexString
, writeBibtexString
+ , toName
)
where
@@ -1173,7 +1174,7 @@ emptyName =
, nameStaticOrdering = False
}
-toName :: Options -> [Inline] -> Bib Name
+toName :: MonadPlus m => Options -> [Inline] -> m Name
toName _ [Str "others"] =
return emptyName{ nameLiteral = Just "others" }
toName _ [Span ("",[],[]) ils] = -- corporate author
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 3e094da60..19b22b041 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -55,6 +55,7 @@ module Text.Pandoc.Readers
, readCslJson
, readBibTeX
, readBibLaTeX
+ , readEndNoteXML
, readRTF
-- * Miscellaneous
, getReader
@@ -103,6 +104,7 @@ import Text.Pandoc.Readers.Man
import Text.Pandoc.Readers.CSV
import Text.Pandoc.Readers.CslJson
import Text.Pandoc.Readers.BibTeX
+import Text.Pandoc.Readers.EndNote
import Text.Pandoc.Readers.RTF
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Sources (ToSources(..), sourcesToText)
@@ -151,6 +153,7 @@ readers = [("native" , TextReader readNative)
,("csljson" , TextReader readCslJson)
,("bibtex" , TextReader readBibTeX)
,("biblatex" , TextReader readBibLaTeX)
+ ,("endnotexml" , TextReader readEndNoteXML)
,("rtf" , TextReader readRTF)
]
diff --git a/src/Text/Pandoc/Readers/EndNote.hs b/src/Text/Pandoc/Readers/EndNote.hs
new file mode 100644
index 000000000..9fe1496be
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EndNote.hs
@@ -0,0 +1,201 @@
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.EndNote
+ Copyright : Copyright (C) 2022 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Parses EndNote XML bibliographies into a Pandoc document
+with empty body and `references` and `nocite` fields
+in the metadata. A wildcard `nocite` is used so that
+if the document is rendered in another format, the
+entire bibliography will be printed.
+-}
+module Text.Pandoc.Readers.EndNote
+ ( readEndNoteXML
+ , readEndNoteXMLReferences
+ )
+where
+
+import Text.Pandoc.Options
+import Text.Pandoc.Definition
+import Citeproc (Reference(..), ItemId(..), Val(..), Date(..), DateParts(..))
+import Text.Pandoc.Builder as B
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue)
+import Text.Pandoc.Sources (Sources(..), ToSources(..), sourcesToText)
+import Text.Pandoc.Citeproc.BibTeX (toName)
+import Control.Applicative ((<|>))
+import Control.Monad.Except (throwError)
+import Control.Monad (mzero)
+import Text.Pandoc.XML.Light
+ ( filterElementName,
+ strContent,
+ QName(qName),
+ Element(..),
+ Content(..),
+ CData(..),
+ filterElementsName,
+ filterChildrenName,
+ findAttrBy,
+ parseXMLElement )
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text as T
+import Data.Text (Text)
+import qualified Data.Map as M
+import Safe (readMay)
+
+-- | Read EndNote XML from an input string and return a Pandoc document.
+-- The document will have only metadata, with an empty body.
+-- The metadata will contain a `references` field with the
+-- bibliography entries, and a `nocite` field with the wildcard `[@*]`.
+readEndNoteXML :: (PandocMonad m, ToSources a)
+ => ReaderOptions -> a -> m Pandoc
+readEndNoteXML _opts inp = do
+ let sources = toSources inp
+ refs <- readEndNoteXMLReferences sources
+ return $ setMeta "references" (map referenceToMetaValue refs) $ B.doc mempty
+
+readEndNoteXMLReferences :: PandocMonad m
+ => Sources -> m [Reference Inlines]
+readEndNoteXMLReferences sources = do
+ tree <- either (throwError . PandocXMLError "") return $
+ parseXMLElement (TL.fromStrict . sourcesToText $ sources)
+ let records = filterElementsName ((== "record") . qName) tree
+ return $ map recordToReference records
+
+
+recordToReference :: Element -> Reference Inlines
+recordToReference e =
+ Reference{ referenceId = ItemId refid,
+ referenceType = reftype,
+ referenceDisambiguation = Nothing,
+ referenceVariables = refvars }
+
+ where
+ -- get strContent, recursing inside style elements:
+ getText el = getText' (Elem el)
+ getText' (Elem el) = mconcat $ map getText' $ elContent el
+ getText' (Text cd) = cdData cd
+ getText' (CRef _) = mempty
+ -- mconcat . map cdData . onlyText . elContent
+ name t = (== t) . qName
+ refid = maybe mempty (T.strip . strContent)
+ (filterElementName (name "key") e
+ <|> filterElementName (name "rec-number") e)
+ reftype = maybe "document" toCslReferenceType
+ (filterElementName (name "ref-type") e >>=
+ findAttrBy (name "name"))
+ authors =
+ filterChildrenName (name "contributors") e >>=
+ filterChildrenName (name "authors") >>=
+ filterChildrenName (name "author") >>=
+ toName [] . B.toList . B.text . T.strip . getText
+ titles = do
+ x <- filterChildrenName (name "titles") e
+ (key, name') <- [("title", "title"),
+ ("container-title", "secondary-title")]
+ (key,) . FancyVal . B.text . T.strip . getText <$>
+ filterChildrenName (name name') x
+ pages = ("pages",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "pages") e
+ volume = ("volume",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "volume") e
+ number = ("number",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "number") e
+ isbn = ("isbn",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "isbn") e
+ publisher = ("publisher",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "publisher") e
+ originalPublisher =
+ ("original-publisher",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "orig-pub") e
+ publisherPlace =
+ ("publisher-place",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "pub-location") e
+ abstract = ("abstract",) . FancyVal . B.text. T.strip . getText <$>
+ filterChildrenName (name "abstract") e
+ dates = ("issued",) . toDate <$> filterChildrenName (name "dates") e
+ toDate e' = DateVal $
+ Date { dateParts = toDateParts e'
+ , dateCirca = False
+ , dateSeason = Nothing
+ , dateLiteral = Nothing }
+ toDateParts e' = do
+ x <- filterChildrenName (name "year") e'
+ case readMay . T.unpack . T.strip . getText $ x of
+ Nothing -> mzero
+ Just y -> return $ DateParts [y]
+
+ refvars = M.fromList $
+ [ ("author", NamesVal authors) | not (null authors) ] ++
+ titles ++
+ pages ++
+ volume ++
+ number ++
+ isbn ++
+ dates ++
+ publisher ++
+ originalPublisher ++
+ publisherPlace ++
+ abstract
+
+toCslReferenceType :: Text -> Text
+toCslReferenceType t =
+ case t of
+ "Aggregated Database" -> "dataset"
+ "Ancient Text" -> "classic"
+ "Artwork" -> "graphic"
+ "Audiovisual Material" -> "graphic"
+ "Bill" -> "legislation"
+ "Blog" -> "post-weblog"
+ "Book" -> "book"
+ "Book Section" -> "chapter"
+ "Case" -> "legal_case"
+ "Catalog" -> "document"
+ "Chart or Table" -> "graphic"
+ "Classical Work" -> "classic"
+ "Computer program" -> "software"
+ "Conference Paper" -> "article"
+ "Conference Proceedings" -> "periodical"
+ "Dataset" -> "dataset"
+ "Dictionary" -> "book"
+ "Edited Book" -> "book"
+ "Electronic Article" -> "article"
+ "Electronic Book" -> "book"
+ "Electronic Book Section" -> "chapter"
+ "Encyclopedia" -> "book"
+ "Equation" -> "document"
+ "Figure" -> "graphic"
+ "Film or Broadcast" -> "motion_picture"
+ "Government Document" -> "document"
+ "Grant" -> "document"
+ "Hearing" -> "hearing"
+ "Interview" -> "interview"
+ "Journal Article" -> "article-journal"
+ "Legal Rule or Regulation" -> "regulation"
+ "Magazine Article" -> "article-magazine"
+ "Manuscript" -> "manuscript"
+ "Map" -> "map"
+ "Music" -> "musical_score"
+ "Newspaper Article" -> "article-newspaper"
+ "Online Database" -> "dataset"
+ "Online Multimedia" -> "webpage"
+ "Pamphlet" -> "pamphlet"
+ "Patent" -> "patent"
+ "Personal Communication" -> "personal_communication"
+ "Podcast" -> "document"
+ "Press Release" -> "report"
+ "Report" -> "report"
+ "Serial" -> "periodical"
+ "Standard" -> "standard"
+ "Statute" -> "legislation"
+ "Thesis" -> "thesis"
+ "Unpublished Work" -> "unpublished"
+ "Web Page" -> "webpage"
+ _ -> "document"