aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2022-01-16 10:40:38 -0800
committerJohn MacFarlane <[email protected]>2022-01-16 10:41:04 -0800
commit1be49f11f72f166ce4f1266aa229b8b3d4da88d3 (patch)
tree94eeac2aea79518cbee6e2f98b244446c65e99b7
parent9e60142cc965a79aced702798966593950b000c0 (diff)
doc/custom-readers.md: add example for "readable HTML."
-rw-r--r--doc/custom-readers.md84
1 files changed, 84 insertions, 0 deletions
diff --git a/doc/custom-readers.md b/doc/custom-readers.md
index 37b6d6a3e..6d8333d14 100644
--- a/doc/custom-readers.md
+++ b/doc/custom-readers.md
@@ -682,3 +682,87 @@ function Reader (input, opts)
return pandoc.Pandoc(input:map(to_code_block))
end
```
+
+# Example: "readable HTML" reader
+
+This reader uses the command-line program `readable`
+(install via `npm install -g readability-cli`)
+to clean out parts of HTML input that have to do with
+navigation, leaving only the content.
+
+``` lua
+-- Custom reader for "readable HTML." This pipes HTML content
+-- through the 'readable' program (npm install -g readability-cli)
+-- and then calls the HTML reader. In addition, Divs that seem
+-- to have only a layout function are removed to avoid clutter.
+
+--- Pipe one input source through the external 'readable' program.
+-- @param source object with a `name` field (file path or URL, used as
+--   the base for 'readable') and a `text` field (the raw HTML)
+-- @return the text that 'readable' produced for the given HTML
+-- On any failure a diagnostic is written to stderr and the process
+-- exits with status 1.
+function make_readable(source)
+  local ok, result = pcall(function ()
+    local name = source.name
+    -- Only a real http(s) URL is passed through unchanged. An unanchored
+    -- search for "http" would also accept local file names that merely
+    -- contain that substring (e.g. "myhttpnotes.html").
+    if not name:lower():match("^https?://") then
+      name = "file:///" .. name
+    end
+    return pandoc.pipe("readable",
+                       {"--keep-classes", "--base", name},
+                       source.text)
+  end)
+  if not ok then
+    io.stderr:write("Error running 'readable': do you have it installed?\n")
+    io.stderr:write("npm install -g readability-cli\n")
+    -- Show the underlying error as well: the failure may have a cause
+    -- other than a missing executable.
+    io.stderr:write(tostring(result) .. "\n")
+    os.exit(1)
+  end
+  return result
+end
+
+-- Class names that are removed from Divs because they only describe
+-- page layout.
+local boring_classes = { row = true, page = true, container = true }
+
+-- Attribute keys that are removed from every Div.
+local boring_attributes = { "role" }
+
+-- Returns a truthy value if `cl` is listed in `boring_classes` or
+-- contains "col-" or "pull-" anywhere in its name; returns nil otherwise.
+local function is_boring_class(cl)
+  if boring_classes[cl] then
+    return true
+  end
+  return cl:match("col%-") or cl:match("pull%-")
+end
+
+--- Strip layout-only decoration from a Div.
+-- Removes classes accepted by `is_boring_class`, the attributes named in
+-- `boring_attributes`, and an identifier containing "readability-".
+-- @param el the Div element; it is modified in place
+-- @return `el.content` if the Div is left with no classes, attributes
+--   or identifier, otherwise `el` itself
+local function handle_div(el)
+  -- Walk backwards and use table.remove so the list stays a proper
+  -- sequence. Assigning nil to a slot would leave holes, and the length
+  -- operator is not reliable on a table with holes.
+  local classes = el.classes
+  for i = #classes, 1, -1 do
+    if is_boring_class(classes[i]) then
+      table.remove(classes, i)
+    end
+  end
+  for _, key in ipairs(boring_attributes) do
+    el.attributes[key] = nil
+  end
+  -- Plain-text search: in a Lua pattern a trailing "-" is a lazy
+  -- repetition operator, not a literal hyphen.
+  if el.identifier:find("readability-", 1, true) then
+    el.identifier = ""
+  end
+  if #el.classes == 0 and #el.attributes == 0 and #el.identifier == 0 then
+    return el.content
+  else
+    return el
+  end
+end
+
+-- Reader entry point. Passes every source through `make_readable`,
+-- joins the results, parses the joined text as HTML and then applies
+-- `handle_div` to each Div of the resulting document.
+function Reader(sources)
+  local cleaned = {}
+  for _, src in ipairs(sources) do
+    cleaned[#cleaned + 1] = make_readable(src)
+  end
+  local html = table.concat(cleaned)
+  local parsed = pandoc.read(html, "html", PANDOC_READER_OPTIONS)
+  -- Now remove Divs used only for layout
+  return parsed:walk({ Div = handle_div })
+end
+```
+
+Example of use:
+
+```
+pandoc -f readable.lua -t markdown https://pandoc.org
+```
+Then compare the output to that of the built-in HTML reader:
+```
+pandoc -f html -t markdown https://pandoc.org
+```
+