summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.txt42
-rw-r--r--wcm.lua56
2 files changed, 98 insertions, 0 deletions
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..8b671e1
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,42 @@
+This is a script copied from Pandoc's repository.
+
+Description
+-----------
+
+From Pandoc's documentation:
+
+> This filter counts the words in the body of a document (omitting metadata
+> like titles and abstracts), including words in code. It should be more
+> accurate than wc -w run directly on a Markdown document, since the latter
+> will count markup characters, like the # in front of an ATX header, or tags
+> in HTML documents, as words.
+
+
+Setup
+-----
+
+ $ cd ~/bin/ # cd to ~/bin/
+ $ wget https://git.yctct.com/repositories/wcm.lua # download the script
+ $ chmod +x wcm.lua # make the script executable
+
+Make sure ~/bin/ is on $PATH.
+
+Usage
+-----
+
+ $ pandoc --lua-filter=$HOME/bin/wcm.lua file.md
+
+Source
+------
+
+Pandoc's repository.
+
+Author
+------
+
+jgm
+
+License
+-------
+
+MIT License
diff --git a/wcm.lua b/wcm.lua
new file mode 100644
index 0000000..1406f88
--- /dev/null
+++ b/wcm.lua
@@ -0,0 +1,56 @@
+-- counts words in a document
+
+--- Running totals, updated while the document body is walked.
+-- They stay global because the `Pandoc` function further down reads them.
+words = 0
+characters = 0
+characters_and_spaces = 0
+--- When true, printing the report does not end the run early.
+process_anyway = false
+
+--- Add the contents of a code element to the running totals.
+-- Every run of non-whitespace counts as one word, whitespace is left out of
+-- `characters`, and the full text is included in `characters_and_spaces`.
+-- @param el element with a `text` field (used for `Code` and `CodeBlock`)
+local function count_code(el)
+  -- gsub returns (string, count); only the count is wanted here
+  local _, word_count = el.text:gsub("%S+", "")
+  -- assigning to a single local drops gsub's second result, so it cannot
+  -- reach utf8.len as a start position
+  local text_nospace = el.text:gsub("%s", "")
+  words = words + word_count
+  characters = characters + utf8.len(text_nospace)
+  characters_and_spaces = characters_and_spaces + utf8.len(el.text)
+end
+
+--- Element handlers used to count the document body.
+wordcount = {
+  Str = function(el)
+    -- we don't count a word if it's entirely punctuation:
+    if el.text:match("%P") then
+      words = words + 1
+    end
+    local len = utf8.len(el.text)
+    characters = characters + len
+    characters_and_spaces = characters_and_spaces + len
+  end,
+
+  Space = function(el)
+    characters_and_spaces = characters_and_spaces + 1
+  end,
+
+  -- inline code and code blocks are counted the same way
+  Code = count_code,
+  CodeBlock = count_code
+}
+
+--- Read the `wordcount` metadata field and decide whether the run continues
+-- after the counts are printed.
+-- The values `process-anyway`, `process` and `convert` set `process_anyway`.
+-- @param meta document metadata
+function Meta(meta)
+  local setting = meta.wordcount
+  -- A value given with `-M` arrives as a plain string, but one written in the
+  -- document's own metadata block is a structured value that never compares
+  -- equal to a string, so reduce it to text first.
+  if type(setting) == "table" or type(setting) == "userdata" then
+    setting = pandoc.utils.stringify(setting)
+  end
+  if setting == "process-anyway" or setting == "process"
+     or setting == "convert" then
+    process_anyway = true
+  end
+end
+
+--- Walk the document body with the `wordcount` handlers, print the three
+-- totals, and exit with status 0 unless `process_anyway` is set.
+-- @param el the whole document; only `el.blocks` is visited
+function Pandoc(el)
+  -- wrapping the blocks in a Div leaves the metadata out of the walk
+  local body = pandoc.Div(el.blocks)
+  pandoc.walk_block(body, wordcount)
+
+  local report = {
+    words .. " words in body",
+    characters .. " characters in body",
+    characters_and_spaces .. " characters in body (including spaces)",
+  }
+  for _, line in ipairs(report) do
+    print(line)
+  end
+
+  if process_anyway then
+    return
+  end
+  os.exit(0)
+end