Add files

author: yctct <yctct@yctct.com> 2025-11-12 14:35:56 +0100
committer: yctct <yctct@yctct.com> 2025-11-12 14:35:56 +0100
commit: 319745e05e4fb518a30336a332ce89b1a45fe110 (patch)
tree: 18c9697283442bac29442b7ede66357f11488b2b
2 files changed, 98 insertions, 0 deletions
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..8b671e1
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,42 @@
+wcm.lua is a pandoc Lua filter that counts words and characters; it is a script copied from Pandoc's repository.
+
+Description
+----------
+
+From Pandoc's documentation:
+
+> This filter counts the words in the body of a document (omitting metadata
+> like titles and abstracts), including words in code. It should be more
+> accurate than wc -w run directly on a Markdown document, since the latter
+> will count markup characters, like the # in front of an ATX header, or tags
+> in HTML documents, as words.  
+
+
+Setup
+----
+
+	$ cd ~/bin/						# cd to ~/bin/
+	$ wget https://git.yctct.com/repositories/wcm.lua 	# download the script
+	$ chmod +x wcm.lua 					# make the script executable
+
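+Having ~/bin/ on $PATH is optional: pandoc does not search $PATH for Lua filters, so pass the script's full path as shown under Usage.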
+
+Usage
+-----
+
+	$ pandoc --lua-filter=$HOME/bin/wcm.lua file.md
+
+Source
+------
+
+Pandoc's lua-filters repository: https://github.com/pandoc/lua-filters (wordcount/wordcount.lua).
+
+Author
+-----
+
+jgm
+
+License
+-------
+
+MIT License
diff --git a/wcm.lua b/wcm.lua
new file mode 100644
index 0000000..1406f88
--- /dev/null
+++ b/wcm.lua
@@ -0,0 +1,56 @@
+-- counts words in a document
+
+-- Running totals, updated by the `wordcount` handlers while the body is walked.
+words, characters, characters_and_spaces = 0, 0, 0
+-- When true, `Pandoc` returns after printing instead of calling os.exit.
+process_anyway = false
+
+-- Counts a code element (inline or block): every run of non-whitespace is a
+-- word; `characters` leaves whitespace out, `characters_and_spaces` keeps it.
+local function count_code(el)
+  -- locals keep gsub's scratch results out of the global environment
+  local _, n = el.text:gsub("%S+", "")
+  local text_nospace = el.text:gsub("%s", "")
+  words = words + n
+  characters = characters + utf8.len(text_nospace)
+  characters_and_spaces = characters_and_spaces + utf8.len(el.text)
+end
+
+-- Element filter handed to the walk in `Pandoc`; updates the global counters.
+wordcount = {
+  Str = function(el)
+    -- we don't count a word if it's entirely punctuation:
+    if el.text:match("%P") then
+      words = words + 1
+    end
+    local len = utf8.len(el.text)
+    characters = characters + len
+    characters_and_spaces = characters_and_spaces + len
+  end,
+
+  Space = function(el)
+    characters_and_spaces = characters_and_spaces + 1
+  end,
+
+  Code = count_code,
+  CodeBlock = count_code
+}
+
+-- Meta filter: sets `process_anyway` when the `wordcount` metadata field is
+-- `process-anyway`, `process` or `convert`. The value is stringified first so
+-- that a field set in a YAML block (not a plain Lua string) is matched too.
+function Meta(meta)
+  local mode = meta.wordcount and pandoc.utils.stringify(meta.wordcount)
+  if mode == "process-anyway" or mode == "process" or mode == "convert" then process_anyway = true end
+end
+
+function Pandoc(el)
+    -- Walk only the body (wrapped in a Div), so metadata is not counted.
+    local body = pandoc.Div(el.blocks)
+    pandoc.walk_block(body, wordcount)
+    print(words .. " words in body")
+    print(characters .. " characters in body")
+    print(characters_and_spaces .. " characters in body (including spaces)")
+    if process_anyway then return end -- caller asked to keep processing
+    os.exit(0) -- counts are printed; end the process with status 0
+end
author	yctct <yctct@yctct.com>	2025-11-12 14:35:56 +0100
committer	yctct <yctct@yctct.com>	2025-11-12 14:35:56 +0100
commit	319745e05e4fb518a30336a332ce89b1a45fe110 (patch)
tree	18c9697283442bac29442b7ede66357f11488b2b