diff options
| author | yctct <yctct@yctct.com> | 2025-11-12 14:35:56 +0100 |
|---|---|---|
| committer | yctct <yctct@yctct.com> | 2025-11-12 14:35:56 +0100 |
| commit | 319745e05e4fb518a30336a332ce89b1a45fe110 (patch) | |
| tree | 18c9697283442bac29442b7ede66357f11488b2b | |
Add files
| -rw-r--r-- | README.txt | 42 | ||||
| -rw-r--r-- | wcm.lua | 56 |
2 files changed, 98 insertions, 0 deletions
-- Commit diff header and README.txt hunk, preserved verbatim:
--[[ diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..8b671e1 --- /dev/null +++ b/README.txt @@ -0,0 +1,42 @@ +This is a script copied from Pandoc's repository. + +Description +---------- + +From Pandoc's documentation: + +> This filter counts the words in the body of a document (omitting metadata +> like titles and abstracts), including words in code. It should be more +> accurate than wc -w run directly on a Markdown document, since the latter +> will count markup characters, like the # in front of an ATX header, or tags +> in HTML documents, as words. + + +Setup +---- + + $ cd ~/bin/ # cd to ~/bin/ + $ wget https://git.yctct.com/repositories/wcm.lua # download the script + $ chmod +x wcm.lua # make the script executable + +Make sure ~/bin/ is on $PATH. + +Usage +----- + + $ pandoc --lua-filter=$HOME/bin/wcm.lua file.md + +Source +------ + +Pandoc's repository. + +Author +----- + +jgm + +License +------- + +MIT License @@ -0,0 +1,56 @@ ]]

-- counts words in a document

-- Running totals, updated by the handlers in `wordcount`. They are
-- intentionally global: code outside this table reads them by name.
words = 0
characters = 0
characters_and_spaces = 0
-- Switched on elsewhere; not modified by any handler in this table.
process_anyway = false

-- Adds the text of a code element (inline or block) to the totals.
--   el: element with a string field `text`
-- Every run of non-whitespace characters counts as one word, so markup-like
-- tokens inside code are counted too.
local function count_code(el)
  -- gsub's second result is the number of matches, i.e. the word count;
  -- `local` keeps these temporaries from leaking into the global table.
  local _, n = el.text:gsub("%S+", "")
  words = words + n
  -- parentheses keep only the stripped string, dropping the match count
  local text_nospace = (el.text:gsub("%s", ""))
  characters = characters + utf8.len(text_nospace)
  characters_and_spaces = characters_and_spaces + utf8.len(el.text)
end

-- Handler table keyed by element name; each handler only updates the
-- totals above and returns nothing.
wordcount = {
  Str = function(el)
    -- we don't count a word if it's entirely punctuation:
    -- (%P matches any character that is not punctuation)
    if el.text:match("%P") then
      words = words + 1
    end
    local len = utf8.len(el.text)
    characters = characters + len
    characters_and_spaces = characters_and_spaces + len
  end,

  -- A Space contributes one character to the with-spaces total only.
  Space = function(el)
    characters_and_spaces = characters_and_spaces + 1
  end,

  -- Inline code and code blocks are counted the same way.
  Code = count_code,
  CodeBlock = count_code
}

-- check if the `wordcount` variable is set to `process-anyway`
-- Metadata handler: sets the global `process_anyway` to true when the
-- `wordcount` metadata field is "process-anyway", "process" or "convert".
--   meta: metadata table; only the `wordcount` field is read
-- Returns nothing.
function Meta(meta)
  local value = meta.wordcount
  if not value then
    return
  end
  -- A value given as a plain string compares directly. Values defined in a
  -- document's YAML block are passed to Lua as structured inline content,
  -- which never equals a string, so flatten those to text first.
  local kind = type(value)
  if kind == "table" or kind == "userdata" then
    value = pandoc.utils.stringify(value)
  end
  if value == "process-anyway" or value == "process" or value == "convert" then
    process_anyway = true
  end
end

-- Document handler: walks the body with the `wordcount` handlers, prints the
-- three totals, and exits the process unless `process_anyway` is set.
--   el: document; only `el.blocks` is read
function Pandoc(el)
  -- skip metadata, just count body:
  -- (the blocks are wrapped in a Div so they can be walked as one block)
  pandoc.walk_block(pandoc.Div(el.blocks), wordcount)
  print(words .. " words in body")
  print(characters .. " characters in body")
  print(characters_and_spaces .. " characters in body (including spaces)")
  if not process_anyway then
    -- terminate here with a success status once the counts are printed
    os.exit(0)
  end
end
