From 754fe9edbbcf329a0dc5c1b6190264cc24424178 Mon Sep 17 00:00:00 2001
From: Francois-Rene Rideau <fare@tunes.org>
Date: Sun, 29 Oct 2023 05:25:48 +0000
Subject: [PATCH] Tweak, test and document :std/text/char-set

---
 doc/.vuepress/config.js            |   1 +
 doc/reference/std/text/char-set.md | 197 +++++++++++++++++++++++++++++
 src/std/text/char-set-test.ss      |  29 +++++
 src/std/text/char-set.ss           |  11 +-
 4 files changed, 232 insertions(+), 6 deletions(-)
 create mode 100644 doc/reference/std/text/char-set.md
 create mode 100644 src/std/text/char-set-test.ss

diff --git a/doc/.vuepress/config.js b/doc/.vuepress/config.js
index f9ff0340b..c07f55274 100644
--- a/doc/.vuepress/config.js
+++ b/doc/.vuepress/config.js
@@ -137,6 +137,7 @@ module.exports = {
                         children: [
                             "text/",
                             "text/json",
+                            "text/char-set",
                             "text/csv",
                             "text/utf8",
                             "text/utf16",
diff --git a/doc/reference/std/text/char-set.md b/doc/reference/std/text/char-set.md
new file mode 100644
index 000000000..57f7ba62e
--- /dev/null
+++ b/doc/reference/std/text/char-set.md
@@ -0,0 +1,197 @@
+# Char Sets
+::: tip To use the bindings from this module:
+``` scheme
+(import :std/text/char-set)
+```
+:::
+
+
+## def-codepoint
+```scheme
+(def-codepoint (name x y ...) body ...)
+```
+This macro defines two functions, `codepoint-name` and and `char-name`
+(where the name is interpolated for the provided symbol).
+
+The first one takes a fixnum `x` as first argument
+and optionally more arguments `y ...`, and evaluates the `body ...`.
+Typically, it is a predicate returning a boolean, but could be anything.
+
+The second one takes any value `x` as first argument
+and optionally more arguments `y ...`;
+if `x` is a character, the previous function is called with its codepoint
+and the rest of the arguments, otherwise `#f` is returned.
+
+::: tip Examples:
+``` scheme
+> (def-codepoint (chess-piece? c) (<= 9812 c 9823))
+> (codepoint-chess-piece? 9817)
+#t
+> (codepoint-chess-piece? 9999)
+#f
+> (char-chess-piece? #\♞)
+#t
+> (char-chess-piece? #\A)
+#f
+```
+:::
+
+## codepoint-ascii? char-ascii?
+```scheme
+(codepoint-ascii? x) => bool
+(char-ascii? x) => bool
+```
+Returns true if the designated character is valid ASCII,
+with codepoint between 0 and 127 included.
+
+## codepoint-ascii-uppercase? char-ascii-uppercase?
+```scheme
+(codepoint-ascii-uppercase? x) => bool
+(char-ascii-uppercase? x) => bool
+```
+Returns true if the designated character is
+a valid ASCII uppercase letter from `#\a` to `#\z`,
+with codepoint between 65 and 90 included.
+
+## codepoint-ascii-lowercase? char-ascii-lowercase?
+```scheme
+(codepoint-ascii-lowercase? x) => bool
+(char-ascii-lowercase? x) => bool
+```
+Returns true if the designated character is
+a valid ASCII lowercase letter from `#\a` to `#\z`,
+with codepoint between 97 and 122 included.
+
+## codepoint-ascii-alphabetic? char-ascii-alphabetic?
+```scheme
+(codepoint-ascii-alphabetic? x) => bool
+(char-ascii-alphabetic? x) => bool
+```
+Returns true if the designated character is a valid ASCII letter,
+either uppercase from `#\A` to `#\Z` or lowercase from `#\a` to `#\z`.
+
+## codepoint-ascii-numeric? char-ascii-numeric?
+```scheme
+(codepoint-ascii-numeric? x) => bool
+(char-ascii-numeric? x) => bool
+```
+Returns true if the designated character is a valid ASCII digit
+between `#\0` and `#\9` included,
+with codepoint from 48 to 57 included.
+
+## codepoint-ascii-alphanumeric? char-ascii-alphanumeric?
+```scheme
+(codepoint-ascii-alphanumeric? x) => bool
+(char-ascii-alphanumeric? x) => bool
+```
+Returns true if the designated character is valid ASCII letter or digit.
+
+## codepoint-ascii-alphanumeric-or-underscore? char-ascii-alphanumeric-or-underscore?
+```scheme
+(codepoint-ascii-alphanumeric-or-underscore? x) => bool
+(char-ascii-alphanumeric-or-underscore? x) => bool
+```
+Returns true if the designated character is a valid ASCII letter or digit
+or the underscore character `#\_` (codepoint 95).
+
+## codepoint-ascii-printable? char-ascii-printable?
+```scheme
+(codepoint-ascii-printable? x) => bool
+(char-ascii-printable? x) => bool
+```
+
+Returns true if the designated character is a valid
+[ASCII](https://en.wikipedia.org/wiki/ASCII) graphic character,
+codepoint from 32 to 126 included.
+Note that codepoint 32 is actually the `#\space` that prints to a blank space,
+but that other whitespace characters are not included.
+Codepoint 127 is actually `#\delete` which isn’t printable.
+
+## codepoint-strict-whitespace? char-strict-whitespace?
+```scheme
+(codepoint-strict-whitespace? x) => bool
+(char-strict-whitespace? x) => bool
+```
+These functions are the first of several predicates that recognize *whitespace*.
+There is no consensus as to what is a
+[whitespace character](https://en.wikipedia.org/wiki/Whitespace_character)
+for either ASCII or Unicode,
+and these follow the strictest definition, as specified by HTML and JSON:
+whitespace characters are codepoints
+32 (`#\space`), 9 (`#\tab`), 10 (`#\newline`), 11 (`#\return`).
+The latest Scheme standard R7RS also specifies that this is
+the set of whitespace accepted by all Scheme implementations,
+though implementations may allow additional whitespace “such as page-break”.
+
+## codepoint-ascii-whitespace? char-ascii-whitespace?
+```scheme
+(codepoint-ascii-whitespace? x) => bool
+(char-ascii-whitespace? x) => bool
+```
+These predicates recognize ASCII whitespace characters
+as defined by C, C++ and Python.
+In addition to the four strict whitespace characters, they also accept
+codepoints 12 (`#\vtab`, vertical tab, C `'\v'`) and
+13 (`#\page`, page break, form feed, C `'\f'`).
+
+## codepoint-scheme-whitespace? char-scheme-whitespace?
+```scheme
+(codepoint-scheme-whitespace? x) => bool
+(char-scheme-whitespace? x) => bool
+```
+These predicates recognize the same whitespace characters
+as the underlying Scheme implementation.
+For Gambit and thus Gerbil (so far), it is the union of the ASCII whitespace above
+plus Unicode Space Separators
+(codepoints `#x20` `#xA0` `#x1680` `#x2000`-`#x200a` `#x202f` `#x205f` `#x3000`)
+plus Unicode Line Separators (codepoints `#x0A` `#x0D` `#x85` `#x2028` `#x2029`).
+
+Note that JavaScript accepts the ASCII whitespace, the Unicode Space Separators,
+`#xFEFF` (ZWNBSP), but doesn't consider the line separators whitespace;
+rather it considers `#x0A` `#x0D` `#x2028` `#x2029` as line terminators
+but not `#x85` (Next Line).
+
+Meanwhile Rust recognizes the ASCII whitespace plus `#x85` `#x200E` `#x200F` `#x2028` `#x2029`.
+
+Whichever language or grammar you parse, be sure to look at its latest specification
+to identify its specific definition of “whitespace”.
+
+## codepoint-ascii-printable-or-whitespace? char-ascii-printable-or-whitespace?
+```scheme
+(codepoint-ascii-printable-or-whitespace? x) => bool
+(char-ascii-printable-or-whitespace? x) => bool
+```
+These predicates recognize ASCII characters that are either printable or whitespace
+(the C definition, which also equals
+the intersection of the underlying Scheme definition and ASCII).
+
+## codepoint-ascii-digit char-ascii-digit
+```scheme
+(codepoint-ascii-digit x [base 10]) => number-or-false
+(char-ascii-digit x [base 10]) => number-or-false
+```
+Given a character `x` and a `base` from 2 to 36 (defaults to 10),
+if that character represents a digit in that base
+(with letters being the digits from 10 to 35),
+return the numerical value of the digit.
+Otherwise return `#f`.
+
+## digit-char
+```scheme
+(digit-char n [base 10] [upper-case? #f]) => char-or-false
+```
+Given a number `n` and a `base` from 2 to 36 (defaults to 10),
+if the number is an `exact-integer` between 0 (included) and `base` (excluded),
+then return an ASCII character that represents that digit in the given `base`.
+If the digit value is 10 to 35, then use a lowercase letter if `upper-case?` is false,
+an uppercase letter if `upper-case?` is true.
+If the argument `n` is not a valid digit for that `base`, return `#f`.
+
+## char-eol?
+```scheme
+(char-eol? x) => bool
+```
+Is `x`, a result from calling `read-char` or `peek-char`
+from a `Port` or `Reader`, a line terminator?
+This is the case if `x` is one of the characters `#\newline` or `#\return`,
+or the special object `#!eof`.
diff --git a/src/std/text/char-set-test.ss b/src/std/text/char-set-test.ss
new file mode 100644
index 000000000..4f74f9287
--- /dev/null
+++ b/src/std/text/char-set-test.ss
@@ -0,0 +1,29 @@
+(export char-set-test)
+(import
+  :std/iter
+  :std/sugar
+  :std/test
+  ./char-set)
+
+(def char-set-test
+  (test-suite "test :std/text/char-set"
+    (test-case "codepoint-scheme-whitespace?, char-scheme-whitespace?"
+      (for (i (in-range #x4000)) ;; we could go to (1+ max-char-code), but why bother?
+        (when-let ((c (with-catch false (cut integer->char i))))
+          (let (w? (char-whitespace? c))
+            (check (and (member i [#x09 #x0A #x0B #x0C #x0D #x20 #xA0
+                                   #x1680 (iota 11 #x2000)... #x202f #x205f #x3000
+                                   #x85 #x2028 #x2029]) #t) => w?)
+            (check (char-scheme-whitespace? c) => w?)
+            (check (codepoint-scheme-whitespace? i) => w?)))))
+    (test-case "char-digit, digit-char"
+      (defrule (checks (n c base ...) ...)
+        (begin
+          (begin (when c (check (char-ascii-digit c base ...) => n))
+                 (check (digit-char n base ...) => c)
+                 (check (digit-char n (or base ... 10) #t) => (and c (char-upcase c)))) ...))
+      (checks (0 #\0) (0 #\0 2) (0 #\0 35)
+              (9 #\9) (9 #f 3) (9 #\9 24)
+              (10 #f) (10 #\a 11) (10 #\a 33)
+              (15 #f) (15 #\f 16) (15 #\f 35)
+              (35 #f) (35 #f 35) (35 #\z 36)))))
diff --git a/src/std/text/char-set.ss b/src/std/text/char-set.ss
index 6e9ec25b3..b22026359 100644
--- a/src/std/text/char-set.ss
+++ b/src/std/text/char-set.ss
@@ -47,9 +47,8 @@
   (or (codepoint-ascii-alphanumeric? c) (= c 95)))
 
 ;; : Codepoint -> Bool
-(def-codepoint (ascii-graphic? c) ;; any ascii "graphic" character
-  (<= 32 c 127))
-
+(def-codepoint (ascii-printable? c) ;; any ascii printable character
+  (<= 32 c 126)) ;; see https://en.wikipedia.org/wiki/ASCII
 
 ;;; There is no consensus on what a Unicode "whitespace" is.
 ;; See https://en.wikipedia.org/wiki/Whitespace_character
@@ -91,11 +90,11 @@
 ;; to identify its specific definition of "whitespace".
 
 ;; : Codepoint -> Bool
-(def-codepoint (ascii-printable? c) ;; Should we really include 127 though?
-  (or (codepoint-ascii-graphic? c) (codepoint-ascii-whitespace? c)))
+(def-codepoint (ascii-printable-or-whitespace? c)
+  (or (codepoint-ascii-printable? c) (codepoint-ascii-whitespace? c)))
 
 ;; Assume ASCII, base 2 to 36
-;; : Codepoint -> (OrFalse (IntegerRange min: 0 max: 35))
+;; : Codepoint ?(IntegerRange min: 2 max: 36) -> (OrFalse (IntegerRange min: 0 max: 35))
 (def (codepoint-ascii-digit c (base 10))
   (let (found (lambda (d) (and (< d base) d)))
     (cond