From 70cdeae6b6b31c37039767d3dc9ce40d90fc4b81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nikola=20Stankovi=C4=87?=
 <3808545+nik-sta@users.noreply.github.com>
Date: Fri, 1 Sep 2023 16:42:55 +0200
Subject: [PATCH] feat: upgrade to v2.0.0 with enhanced nanoid functions and
 optimizations

---
 README.md                    | 135 ++++++++++++++++++++++++++++++-----
 dev/benchmark/a_nanoid.sql   |   1 +
 dev/benchmark/b_nanoid.sql   |   1 +
 dev/benchmark/benchmarks.sql |  60 ++++++++++++++++
 dev/test/unit_tests.sql      |  73 +++++++++++++++++++
 nanoid.sql                   | 103 +++++++++++++++++++-------
 6 files changed, 327 insertions(+), 46 deletions(-)
 create mode 100644 dev/benchmark/a_nanoid.sql
 create mode 100644 dev/benchmark/b_nanoid.sql
 create mode 100644 dev/benchmark/benchmarks.sql
 create mode 100644 dev/test/unit_tests.sql

diff --git a/README.md b/README.md
index 92bd3b0..fce6c5a 100644
--- a/README.md
+++ b/README.md
@@ -11,68 +11,165 @@ A tiny, secure, URL-friendly, unique string ID generator for Postgres.
 * **Small.** Just a simple Postgres function.
 * **Safe.** It uses pgcrypto random generator. Can be used in clusters.
 * **Short IDs.** It uses a larger alphabet than UUID (`A-Za-z0-9_-`). So ID size was reduced from 36 to 21 symbols.
-* **Portable**. Nano ID was ported to [over 20 programming languages](https://github.com/ai/nanoid/blob/main/README.md#other-programming-languages).
+* **Portable**. Nano ID was ported
+  to [over 20 programming languages](https://github.com/ai/nanoid/blob/main/README.md#other-programming-languages).
+
+## How to use
 
-## Use
 ```sql
-SELECT nanoid(); -- creates an id, with the defaults of the created nanoid() function.
+SELECT nanoid(); -- Simplest way to use this function. Creates an id, with the defaults of the created nanoid() function.
 SELECT nanoid(4); -- size parameter set to return 4 digit ids only
-SELECT nanoid(3, 'abcdefghij'); -- custom size and alphabet parameters defined. nanoid() generates ids concerning them
+SELECT nanoid(3, 'abcdefghij'); -- custom size and alphabet parameters defined. nanoid() generates ids concerning them.
+SELECT nanoid(10, '23456789abcdefghijklmnopqrstuvwxyz', 1.85); -- nanoid() could generates ids more performant with a custom defined additional bytes factor.
 ```
 
 ```sql
-CREATE TABLE mytable (
-  id char(21) DEFAULT nanoid() PRIMARY KEY
+CREATE TABLE mytable(
+    id char(21) DEFAULT nanoid() PRIMARY KEY
 );
 
 or
 
 -- To use a custom id size
-CREATE TABLE mytable (
-  id char(14) DEFAULT nanoid(14) PRIMARY KEY
+CREATE TABLE mytable(
+    id char(14) DEFAULT nanoid(14) PRIMARY KEY
 );
 
 or
 
 -- To use a custom id size and a custom alphabet
-CREATE TABLE mytable (
-  id char(12) DEFAULT nanoid(12, 'ABC123') PRIMARY KEY
+CREATE TABLE mytable(
+    id char(12) DEFAULT nanoid(12, 'ABC123') PRIMARY KEY
 );
 ```
 
 ## Getting Started
 
 ### Requirements
+
 * PostgreSQL 9.4 or newer
 
-Execute the file `nanoid.sql` to create the `nanoid()` function on your defined schema. The nanoid() function will only be available in the specific database where you run the SQL code provided.
+Execute the file `nanoid.sql` to create the `nanoid()` function on your defined schema. The nanoid() function will only
+be available in the specific database where you run the SQL code provided.
 
-**Manually create the function in each database:** You can connect to each database and create the function. This function can be created manually or through a script if you have many databases. Remember to manage updates to the function. If you change the function in one database, those changes will only be reflected in the other databases if you update each function.
+**Manually create the function in each database:** You can connect to each database and create the function. This
+function can be created manually or through a script if you have many databases. Remember to manage updates to the
+function. If you change the function in one database, those changes will only be reflected in the other databases if you
+update each function.
 
-### Adding to the default template database
+## Adding to the default template database
 
-**Use a template database:** If you often create databases that need to have the same set of functions, you could create a template database that includes these functions. Then, when you create a new database, you can specify this template, and PostgreSQL will make the new database a copy of the template.
+**Use a template database:** If you often create databases that need to have the same set of functions, you could create
+a template database that includes these functions. Then, when you create a new database, you can specify this template,
+and PostgreSQL will make the new database a copy of the template.
 
 Here's how to do that:
+
 1. Connect to template1 database:
 2. Then, run your nanoid() function creation code.
 
-*If the function is only needed for specific applications, it might be better to create it manually in each database where needed or create a custom template database that includes this function and use that template when creating new databases for these applications.*
+*If the function is only needed for specific applications, it might be better to create it manually in each database
+where needed or create a custom template database that includes this function and use that template when creating new
+databases for these applications.*
 
-Also, note that changes to template1 won't affect existing databases, only new ones created after the changes. Existing databases will need to have the function added manually if required.
+Also, note that changes to template1 won't affect existing databases, only new ones created after the changes. Existing
+databases will need to have the function added manually if required.
 
 Reference: [Template Databases](https://www.postgresql.org/docs/current/manage-ag-templatedbs.html)
 
+## Calculating the additional bytes factor for a custom alphabet
+
+If you change the alphabet of the `nanoid()` function, you could optimize the performance by calculating a new additional 
+bytes factor with the following SQL statement:
+
+```sql
+WITH input as (SELECT '23456789abcdefghijklmnopqrstuvwxyz' as alphabet)
+SELECT round(1 + abs((((2 << cast(floor(log(length(alphabet) - 1) / log(2)) as int)) - 1) - length(alphabet)::numeric) / length(alphabet)), 2) as "Optimal additional bytes factor"
+FROM input;
+
+-- The resulting value can then be used f.e. as follows:
+SELECT nanoid(10, '23456789abcdefghijklmnopqrstuvwxyz', 1.85);
+
+```
+
+Utilizing a custom-calculated additional bytes factor in `nanoid()`  enhances string generation performance. This factor 
+determines how many bytes are generated in a single batch, optimizing computational efficiency. Generating an optimal number 
+of bytes per batch minimizes redundant operations and conserves memory.
+
+## Usage Guide: `nanoid_optimized()`
+
+The `nanoid_optimized()` function is an advanced version of the `nanoid()` function designed for higher performance and
+lower memory overhead. While it provides a more efficient mechanism to generate unique identifiers, it assumes that you
+know precisely how you want to use it.
+
+🚫 **Warning**: No checks are performed inside `nanoid_optimized()`. Use it only if you're sure about the parameters you'
+re passing.
+
+### Function Signature
+
+```sql
+nanoid_optimized(
+    size int,
+    alphabet text,
+    mask int,
+    step int
+) RETURNS text;
+```
+
+### Parameters
+
+- `size`: The desired length of the generated string.
+- `alphabet`: The set of characters to choose from for generating the string.
+- `mask`: The mask used for mapping random bytes to alphabet indices. The value should be `(2^n) - 1`, where `n` is a
+  power of 2 less than or equal to the alphabet size.
+- `step`: The number of random bytes to generate in each iteration. A larger value might speed up the function but will
+  also increase memory usage.
+
+### Example Usage
+
+Generate a NanoId String of length 10 using the default alphabet set:
+
+```sql
+SELECT nanoid_optimized(10, '_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', 63, 16);
+```
+
+### Tips:
+
+- **Performance**: This function is optimized for performance, so it's ideal for scenarios where high-speed ID
+  generation is needed.
+- **Alphabet Set**: The larger your alphabet set, the more unique your generated IDs will be, but also consider the mask
+  and step parameters' adjustments.
+- **Customization**: Feel free to adjust the parameters to suit your specific needs, but always be cautious about the
+  values you're inputting.
+
+By following this guide, you can seamlessly integrate the `nanoid_optimized()` function into your projects and enjoy the
+benefits of its optimized performance.
+
 ## Using MySQL/MariaDB?
 
-If you're using MySQL or MariaDB and you found this library helpful, we have a similar library for MySQL/MariaDB, too! Check out our [Nano ID for MySQL/MariaDB](https://github.com/viascom/nanoid-mysql-mariadb) repository to use the same capabilities in your MySQL/MariaDB databases.
+If you're using MySQL or MariaDB and you found this library helpful, we have a similar library for MySQL/MariaDB, too!
+Check out our [Nano ID for MySQL/MariaDB](https://github.com/viascom/nanoid-mysql-mariadb) repository to use the same
+capabilities in your MySQL/MariaDB databases.
+
+## 🌱 Contributors Welcome
+
+- 🐛 **Encountered a Bug?** Let us know with an issue. Every bit of feedback helps enhance the project.
+
+- 💡 **Interested in Contributing Code?** Simply fork and submit a pull request. Every contribution, no matter its size, is valued.
+
+- 📣 **Have Some Ideas?** We're always open to suggestions. Initiate an issue for discussions or to share your insights.
+
+All relevant details about the project can be found in this README.
+
+Your active participation 🤝 is a cornerstone of **nanoid-postgres**. Thank you for joining us on this journey.
 
-## Authors 🖥️
+## 🖥️ Authors
 
 * **Patrick Bösch** - *Initial work* - [itsmefox](https://github.com/itsmefox)
 * **Nikola Stanković** - *Initial work* - [nik-sta](https://github.com/nik-sta)
 
-See also the list of [contributors](https://github.com/viascom/nanoid-postgres/contributors) who participated in this project. 💕
+See also the list of [contributors](https://github.com/viascom/nanoid-postgres/contributors) who participated in this
+project. 💕
 
 ## License
 
diff --git a/dev/benchmark/a_nanoid.sql b/dev/benchmark/a_nanoid.sql
new file mode 100644
index 0000000..efdc66a
--- /dev/null
+++ b/dev/benchmark/a_nanoid.sql
@@ -0,0 +1 @@
+-- PLACE IMPLEMENTATION A 'a_nanoid()' HERE
\ No newline at end of file
diff --git a/dev/benchmark/b_nanoid.sql b/dev/benchmark/b_nanoid.sql
new file mode 100644
index 0000000..e845964
--- /dev/null
+++ b/dev/benchmark/b_nanoid.sql
@@ -0,0 +1 @@
+-- PLACE IMPLEMENTATION B 'b_nanoid()' HERE
\ No newline at end of file
diff --git a/dev/benchmark/benchmarks.sql b/dev/benchmark/benchmarks.sql
new file mode 100644
index 0000000..36c34ba
--- /dev/null
+++ b/dev/benchmark/benchmarks.sql
@@ -0,0 +1,60 @@
+DO
+$$
+    DECLARE
+        startTime   timestamp;
+        endTime     timestamp;
+        durationA   interval;
+        durationB   interval;
+        numLoops    int := 100000;
+        counter     int;
+        dummyResult text;
+    BEGIN
+
+        -- Benchmarking A nanoid()
+        RAISE NOTICE '-----------------------------';
+        RAISE NOTICE 'Starting benchmark for A nanoid() for % loops...', numLoops;
+        startTime := clock_timestamp();
+        FOR counter IN 1..numLoops
+            LOOP
+                dummyResult := a_nanoid();
+                dummyResult := a_nanoid(5, '23456789abcdefghijklmnopqrstuvwxyz');
+                dummyResult := a_nanoid(11, '_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,');
+                dummyResult := a_nanoid(48, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ');
+            END LOOP;
+        endTime := clock_timestamp();
+        durationA := endTime - startTime;
+        RAISE NOTICE 'A nanoid() took %', durationA;
+        RAISE NOTICE '-----------------------------';
+
+        -- Benchmarking B nanoid()
+        RAISE NOTICE 'Starting benchmark for B nanoid() for % loops...', numLoops;
+        startTime := clock_timestamp();
+        FOR counter IN 1..numLoops
+            LOOP
+                dummyResult := b_nanoid();
+                dummyResult := b_nanoid(5, '23456789abcdefghijklmnopqrstuvwxyz');
+                dummyResult := b_nanoid(11, '_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,');
+                dummyResult := b_nanoid(48, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ');
+            END LOOP;
+        endTime := clock_timestamp();
+        durationB := endTime - startTime;
+        RAISE NOTICE 'B nanoid() took %', durationB;
+        RAISE NOTICE '-----------------------------';
+
+        -- Compare
+        IF durationA < durationB THEN
+            RAISE NOTICE 'A nanoid() is faster by %', durationB - durationA;
+        ELSIF durationA > durationB THEN
+            RAISE NOTICE 'B nanoid() is faster by %', durationA - durationB;
+        ELSE
+            RAISE NOTICE 'Both functions have comparable performance.';
+        END IF;
+
+        RAISE NOTICE '-----------------------------';
+
+    END
+$$;
+
+-- TODO:
+-- EXTRACT(EPOCH FROM (timestamp1 - timestamp2))
+-- ROUND()
\ No newline at end of file
diff --git a/dev/test/unit_tests.sql b/dev/test/unit_tests.sql
new file mode 100644
index 0000000..865f99f
--- /dev/null
+++ b/dev/test/unit_tests.sql
@@ -0,0 +1,73 @@
+DO
+$$
+    DECLARE
+        generated_id text;
+        counter      int;
+        numLoops     int := 1000;
+    BEGIN
+        -- Default parameters
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid();
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 21, 'Default nanoid length is incorrect';
+                ASSERT generated_id ~ '^[-_a-zA-Z0-9]*$', 'Default nanoid contains invalid characters';
+            END LOOP;
+
+        -- Size 12, default alphabet
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid(12);
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 12, 'Size 12 nanoid length is incorrect';
+                ASSERT generated_id ~ '^[-_a-zA-Z0-9]*$', 'Size 12 nanoid contains invalid characters';
+            END LOOP;
+
+        -- Size 25, default alphabet
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid(25);
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 25, 'Size 25 nanoid length is incorrect';
+                ASSERT generated_id ~ '^[-_a-zA-Z0-9]*$', 'Size 25 nanoid contains invalid characters';
+            END LOOP;
+
+        -- Default size (21), custom alphabet (only lowercase)
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid(21, 'abcdefghijklmnopqrstuvwxyz');
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 21, 'Size 21 (only lowercase) nanoid length is incorrect';
+                ASSERT generated_id ~ '^[a-z]*$', 'Size 21 (only lowercase) nanoid contains invalid characters';
+            END LOOP;
+
+        -- Size 15, custom alphabet (only numbers)
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid(15, '0123456789');
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 15, 'Size 15 (only numbers) nanoid length is incorrect';
+                ASSERT generated_id ~ '^[0-9]*$', 'Size 15 (only numbers) nanoid contains invalid characters';
+            END LOOP;
+
+        -- Size 17, custom alphabet (uppercase + numbers)
+        FOR counter IN 1..numLoops
+            LOOP
+                generated_id := nanoid(17, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789');
+                RAISE NOTICE '%', generated_id;
+                ASSERT LENGTH(generated_id) = 17, 'Size 17 (uppercase + numbers) nanoid length is incorrect';
+                ASSERT generated_id ~ '^[A-Z0-9]*$', 'Size 17 (uppercase + numbers) nanoid contains invalid characters';
+            END LOOP;
+
+        --         -- Intentional false positive: use default size but with a mismatched regex pattern
+--         FOR counter IN 1..numLoops
+--             LOOP
+--                 generated_id := nanoid();
+--                 RAISE NOTICE '%', generated_id;
+--                 -- This will fail because we're purposefully using a wrong pattern
+--                 ASSERT generated_id ~ '^[XYZ]*$', 'Intentional false positive detected';
+--             END LOOP;
+
+        RAISE NOTICE 'All tests passed successfully!';
+    END
+$$;
\ No newline at end of file
diff --git a/nanoid.sql b/nanoid.sql
index 63e6bcc..6f8c448 100644
--- a/nanoid.sql
+++ b/nanoid.sql
@@ -21,13 +21,70 @@
 
 CREATE EXTENSION IF NOT EXISTS pgcrypto;
 
+-- The `nanoid()` function generates a compact, URL-friendly unique identifier.
+-- Based on the given size and alphabet, it creates a randomized string that's ideal for
+-- use-cases requiring small, unpredictable IDs (e.g., URL shorteners, generated file names, etc.).
+-- While it comes with a default configuration, the function is designed to be flexible,
+-- allowing for customization to meet specific needs.
+DROP FUNCTION IF EXISTS nanoid(int, text, float);
 CREATE OR REPLACE FUNCTION nanoid(
-    size int DEFAULT 21,
-    alphabet text DEFAULT '_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    size int DEFAULT 21, -- The number of symbols in the NanoId String. Must be greater than 0.
+    alphabet text DEFAULT '_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', -- The symbols used in the NanoId String. Must contain between 1 and 255 symbols.
+    additionalBytesFactor float DEFAULT 1.6 -- The additional bytes factor used for calculating the step size. Must be equal or greater then 1.
 )
-    RETURNS text
+    RETURNS text -- A randomly generated NanoId String
     LANGUAGE plpgsql
-    volatile
+    VOLATILE
+    LEAKPROOF
+    PARALLEL SAFE
+AS
+$$
+DECLARE
+    alphabetArray  text[];
+    alphabetLength int := 64;
+    mask           int := 63;
+    step           int := 34;
+BEGIN
+    IF size IS NULL OR size < 1 THEN
+        RAISE EXCEPTION 'The size must be defined and greater than 0!';
+    END IF;
+
+    IF alphabet IS NULL OR length(alphabet) = 0 OR length(alphabet) > 255 THEN
+        RAISE EXCEPTION 'The alphabet can''t be undefined, zero or bigger than 255 symbols!';
+    END IF;
+
+    IF additionalBytesFactor IS NULL OR additionalBytesFactor < 1 THEN
+        RAISE EXCEPTION 'The additional bytes factor can''t be less than 1!';
+    END IF;
+
+    alphabetArray := regexp_split_to_array(alphabet, '');
+    alphabetLength := array_length(alphabetArray, 1);
+    mask := (2 << cast(floor(log(alphabetLength - 1) / log(2)) as int)) - 1;
+    step := cast(ceil(additionalBytesFactor * mask * size / alphabetLength) AS int);
+
+    IF step > 1024 THEN
+        step := 1024; -- The step size % can''t be bigger then 1024!
+    END IF;
+
+    RETURN nanoid_optimized(size, alphabet, mask, step);
+END
+$$;
+
+-- Generates an optimized random string of a specified size using the given alphabet, mask, and step.
+-- This optimized version is designed for higher performance and lower memory overhead.
+-- No checks are performed! Use it only if you really know what you are doing.
+DROP FUNCTION IF EXISTS nanoid_optimized(int, text, int, int);
+CREATE OR REPLACE FUNCTION nanoid_optimized(
+    size int, -- The desired length of the generated string.
+    alphabet text, -- The set of characters to choose from for generating the string.
+    mask int, -- The mask used for mapping random bytes to alphabet indices. Should be `(2^n) - 1` where `n` is a power of 2 less than or equal to the alphabet size.
+    step int -- The number of random bytes to generate in each iteration. A larger value may speed up the function but increase memory usage.
+)
+    RETURNS text -- A randomly generated NanoId String
+    LANGUAGE plpgsql
+    VOLATILE
+    LEAKPROOF
+    PARALLEL SAFE
 AS
 $$
 DECLARE
@@ -36,31 +93,23 @@ DECLARE
     bytes          bytea;
     alphabetIndex  int;
     alphabetArray  text[];
-    alphabetLength int;
-    mask           int;
-    step           int;
+    alphabetLength int  := 64;
 BEGIN
     alphabetArray := regexp_split_to_array(alphabet, '');
     alphabetLength := array_length(alphabetArray, 1);
-    mask := (2 << cast(floor(log(alphabetLength - 1) / log(2)) as int)) - 1;
-    step := cast(ceil(1.6 * mask * size / alphabetLength) AS int);
-
-    while true
-        loop
-            bytes := gen_random_bytes(step);
-            while counter < step
-                loop
-                    alphabetIndex := (get_byte(bytes, counter) & mask) + 1;
-                    if alphabetIndex <= alphabetLength then
-                        idBuilder := idBuilder || alphabetArray[alphabetIndex];
-                        if length(idBuilder) = size then
-                            return idBuilder;
-                        end if;
-                    end if;
-                    counter := counter + 1;
-                end loop;
 
-            counter := 0;
-        end loop;
+    LOOP
+        bytes := gen_random_bytes(step);
+        FOR counter IN 0..step - 1
+            LOOP
+                alphabetIndex := (get_byte(bytes, counter) & mask) + 1;
+                IF alphabetIndex <= alphabetLength THEN
+                    idBuilder := idBuilder || alphabetArray[alphabetIndex];
+                    IF length(idBuilder) = size THEN
+                        RETURN idBuilder;
+                    END IF;
+                END IF;
+            END LOOP;
+    END LOOP;
 END
-$$;
\ No newline at end of file
+$$;