WIP on cif2

eartharoid · Jul 1, 2024 · df8ce7d · df8ce7d
1 parent c335d76
commit df8ce7d
Show file tree

Hide file tree

Showing 12 changed files with 576 additions and 65 deletions.
diff --git a/api.js b/api.js
@@ -0,0 +1,58 @@
+const i18n = new I18nLite();
+i18n.loadParsed(
+	'en-GB',
+	ctom(
+		await fetch(`${window.location.origin}/en-GB/_common.cif`)
+	),
+	'common',
+);
+
+
+
+// new CIFLoader(i18n, 'modular') // mono | modular
+// = 
+const cif1 = new CIFLoader(i18n, {
+	id_regex: /((?<locale>[a-z0-9-_]+)\/)((_(?<namespace>[a-z0-9-_]+))|[a-z0-9-_]+)\.[a-z]+/i,
+	fetch: (path) => fetch(window.location.origin + path)
+});
+cif1.load('/en-GB/home.cif');
+cif1.load('/en-GB/_common.cif');
+
+
+const cif2 = new CIFLoader(i18n, {
+	regex: /a/i,
+	fetch: (name) => fetch(`${window.location.origin}/${locale}/${name}.cif`)
+});
+
+cif2.load('en-GB', ['home#page', '#common'])
+
+async function* foo() {
+	yield await Promise.resolve('a');
+	yield await Promise.resolve('b');
+	yield await Promise.resolve('c');
+}
+
+let str = '';
+
+async function generate() {
+	for await (const val of foo()) {
+		str = str + val;
+	}
+	console.log(str);
+}
+
+generate();
+
+res = await fetch("https://static.eartharoid.me/test.cif?t34");
+i = 0;
+// reader = res.body.getReader()
+reader = res.body.pipeThrough(new TextDecoderStream()).getReader();
+try {
+	while (true) {
+		const { done, value } = await reader.read();
+		console.log(i++, value);
+		if (done) break;
+	}
+} finally {
+	reader.releaseLock();
+}
diff --git a/packages/cif/i18nb.md b/packages/cif/i18nb.md
@@ -0,0 +1,135 @@
+# I18n Binary
+
+https://developer.mozilla.org/en-US/docs/Web/API/ReadableStreamBYOBReader
+
+## Benefits
+
+- Slightly smaller
+  - Multi-digit numbers (such as placeholder positions) will use fewer bytes when stored as numbers instead of text
+  - Knowing the length of each field should be faster for streamed parsing than checking for control characters mid-string constantly
+
+## Problems
+
+1. binary won't be compressed by CDNs
+   - could be solved by integrating compression in the format (with a flag)
+     - UTF-8 encoded sections will be passed through a GZIP (de)compression stream before/after the text en/decoder
+     - this requires decompression support in the client runtime (JS instead of being handled by the browser)
+   - compression is more effective on large chunks of data. compressing each message separately may be ineffective.
+   - JS supports GZIP but not the better (10%-20% smaller) Brotli algorithm
+   - **could serve as utf-8 text** (most will be valid) but read as binary? would allow brotli compression by CDNs
+
+
+Speed is the main goal. It will already be smaller than minified input without compression.
+
+## UBJSON
+
+<https://ubjson.org/>
+
+
+## Structure
+
+- 8-bit,
+- fixed format (all fields must be present in every record)
+- `<position><data>...`
+- position points to the next pointer, at the end of the upcoming block of data
+- position can be 0 for unused fields
+- some fields store numbers in binary, others will be UTF-8 decoded 
+- for positions (length of next chunk) and flags, 8th bit (128) used for overflow indicator (for dynamic sizes)
+	- `127` can be stored in one byte: `01111111`
+	- `128`-`254` require a second byte: `01111111` + `00000001` = `128`
+	- `255` etc require more bytes
+- KV_DESCRIPTOR bytes:
+  1. descriptor byte length (including this - if 1 then field is empty)
+  2. [KEY, VALUE][] RELATIVE positions, single byte (keys and values can be no longer than 255 each)
+- LIST_DESCRIPTOR is the same but holds the relative positions/lengths of a flat list of strings instead of [KEY, VALUE] pairs
+- GROUP_DEPTH cannot be skipped (should be 0 with empty GROUP_PREFIX if not necessary)
+  - must be signed to allow negatives (±127)
+  - messages cannot be nested more than 127 levels as GROUP_DEPTH must be a single byte
+- LENGTH is in BYTES (may be more than character length)
+
+```md
+ <!-- start of file -->
+<FLAGS>
+ <!-- records -->
+<INT GROUP_DEPTH><LENGTH>[STR GROUP_PREFIX]<DESCRIPTOR>[STR[] GROUP_OPTS]<LENGTH>[STR MSG_KEY]<LENGTH>[STR MSG_TEXT]<DESCRIPTOR>[STR[] MSG_PLACEHOLDERS]
+...
+```
+
+As this would use 3 bytes (GROUP_DEPTH=0, LENGTH=0, DESCRIPTOR=1) on every record even when no group change occurs,
+perhaps there should be 2 record types with a single byte at the start to say which it is?
+
+```md
+ <!-- start of file -->
+<TYPE 0><FLAGS>
+ <!-- records -->
+<TYPE 1><INT GROUP_DEPTH><LENGTH>[STR GROUP_PREFIX]<DESCRIPTOR>[STR[] GROUP_OPTS]
+<!-- or -->
+<TYPE 2><LENGTH>[STR MSG_KEY]<LENGTH>[STR MSG_TEXT]<DESCRIPTOR>[STR[] MSG_PLACEHOLDERS]
+...
+```
+
+or even better, start group types with 0:
+
+
+```md
+ <!-- start of file -->
+<VERSION>
+ <!-- records -->
+<00000000><GROUP_DEPTH><LIST_DESCRIPTOR>[GROUP_PREFIX]<KV_DESCRIPTOR>[GROUP_OPTS]
+<!-- or -->
+<LENGTH>[MSG_KEY]<LENGTH>[MSG_TEXT]<KV_DESCRIPTOR>[MSG_PLACEHOLDERS]
+...
+```
+
+because MSG_KEY is required so the first byte on a message record can never be 0.
+
+This is most similar to CIF 2.0.
+
+### GROUP_OPTS **keys**
+
+- keys and values are UTF-8 strings
+- If the first and only byte of a **key** is <= 31, it is a built-in property
+  - 0 = cardinal
+  - 1 = ordinal
+
+### MSG_PLACEHOLDERS **values**
+
+- UTF-8 string.
+- If first byte<=31 (`NUL`-`US`, 32 being the first printable character), it is an internal getter with special format.
+  - 0 = $t:
+    ```md
+    <TYPE><GETTER_KEY>
+    ```
+    e.g.
+    ```js
+    00 0A 63 69 72 63 75 6C 61 72 5F 31 // (12 B)
+    ->
+    {"g":"$t","d":{"k":"circular_1"}} // (33 B)
+    ```
+      - TYPE is 1 byte so no LENGTH necessary for GETTER_KEY
+- If first byte is 123 (`{`), it is a non-standard getter represented as stringified JSON (big and slow, but flexible).
+
+## Types
+
+Integers are 8-bit unless FlexiByte.
+Strings are UTF-8 encoded.
+
+```js
+VERSION: FlexiByte Int (varint)
+GROUP_DEPTH: Int
+LIST_DESCRIPTOR: [Int, ...Int]
+  // length, Str relative positions/lengths
+GROUP_PREFIX: [...Str]
+KV_DESCRIPTOR: [Int, ...Flexibyte Int]
+  // GROUP_OPTS: length, Str lengths
+  // MSG_PLACEHOLDERS: length, Int length, Str length
+LENGTH: FlexiByte Int
+MSG_KEY: Str
+MSG_TEXT: Str
+GETTER_KEY: Str
+```
+
+
+## varints
+
+https://protobuf.dev/programming-guides/encoding/#varints
diff --git a/packages/cif/package.json b/packages/cif/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@eartharoid/cif",
-	"version": "1.0.0-alpha.1",
+	"version": "2.0.0-alpha.1",
 	"description": "Convert to and from an efficient i18n message file format",
 	"main": "dist/index.js",
 	"type": "module",

diff --git a/packages/cif/spec/parser.pseudo b/packages/cif/spec/parser.pseudo
@@ -0,0 +1,55 @@
+// Control characters referenced by the parser.
+set control = {
+	NUL	= "\x00" // Null
+	HT	= "\x09" // Horizontal Tabulation (\t)
+	VT	= "\x0B" // Vertical Tabulation (\v)
+	GS	= "\x1d" // Group Separator
+	RS	= "\x1e" // Record Separator
+	US	= "\x1f" // Unit Separator
+}
+
+// Top-level parser state; parse() switches on this for every character.
+set modes = {
+	UNKNOWN	= 0
+	META	= 1
+}
+
+// Sub-state within a mode; parse() switches on this inside the META mode.
+set stages = {
+	BUF	= 0
+	EOS	= 1 
+}
+
+// Streaming parser sketch: consumes text chunks one character at a time.
+// Only the META record is handled so far: characters are buffered until the
+// first RS, the buffer is parsed as a query string, and the version is checked.
+// Throws "Unsupported CIF version" when meta.version is not "2".
+function parse(chunks) {
+	set mode = modes.META
+	set stage = stages.BUF
+	// string, not a literal buffer
+	set buffer = ""
+
+	foreach (chunk in chunks) {
+		// start at 0 so the first character of every chunk is read
+		for (set i = 0, set length = chunk.length; i < length; i++) {
+			set char = chunk[i]
+			switch (mode) {
+				// NOTE(review): UNKNOWN currently falls through to META, so records
+				// after the meta record are treated as meta too - confirm intent.
+				case modes.UNKNOWN:
+				case modes.META: {
+					switch (stage) {
+						case stages.BUF: {
+							if (char === control.RS) {
+								set meta = parseQueryString(buffer)
+								if (meta.version !== "2") {
+									throw "Unsupported CIF version"
+								}
+								buffer = ""
+								mode = modes.UNKNOWN
+							} else {
+								buffer += char
+							}
+							break
+						}
+						case stages.EOS: {
+							break
+						} 
+					}
+					break
+				}
+			}
+		}
+	}
+}
diff --git a/packages/cif/src/control.ts b/packages/cif/src/control.ts
@@ -1,5 +1,7 @@
 export default {
 	NUL: '\x00',
+	HT: '\x09',
+	// VT: '\x0B',
 	GS: '\x1d',
 	RS: '\x1e',
 	US: '\x1f',

diff --git a/packages/cif/src/ctom.ts b/packages/cif/src/ctom.ts
@@ -31,7 +31,7 @@ export default function ctom(cif: string): ParsedMessages {
 			} else {
 				m = { t: fields[1] };
 				if (parts.length > 1) {
-					m.p =[];
+					m.p = [];
 					for (let i = 1; i < parts.length; i += 2) {
 						const pos = Number(parts[i]);
 						const name = parts[i + 1];

diff --git a/packages/cif/src/mtoc.ts b/packages/cif/src/mtoc.ts
@@ -2,33 +2,71 @@ import type { ExtractedMessageObject, ParsedMessages } from '@eartharoid/i18n/ty
 import control from './control.js';
 
 export default function mtoc(messages: ParsedMessages): string {
-	const sorted = messages.sort(([a], [b]) => a.split('.').length - b.split('.').length);
-	let cif = 'version=1' + control.RS;
-	let prefix = '';
-	for (const [key, value] of sorted) {
-		const parts = key.split('.');
-		if (parts.length > 1) {
-			const c_prefix = parts.slice(0, -1).join('.');
-			if (c_prefix !== prefix) {
-				prefix = c_prefix;
-				cif += control.GS + c_prefix + control.RS;
+	let cif = '';
+	let prefix_parts = [];
+	for (let i = 0; i < messages.length; i++) {
+		const [key, value] = messages[i];
+		const key_parts = key.split('.');
+		if (key_parts.length > 1) {
+			let depth = null;
+			for (let p = 0; p < key_parts.length - 1; p++) {
+				if (key_parts[p] !== prefix_parts[p]) {
+					depth = p;
+					cif += control.GS + depth + control.US;
+					continue;
+				}
 			}
+
+			if (depth !== null)  {
+				const new_parts = key_parts.slice(depth, key_parts.length - ('q' in value ? 0 : 1));
+				prefix_parts = [
+					...prefix_parts.slice(0, depth),
+					...new_parts,
+				];
+				cif += new_parts.join('.');
+				if ('q' in value) {
+					// TODO: cardinal -> #
+					cif += control.US + Object.entries(value.q).map(([k, v]) => k + control.HT + v).join(control.HT);
+				}
+				cif += control.RS;
+				console.log({depth, new_parts});
+			}
+		} else if (prefix_parts.length > 0) {
+			prefix_parts = [];
+			cif += control.GS + '-1' + control.RS;
 		}
-		cif += prefix.length > 0 ? key.slice(prefix.length + 1) : key;
+		/** FIXME: change -1 behaviour?
+{
+  key: 'placeholder_getters.together',
+  key_parts: [ 'placeholder_getters', 'together' ],
+  prefix_parts: [ 'placeholder_getters', 'girls' ],
+  revised_key: 'er',
+  q: false
+}
+		 */
+		console.log({
+			key,
+			key_parts,
+			prefix_parts,
+			revised_key: prefix_parts.length > 0 ? key.slice(prefix_parts.join('.').length + 1) : key,
+			q: 'q' in value && value.q,
+		}, '\n');
+		if ('q' in value) continue;
+		const trimmed_key = prefix_parts.length > 0 ? key.slice(prefix_parts.join('.').length + 1) : key;
+		cif += trimmed_key + control.US + (<ExtractedMessageObject>value).t;
 		if ('p' in value) {
-			value.p.forEach(([pos, data]) => {
-				let name: string;
-				if ('v' in data) name = data.v;
-				else name = JSON.stringify(data);	
-				cif += '\t' + pos + '\t' + name;
-			});
-		}
-		if ('q' in value) {
-			cif += control.US + control.NUL + new URLSearchParams(value.q).toString() + control.RS;
-		} else { // if ('t' in value)
-			cif += control.US + (<ExtractedMessageObject>value).t + control.RS;
+			const placeholders = value.p
+				.map(([pos, data]) => {
+					let name: string;
+					if ('v' in data) name = data.v;
+					else name = JSON.stringify(data);
+					return pos + control.HT + name;
+				})
+				.join(control.HT);
+			cif += control.US + placeholders;
 		}
+		if (i !== messages.length - 1) cif += control.RS;
 
 	}
-	return cif.slice(0, -1); // remove trailing new line
+	return cif;
 }