Initial commit: Text encoding component with UTF-8 polyfills

🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-29 14:54:44 +02:00
commit 3342f7e40b
10 changed files with 1430 additions and 0 deletions
--- a/TextEncoderPolyfill.ts
+++ b/TextEncoderPolyfill.ts
@@ -0,0 +1,180 @@
+/* eslint-disable no-bitwise */
+/**
+ * TextEncoder Polyfill Implementation
+ *
+ * React Native compatible UTF-8 text encoder
+ * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
+ *
+ * @module text-encoding@1.0.0
+ */
+
+import type { ITextEncoder, TextEncoderEncodeIntoResult } from './interfaces';
+
/**
 * UTF-8 text encoder for runtimes that lack a native `TextEncoder`.
 *
 * Input strings are read as UTF-16 code units. A well-formed surrogate pair is
 * combined into one supplementary code point (4 UTF-8 bytes); any unpaired
 * surrogate is encoded as U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD).
 */
export class TextEncoderPolyfill implements ITextEncoder {
  /** Name of the only encoding this encoder produces. */
  readonly encoding = 'utf-8';

  /**
   * Encode a string to a freshly allocated array of UTF-8 bytes.
   *
   * @param input - Text to encode; defaults to the empty string.
   * @returns A new `Uint8Array` whose length is exactly the number of bytes produced.
   */
  encode(input: string = ''): Uint8Array {
    // One UTF-16 code unit never expands to more than 3 UTF-8 bytes (a
    // surrogate pair is 2 units -> 4 bytes), so this scratch buffer always
    // has room for the whole input.
    const scratch = new Uint8Array(input.length * 3);
    const { written } = TextEncoderPolyfill.transcode(input, scratch);

    // Copy out only the bytes actually produced so the result owns a buffer
    // of exactly the right size.
    return new Uint8Array(scratch.subarray(0, written));
  }

  /**
   * Encode a string into an existing `Uint8Array`, starting at index 0.
   *
   * Encoding stops at the first code point whose UTF-8 sequence does not fit
   * in the remaining space; a multi-byte sequence is never written partially.
   *
   * @param source - Text to encode.
   * @param destination - Buffer that receives the UTF-8 bytes.
   * @returns `read`: number of UTF-16 code units consumed from `source`
   *          (a surrogate pair counts as 2); `written`: number of bytes
   *          stored in `destination`.
   */
  encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
    return TextEncoderPolyfill.transcode(source, destination);
  }

  /**
   * Shared UTF-16 -> UTF-8 loop used by both `encode` and `encodeInto`.
   * Writes as many complete code points as fit into `destination`.
   */
  private static transcode(
    source: string,
    destination: Uint8Array,
  ): { read: number; written: number } {
    const sourceLength = source.length;
    const capacity = destination.length;
    let read = 0;
    let written = 0;

    while (read < sourceLength) {
      const unit = source.charCodeAt(read);
      let codePoint = unit;
      let unitsConsumed = 1;

      if (unit >= 0xd800 && unit <= 0xdfff) {
        // Any surrogate is replaced with U+FFFD unless it turns out to be the
        // high half of a well-formed pair.
        codePoint = 0xfffd;
        if (unit <= 0xdbff && read + 1 < sourceLength) {
          const low = source.charCodeAt(read + 1);
          if (low >= 0xdc00 && low <= 0xdfff) {
            codePoint = 0x10000 + ((unit - 0xd800) << 10) + (low - 0xdc00);
            unitsConsumed = 2;
          }
        }
      }

      let bytesNeeded: number;
      if (codePoint < 0x80) {
        bytesNeeded = 1;
      } else if (codePoint < 0x800) {
        bytesNeeded = 2;
      } else if (codePoint < 0x10000) {
        bytesNeeded = 3;
      } else {
        bytesNeeded = 4;
      }

      // Stop before writing anything for a code point that does not fit.
      if (written + bytesNeeded > capacity) {
        break;
      }

      if (bytesNeeded === 1) {
        destination[written++] = codePoint;
      } else if (bytesNeeded === 2) {
        destination[written++] = 0xc0 | (codePoint >> 6);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else if (bytesNeeded === 3) {
        destination[written++] = 0xe0 | (codePoint >> 12);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else {
        destination[written++] = 0xf0 | (codePoint >> 18);
        destination[written++] = 0x80 | ((codePoint >> 12) & 0x3f);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      }

      // Advance past both halves of a surrogate pair so the low half is not
      // re-processed as a lone surrogate.
      read += unitsConsumed;
    }

    return { read, written };
  }
}