Initial commit: Text encoding component with UTF-8 polyfills

🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-29 14:54:44 +02:00
commit 3342f7e40b
10 changed files with 1430 additions and 0 deletions
--- a/TextDecoderPolyfill.ts
+++ b/TextDecoderPolyfill.ts
@@ -0,0 +1,187 @@
+/* eslint-disable no-bitwise */
+/**
+ * TextDecoder Polyfill Implementation
+ *
+ * React Native compatible UTF-8 text decoder
+ * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
+ *
+ * @module text-encoding@1.0.0
+ */
+
+import type {
+  ITextDecoder,
+  TextDecodeOptions,
+  TextDecoderOptions,
+} from './interfaces';
+
+/**
+ * UTF-8 `TextDecoder` replacement for runtimes that lack a native one.
+ *
+ * Decoding follows the WHATWG Encoding Standard UTF-8 decoder: every maximal
+ * ill-formed byte subsequence yields exactly one U+FFFD (or a `TypeError`
+ * when `fatal` is set), and decoding resumes at the first byte that is not
+ * part of that subsequence, so valid data following garbage is never lost.
+ *
+ * With `{ stream: true }` an incomplete trailing sequence is carried over to
+ * the next `decode()` call. A call without `stream` flushes any carried
+ * bytes and resets the decoder for a new stream.
+ */
+export class TextDecoderPolyfill implements ITextDecoder {
+  /** Labels that the Encoding Standard maps to UTF-8. */
+  private static readonly UTF8_LABELS: ReadonlySet<string> = new Set([
+    'unicode-1-1-utf-8',
+    'unicode11utf8',
+    'unicode20utf8',
+    'utf-8',
+    'utf8',
+    'x-unicode20utf8',
+  ]);
+
+  readonly encoding: string;
+  readonly fatal: boolean;
+  readonly ignoreBOM: boolean;
+
+  /** Bytes of a trailing incomplete sequence kept between streaming calls (at most 3). */
+  private pendingBytes: number[] = [];
+
+  /** True once the first character of the current stream has been inspected for a BOM. */
+  private bomChecked = false;
+
+  /**
+   * @param label Encoding label; only UTF-8 labels are accepted. Matching is
+   *   case-insensitive and ignores surrounding ASCII whitespace.
+   * @param options `fatal`: throw instead of emitting U+FFFD on malformed
+   *   input. `ignoreBOM`: keep a leading byte order mark in the output.
+   * @throws {RangeError} If `label` does not name UTF-8.
+   */
+  constructor(label: string = 'utf-8', options: TextDecoderOptions = {}) {
+    const normalized = label.replace(/^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '').toLowerCase();
+    if (!TextDecoderPolyfill.UTF8_LABELS.has(normalized)) {
+      throw new RangeError(`Unsupported encoding: ${label}. Only UTF-8 is supported.`);
+    }
+
+    this.encoding = 'utf-8';
+    this.fatal = Boolean(options.fatal);
+    this.ignoreBOM = Boolean(options.ignoreBOM);
+  }
+
+  /**
+   * Decode UTF-8 bytes into a string.
+   *
+   * @param input Bytes to decode; `null`/`undefined` decodes nothing, which
+   *   is how a stream is flushed.
+   * @param options `stream: true` marks `input` as a chunk with more to
+   *   follow; an incomplete trailing sequence is then held back rather than
+   *   reported as an error.
+   * @returns The decoded text. A leading BOM is removed unless `ignoreBOM`
+   *   was set; this happens only at the start of a stream, not per chunk.
+   * @throws {TypeError} If `fatal` is set and the input is not well-formed.
+   */
+  decode(input?: ArrayBufferView | ArrayBuffer | null, options: TextDecodeOptions = {}): string {
+    const streaming = Boolean(options.stream);
+    let bytes = TextDecoderPolyfill.toBytes(input);
+
+    // Prepend whatever the previous streaming call could not finish.
+    if (this.pendingBytes.length > 0) {
+      const merged = new Uint8Array(this.pendingBytes.length + bytes.length);
+      merged.set(this.pendingBytes, 0);
+      merged.set(bytes, this.pendingBytes.length);
+      bytes = merged;
+      this.pendingBytes = [];
+    }
+
+    const length = bytes.length;
+    let result = '';
+    let i = 0;
+
+    while (i < length) {
+      const lead = bytes[i];
+
+      if (lead < 0x80) {
+        // 1-byte sequence (0xxxxxxx) - ASCII
+        result += String.fromCharCode(lead);
+        i++;
+        continue;
+      }
+
+      // The allowed range of the FIRST continuation byte depends on the lead
+      // byte; narrowing it here rejects overlong forms, surrogates and code
+      // points above U+10FFFF before any further byte is consumed.
+      let continuationCount: number;
+      let codePoint: number;
+      let lower = 0x80;
+      let upper = 0xbf;
+
+      if (lead >= 0xc2 && lead <= 0xdf) {
+        // 2-byte sequence (110xxxxx 10xxxxxx); 0xC0/0xC1 would be overlong
+        continuationCount = 1;
+        codePoint = lead & 0x1f;
+      } else if (lead >= 0xe0 && lead <= 0xef) {
+        // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+        continuationCount = 2;
+        codePoint = lead & 0x0f;
+        if (lead === 0xe0) {
+          lower = 0xa0; // below this the encoding is overlong
+        } else if (lead === 0xed) {
+          upper = 0x9f; // above this lies the surrogate range U+D800-U+DFFF
+        }
+      } else if (lead >= 0xf0 && lead <= 0xf4) {
+        // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+        continuationCount = 3;
+        codePoint = lead & 0x07;
+        if (lead === 0xf0) {
+          lower = 0x90; // below this the encoding is overlong
+        } else if (lead === 0xf4) {
+          upper = 0x8f; // above this the code point exceeds U+10FFFF
+        }
+      } else {
+        // Stray continuation byte or a byte that can never start a sequence
+        result += this.malformed(`Invalid UTF-8 sequence start byte: 0x${lead.toString(16)}`);
+        i++;
+        continue;
+      }
+
+      // Consume continuation bytes for as long as they are valid.
+      let next = i + 1;
+      let consumed = 0;
+      let broken = false;
+      while (consumed < continuationCount && next < length) {
+        const byte = bytes[next];
+        if (byte < lower || byte > upper) {
+          broken = true;
+          break;
+        }
+        lower = 0x80;
+        upper = 0xbf;
+        codePoint = (codePoint << 6) | (byte & 0x3f);
+        next++;
+        consumed++;
+      }
+
+      if (broken) {
+        // One replacement for the bytes consumed so far; the offending byte
+        // is NOT skipped - it may itself begin a valid sequence.
+        result += this.malformed('Invalid UTF-8 continuation byte');
+        i = next;
+        continue;
+      }
+
+      if (consumed < continuationCount) {
+        // Input ended inside a sequence that is valid so far.
+        if (streaming) {
+          this.pendingBytes = Array.from(bytes.subarray(i));
+        } else {
+          result += this.malformed('Incomplete UTF-8 sequence at end of input');
+        }
+        break;
+      }
+
+      if (codePoint < 0x10000) {
+        // BMP character
+        result += String.fromCharCode(codePoint);
+      } else {
+        // Supplementary character - encode as a UTF-16 surrogate pair
+        const offset = codePoint - 0x10000;
+        result += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
+      }
+      i = next;
+    }
+
+    // EF BB BF is the only byte sequence that decodes to U+FEFF, so checking
+    // the first character ever produced for a stream is equivalent to
+    // sniffing its first three bytes - even when they arrive in separate chunks.
+    if (!this.bomChecked && result.length > 0) {
+      this.bomChecked = true;
+      if (!this.ignoreBOM && result.charCodeAt(0) === 0xfeff) {
+        result = result.slice(1);
+      }
+    }
+
+    if (!streaming) {
+      // End of stream: the next call starts a fresh one.
+      this.bomChecked = false;
+    }
+
+    return result;
+  }
+
+  /**
+   * Report a decoding error according to the `fatal` setting.
+   *
+   * @returns U+FFFD when the decoder is not fatal.
+   * @throws {TypeError} With `message` when the decoder is fatal.
+   */
+  private malformed(message: string): string {
+    if (this.fatal) {
+      throw new TypeError(message);
+    }
+    return '\uFFFD';
+  }
+
+  /**
+   * View `input` as bytes without copying where possible. Plain arrays are
+   * accepted for leniency (values are wrapped to 0-255); anything
+   * unrecognised decodes as empty input.
+   */
+  private static toBytes(input: unknown): Uint8Array {
+    if (ArrayBuffer.isView(input)) {
+      return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
+    }
+    if (input instanceof ArrayBuffer) {
+      return new Uint8Array(input);
+    }
+    if (Array.isArray(input)) {
+      return Uint8Array.from(input as number[]);
+    }
+    return new Uint8Array(0);
+  }
+}