text-encoding/TextEncoderPolyfill.ts

/* eslint-disable no-bitwise */
/**
 * TextEncoder Polyfill Implementation
 *
 * React Native compatible UTF-8 text encoder
 * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
 *
 * @module text-encoding@1.0.0
 */

import type { ITextEncoder, TextEncoderEncodeIntoResult } from './interfaces';

/**
 * UTF-8 encoder exposing the `encode` / `encodeInto` surface of the WHATWG
 * `TextEncoder`.
 *
 * Input strings are treated as sequences of UTF-16 code units. Valid surrogate
 * pairs are combined into a single code point; any unpaired surrogate is
 * encoded as U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD).
 */
export class TextEncoderPolyfill implements ITextEncoder {
  readonly encoding = 'utf-8';

  /**
   * Encode a string to a freshly allocated array of UTF-8 bytes.
   *
   * @param input - Text to encode; defaults to the empty string.
   * @returns A new `Uint8Array` whose length is exactly the encoded size.
   */
  encode(input: string = ''): Uint8Array {
    // One UTF-16 code unit never yields more than 3 bytes (a surrogate pair is
    // 2 units -> 4 bytes), so this scratch buffer always has enough room and
    // encodeInto consumes the entire input.
    const scratch = new Uint8Array(input.length * 3);
    const { written } = this.encodeInto(input, scratch);

    // slice() copies into an exactly-sized buffer, dropping the unused tail.
    return scratch.slice(0, written);
  }

  /**
   * Encode as much of `source` as fits into `destination`, starting at
   * offset 0.
   *
   * Encoding stops before the first character whose full byte sequence would
   * not fit, so a partial sequence is never written.
   *
   * @param source - Text to encode.
   * @param destination - Buffer that receives the UTF-8 bytes.
   * @returns `read`: number of UTF-16 code units consumed from `source`
   *   (a surrogate pair counts as 2); `written`: number of bytes stored in
   *   `destination`.
   */
  encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
    const capacity = destination.length;
    let read = 0;
    let written = 0;

    while (read < source.length) {
      const codePoint = TextEncoderPolyfill.scalarValueAt(source, read);
      const size = TextEncoderPolyfill.utf8Length(codePoint);

      // Stop rather than emit a truncated multi-byte sequence.
      if (written + size > capacity) {
        break;
      }

      if (size === 1) {
        destination[written++] = codePoint;
      } else if (size === 2) {
        destination[written++] = 0xc0 | (codePoint >> 6);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else if (size === 3) {
        destination[written++] = 0xe0 | (codePoint >> 12);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else {
        destination[written++] = 0xf0 | (codePoint >> 18);
        destination[written++] = 0x80 | ((codePoint >> 12) & 0x3f);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      }

      // Only a valid surrogate pair produces a value above U+FFFF, and it
      // occupies two code units; everything else (including a lone surrogate
      // mapped to U+FFFD) occupies one.
      read += codePoint > 0xffff ? 2 : 1;
    }

    return { read, written };
  }

  /**
   * Decode the Unicode scalar value that starts at `index`.
   *
   * Returns the combined code point for a valid high+low surrogate pair, the
   * code unit itself for non-surrogates, and U+FFFD for any unpaired
   * surrogate.
   */
  private static scalarValueAt(text: string, index: number): number {
    const unit = text.charCodeAt(index);

    // Not a surrogate: the code unit is the code point.
    if (unit < 0xd800 || unit > 0xdfff) {
      return unit;
    }

    // High surrogate followed by a low surrogate: combine them.
    if (unit <= 0xdbff && index + 1 < text.length) {
      const next = text.charCodeAt(index + 1);
      if (next >= 0xdc00 && next <= 0xdfff) {
        return 0x10000 + ((unit - 0xd800) << 10) + (next - 0xdc00);
      }
    }

    // Lone high or low surrogate.
    return 0xfffd;
  }

  /** Number of bytes (1-4) the UTF-8 encoding of `codePoint` occupies. */
  private static utf8Length(codePoint: number): number {
    if (codePoint < 0x80) {
      return 1;
    }
    if (codePoint < 0x800) {
      return 2;
    }
    if (codePoint < 0x10000) {
      return 3;
    }
    return 4;
  }
}