// text-encoding/TextDecoderPolyfill.ts

/* eslint-disable no-bitwise */
/**
 * TextDecoder Polyfill Implementation
 *
 * React Native compatible UTF-8 text decoder
 * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
 *
 * @module text-encoding@1.0.0
 */

import type {
  ITextDecoder,
  TextDecodeOptions,
  TextDecoderOptions,
} from './interfaces';

/**
 * UTF-8-only `TextDecoder` replacement for runtimes (e.g. React Native) that
 * do not ship one.
 *
 * Decoding follows the "UTF-8 decoder" algorithm of the WHATWG Encoding
 * Standard: malformed input yields exactly one U+FFFD per maximal invalid
 * subpart (or a `TypeError` when `fatal` is set), and the byte that broke a
 * sequence is re-examined rather than discarded. Decoder state is carried
 * between calls made with `{ stream: true }`, so multi-byte sequences and the
 * BOM may be split across chunks.
 */
export class TextDecoderPolyfill implements ITextDecoder {
  /** Always `'utf-8'`; the only encoding this polyfill implements. */
  readonly encoding: string;
  /** When true, malformed input throws a `TypeError` instead of producing U+FFFD. */
  readonly fatal: boolean;
  /** When true, a leading U+FEFF is kept in the output instead of being stripped. */
  readonly ignoreBOM: boolean;

  /** Labels the Encoding Standard maps to UTF-8 (matched after trimming and lowercasing). */
  private static readonly UTF8_LABELS: readonly string[] = [
    'unicode-1-1-utf-8',
    'unicode11utf8',
    'unicode20utf8',
    'utf-8',
    'utf8',
    'x-unicode20utf8',
  ];

  // --- State of the sequence currently being decoded (survives streaming calls) ---
  /** Code point bits accumulated so far. */
  private codePoint = 0;
  /** Continuation bytes consumed for the current sequence. */
  private bytesSeen = 0;
  /** Continuation bytes the current lead byte requires; 0 means "expecting a lead byte". */
  private bytesNeeded = 0;
  /** Inclusive range allowed for the NEXT continuation byte (narrowed after E0/ED/F0/F4). */
  private lowerBoundary = 0x80;
  private upperBoundary = 0xbf;

  /** True once the first code point of the current stream has been emitted. */
  private bomSeen = false;
  /** True when the previous call used `{ stream: true }`, i.e. state must be kept. */
  private doNotFlush = false;

  /**
   * @param label   Encoding label. Any UTF-8 label from the Encoding Standard is
   *                accepted, case-insensitively and ignoring surrounding ASCII whitespace.
   * @param options `fatal` and `ignoreBOM` flags; both default to false.
   * @throws RangeError if `label` does not name UTF-8.
   */
  constructor(label: string = 'utf-8', options: TextDecoderOptions = {}) {
    // The standard trims ASCII whitespace only (TAB, LF, FF, CR, SPACE), which is
    // narrower than String.prototype.trim.
    const normalized = String(label)
      .replace(/^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '')
      .toLowerCase();

    if (!TextDecoderPolyfill.UTF8_LABELS.includes(normalized)) {
      throw new RangeError(`Unsupported encoding: ${label}. Only UTF-8 is supported.`);
    }

    this.encoding = 'utf-8';
    this.fatal = Boolean(options.fatal);
    this.ignoreBOM = Boolean(options.ignoreBOM);
  }

  /**
   * Decode UTF-8 bytes to a string.
   *
   * @param input   Bytes to decode. `ArrayBuffer`, any `ArrayBufferView` and, for
   *                backward compatibility, a plain array of byte values are accepted.
   *                `null`/`undefined` decode as empty input (useful to flush a stream).
   * @param options Pass `{ stream: true }` when more chunks will follow; an
   *                incomplete trailing sequence is then held back for the next call.
   * @returns The decoded text for this chunk.
   * @throws TypeError if `fatal` is set and the input is not well-formed UTF-8.
   */
  decode(input?: ArrayBufferView | ArrayBuffer | null, options: TextDecodeOptions = {}): string {
    // A call that does not continue a stream starts from a clean decoder.
    if (!this.doNotFlush) {
      this.resetSequence();
      this.bomSeen = false;
    }
    this.doNotFlush = Boolean(options.stream);

    const bytes = TextDecoderPolyfill.toByteSequence(input);
    const length = bytes.length;
    let result = '';
    let i = 0;

    while (i < length) {
      // Mask so that out-of-range values in a plain array behave like Uint8Array elements.
      const byte = bytes[i] & 0xff;

      if (this.bytesNeeded === 0) {
        // Expecting a lead byte; it is always consumed.
        i++;
        if (byte <= 0x7f) {
          result += this.emit(byte);
        } else if (byte >= 0xc2 && byte <= 0xdf) {
          // 2-byte sequence. C0/C1 are excluded: they could only encode overlong ASCII.
          this.bytesNeeded = 1;
          this.codePoint = byte & 0x1f;
        } else if (byte >= 0xe0 && byte <= 0xef) {
          // 3-byte sequence. E0 must not be overlong; ED must not reach surrogates.
          if (byte === 0xe0) {
            this.lowerBoundary = 0xa0;
          } else if (byte === 0xed) {
            this.upperBoundary = 0x9f;
          }
          this.bytesNeeded = 2;
          this.codePoint = byte & 0x0f;
        } else if (byte >= 0xf0 && byte <= 0xf4) {
          // 4-byte sequence. F0 must not be overlong; F4 must not exceed U+10FFFF.
          if (byte === 0xf0) {
            this.lowerBoundary = 0x90;
          } else if (byte === 0xf4) {
            this.upperBoundary = 0x8f;
          }
          this.bytesNeeded = 3;
          this.codePoint = byte & 0x07;
        } else {
          // Stray continuation byte (80-BF) or a byte that can never start a sequence.
          result += this.replaceOrThrow(
            `Invalid UTF-8 sequence start byte: 0x${byte.toString(16)}`,
          );
        }
        continue;
      }

      if (byte < this.lowerBoundary || byte > this.upperBoundary) {
        // The pending sequence is broken. Report it once and deliberately do NOT
        // advance `i`: this byte may itself be a valid lead byte or ASCII.
        this.resetSequence();
        result += this.replaceOrThrow('Invalid UTF-8 continuation byte');
        continue;
      }

      // Valid continuation byte.
      i++;
      this.lowerBoundary = 0x80;
      this.upperBoundary = 0xbf;
      this.codePoint = (this.codePoint << 6) | (byte & 0x3f);
      this.bytesSeen++;

      if (this.bytesSeen === this.bytesNeeded) {
        const completed = this.codePoint;
        this.resetSequence();
        result += this.emit(completed);
      }
    }

    // End of input with a sequence still open: an error unless more chunks are coming.
    if (!this.doNotFlush && this.bytesNeeded !== 0) {
      this.resetSequence();
      result += this.replaceOrThrow('Incomplete UTF-8 sequence at end of input');
    }

    return result;
  }

  /** Normalise the supported input shapes to an indexable byte sequence without copying. */
  private static toByteSequence(input: unknown): ArrayLike<number> {
    if (input === null || input === undefined) {
      return [];
    }
    if (ArrayBuffer.isView(input)) {
      // Covers every typed array and DataView; honours byteOffset/byteLength.
      return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    if (input instanceof ArrayBuffer) {
      return new Uint8Array(input);
    }
    if (Array.isArray(input)) {
      return input as number[];
    }
    // Unsupported input is treated as empty, as before.
    return [];
  }

  /** Forget any partially decoded sequence and restore the default continuation range. */
  private resetSequence(): void {
    this.codePoint = 0;
    this.bytesSeen = 0;
    this.bytesNeeded = 0;
    this.lowerBoundary = 0x80;
    this.upperBoundary = 0xbf;
  }

  /**
   * Convert one decoded code point to UTF-16, dropping it if it is the BOM at
   * the very start of the stream and `ignoreBOM` is false.
   */
  private emit(codePoint: number): string {
    if (!this.bomSeen) {
      this.bomSeen = true;
      if (!this.ignoreBOM && codePoint === 0xfeff) {
        return '';
      }
    }

    if (codePoint < 0x10000) {
      return String.fromCharCode(codePoint);
    }
    // Supplementary plane: split into a surrogate pair.
    const offset = codePoint - 0x10000;
    return String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
  }

  /**
   * Handle a decoding error: throw in fatal mode (leaving the decoder reset and
   * reusable), otherwise produce the replacement character.
   */
  private replaceOrThrow(message: string): string {
    if (this.fatal) {
      this.resetSequence();
      this.bomSeen = false;
      this.doNotFlush = false;
      throw new TypeError(message);
    }
    return this.emit(0xfffd);
  }
}