🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
187 lines
5.4 KiB
TypeScript
187 lines
5.4 KiB
TypeScript
/* eslint-disable no-bitwise */
|
|
/**
 * TextDecoder Polyfill Implementation
 *
 * React Native compatible UTF-8 text decoder.
 * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
 *
 * @module text-encoding@1.0.0
 */
|
|
|
|
import type {
|
|
ITextDecoder,
|
|
TextDecodeOptions,
|
|
TextDecoderOptions,
|
|
} from './interfaces';
|
|
|
|
/**
 * Streaming UTF-8 decoder that mirrors the WHATWG `TextDecoder` API.
 *
 * The decoder is a small state machine that follows the "UTF-8 decoder"
 * algorithm of the Encoding Standard: it consumes one byte at a time and keeps
 * the partially decoded code point between `decode()` calls, so multi-byte
 * sequences (and the BOM) may be split across streamed chunks.
 *
 * Ill-formed input is replaced with U+FFFD, or reported with a `TypeError`
 * when the decoder was created with `fatal: true`.
 */
export class TextDecoderPolyfill implements ITextDecoder {
  /** Labels the Encoding Standard maps to UTF-8 (matched after trimming and lower-casing). */
  private static readonly UTF8_LABELS: readonly string[] = [
    'unicode-1-1-utf-8',
    'unicode11utf8',
    'unicode20utf8',
    'utf-8',
    'utf8',
    'x-unicode20utf8',
  ];

  /** Canonical encoding name; always `'utf-8'`. */
  readonly encoding: string;

  /** When true, ill-formed input throws a `TypeError` instead of producing U+FFFD. */
  readonly fatal: boolean;

  /** When true, a leading byte order mark is kept in the output instead of being stripped. */
  readonly ignoreBOM: boolean;

  /** Bits of the code point accumulated so far for the sequence in progress. */
  private codePoint = 0;

  /** Number of continuation bytes the current lead byte requires (0 = no sequence in progress). */
  private bytesNeeded = 0;

  /** Number of continuation bytes already consumed for the current sequence. */
  private bytesSeen = 0;

  /** Inclusive lower bound for the next continuation byte (tightened to reject overlong forms). */
  private lowerBoundary = 0x80;

  /** Inclusive upper bound for the next continuation byte (tightened to reject surrogates and > U+10FFFF). */
  private upperBoundary = 0xbf;

  /** True once the first code point of the current stream has been emitted (BOM handling is done). */
  private bomSeen = false;

  /** True while a stream is open, i.e. the previous `decode()` call used `stream: true`. */
  private doNotFlush = false;

  /**
   * @param label - Encoding label; only UTF-8 and its standard aliases are accepted.
   *   Leading/trailing ASCII whitespace and letter case are ignored.
   * @param options - `fatal` and `ignoreBOM` flags; both default to `false`.
   * @throws RangeError if `label` does not name UTF-8.
   */
  constructor(label: string = 'utf-8', options: TextDecoderOptions = {}) {
    // Only support UTF-8 for now
    const normalized = label.replace(/^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '').toLowerCase();
    if (!TextDecoderPolyfill.UTF8_LABELS.includes(normalized)) {
      throw new RangeError(`Unsupported encoding: ${label}. Only UTF-8 is supported.`);
    }

    this.encoding = 'utf-8';
    this.fatal = Boolean(options.fatal);
    this.ignoreBOM = Boolean(options.ignoreBOM);
  }

  /**
   * Decode UTF-8 bytes to a string.
   *
   * @param input - Bytes to decode. Any `ArrayBufferView`, an `ArrayBuffer`, or
   *   (for backward compatibility) a plain array of byte values. `null`/`undefined`
   *   decode as empty input, which is how a stream is flushed.
   * @param options - Pass `stream: true` when more chunks will follow; an
   *   incomplete trailing sequence is then kept for the next call instead of
   *   being treated as an error.
   * @returns The decoded text. A leading BOM is removed unless `ignoreBOM` is set.
   * @throws TypeError in `fatal` mode when the input is not well-formed UTF-8.
   */
  decode(input?: ArrayBufferView | ArrayBuffer | null, options: TextDecodeOptions = {}): string {
    // A call that does not continue a stream starts from a clean decoder.
    if (!this.doNotFlush) {
      this.resetSequence();
      this.bomSeen = false;
    }
    this.doNotFlush = Boolean(options.stream);

    const bytes = TextDecoderPolyfill.toByteSequence(input);
    let result = '';

    for (let i = 0; i < bytes.length; i++) {
      // Masking is a no-op for typed arrays and keeps plain-array values in byte range.
      const byte = bytes[i] & 0xff;

      if (this.bytesNeeded !== 0) {
        if (byte >= this.lowerBoundary && byte <= this.upperBoundary) {
          // Valid continuation byte: only the first one has tightened bounds.
          this.lowerBoundary = 0x80;
          this.upperBoundary = 0xbf;
          this.codePoint = (this.codePoint << 6) | (byte & 0x3f);
          this.bytesSeen++;
          if (this.bytesSeen === this.bytesNeeded) {
            const completed = this.codePoint;
            this.resetSequence();
            result += this.emit(completed);
          }
          continue;
        }

        // The sequence in progress is ill-formed. Report it, then fall through so
        // this byte is re-examined as the start of a new sequence rather than lost.
        this.resetSequence();
        result += this.emit(this.replacementOrThrow('Invalid UTF-8 continuation byte'));
      }

      if (byte <= 0x7f) {
        // 1-byte sequence (0xxxxxxx) - ASCII
        result += this.emit(byte);
      } else if (byte >= 0xc2 && byte <= 0xdf) {
        // 2-byte sequence; 0xC0/0xC1 are excluded because they can only encode overlong forms.
        this.bytesNeeded = 1;
        this.codePoint = byte & 0x1f;
      } else if (byte >= 0xe0 && byte <= 0xef) {
        // 3-byte sequence
        if (byte === 0xe0) {
          this.lowerBoundary = 0xa0; // below this the encoding would be overlong
        } else if (byte === 0xed) {
          this.upperBoundary = 0x9f; // above this lies the surrogate range U+D800..U+DFFF
        }
        this.bytesNeeded = 2;
        this.codePoint = byte & 0x0f;
      } else if (byte >= 0xf0 && byte <= 0xf4) {
        // 4-byte sequence
        if (byte === 0xf0) {
          this.lowerBoundary = 0x90; // below this the encoding would be overlong
        } else if (byte === 0xf4) {
          this.upperBoundary = 0x8f; // above this the code point would exceed U+10FFFF
        }
        this.bytesNeeded = 3;
        this.codePoint = byte & 0x07;
      } else {
        // Stray continuation byte or a byte that can never start a sequence.
        result += this.emit(
          this.replacementOrThrow(`Invalid UTF-8 sequence start byte: 0x${byte.toString(16)}`),
        );
      }
    }

    // End of the stream with a sequence still open: the input was truncated.
    if (!this.doNotFlush && this.bytesNeeded !== 0) {
      this.resetSequence();
      result += this.emit(this.replacementOrThrow('Incomplete UTF-8 sequence at end of input'));
    }

    return result;
  }

  /** Exposes the supported input kinds as an indexable byte sequence without copying typed data. */
  private static toByteSequence(input: unknown): ArrayLike<number> {
    if (input == null) {
      return [];
    }
    if (ArrayBuffer.isView(input)) {
      // Covers every typed array and DataView, including views over a SharedArrayBuffer.
      return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    if (input instanceof ArrayBuffer) {
      return new Uint8Array(input);
    }
    // Buffers created in another realm fail `instanceof`; fall back to the brand check.
    const tag = Object.prototype.toString.call(input);
    if (tag === '[object ArrayBuffer]' || tag === '[object SharedArrayBuffer]') {
      return new Uint8Array(input as ArrayBuffer);
    }
    if (Array.isArray(input)) {
      return input as number[];
    }
    // Unsupported input decodes as empty rather than throwing.
    return [];
  }

  /** Clears the partially decoded sequence and restores the default continuation-byte bounds. */
  private resetSequence(): void {
    this.codePoint = 0;
    this.bytesNeeded = 0;
    this.bytesSeen = 0;
    this.lowerBoundary = 0x80;
    this.upperBoundary = 0xbf;
  }

  /** Returns U+FFFD for a decoding error, or throws a `TypeError` in fatal mode. */
  private replacementOrThrow(message: string): number {
    if (this.fatal) {
      // Abandon the stream so the next call starts from a clean state.
      this.doNotFlush = false;
      throw new TypeError(message);
    }
    return 0xfffd;
  }

  /** Converts one code point to UTF-16, dropping a BOM that is the first code point of the stream. */
  private emit(codePoint: number): string {
    if (!this.bomSeen) {
      this.bomSeen = true;
      if (!this.ignoreBOM && codePoint === 0xfeff) {
        return '';
      }
    }

    if (codePoint < 0x10000) {
      // BMP character
      return String.fromCharCode(codePoint);
    }

    // Supplementary character - use surrogate pair
    const offset = codePoint - 0x10000;
    return String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
  }
}