Files
text-encoding/TextDecoderPolyfill.ts
Chris Daßler 3342f7e40b Initial commit: Text encoding component with UTF-8 polyfills
🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-29 14:54:44 +02:00

187 lines
5.4 KiB
TypeScript

/* eslint-disable no-bitwise */
/**
* TextDecoder Polyfill Implementation
*
* React Native compatible UTF-8 text decoder
* Based on the Encoding Standard: https://encoding.spec.whatwg.org/
*
* @module text-encoding@1.0.0
*/
import type {
ITextDecoder,
TextDecodeOptions,
TextDecoderOptions,
} from './interfaces';
/**
 * UTF-8 `TextDecoder` polyfill.
 *
 * Follows the UTF-8 decoder of the WHATWG Encoding Standard:
 * - malformed input is replaced by U+FFFD per "maximal subpart" (or raises a
 *   `TypeError` when `fatal` is set), and the byte that broke a sequence is
 *   re-examined as the possible start of the next one, so no input is lost;
 * - overlong forms, UTF-16 surrogates and code points above U+10FFFF are
 *   rejected through per-lead-byte continuation ranges;
 * - a leading byte order mark is stripped once per stream (unless
 *   `ignoreBOM` is set), not once per `decode()` call.
 */
export class TextDecoderPolyfill implements ITextDecoder {
  /** Labels the Encoding Standard maps to UTF-8 (compared trimmed and lower-cased). */
  private static readonly UTF8_LABELS: ReadonlySet<string> = new Set([
    'unicode-1-1-utf-8',
    'unicode11utf8',
    'unicode20utf8',
    'utf-8',
    'utf8',
    'x-unicode20utf8',
  ]);

  readonly encoding: string;
  readonly fatal: boolean;
  readonly ignoreBOM: boolean;

  /** Bytes of a so-far-valid but unfinished sequence carried over between streaming calls. */
  private pendingBytes: number[] = [];
  /** True once the first code point of the current stream has been produced. */
  private bomSeen = false;

  /**
   * @param label - Encoding label; only UTF-8 labels are accepted.
   * @param options - `fatal` makes malformed input throw instead of being
   *   replaced; `ignoreBOM` keeps a leading U+FEFF in the output.
   * @throws RangeError when `label` does not name UTF-8.
   */
  constructor(label: string = 'utf-8', options: TextDecoderOptions = {}) {
    // Only support UTF-8 for now
    const normalized = label.trim().toLowerCase();
    if (!TextDecoderPolyfill.UTF8_LABELS.has(normalized)) {
      throw new RangeError(`Unsupported encoding: ${label}. Only UTF-8 is supported.`);
    }
    this.encoding = 'utf-8';
    this.fatal = Boolean(options.fatal);
    this.ignoreBOM = Boolean(options.ignoreBOM);
  }

  /**
   * Decode UTF-8 bytes to a string.
   *
   * @param input - Bytes to decode. Any `ArrayBufferView` or `ArrayBuffer` is
   *   read without copying; a plain array of numbers is also tolerated and
   *   coerced to bytes. `null`/`undefined` means "no new bytes".
   * @param options - With `stream: true` an unfinished trailing sequence is
   *   buffered and completed by the next call; otherwise the call ends the
   *   stream and an unfinished sequence is reported as malformed.
   * @returns The decoded text.
   * @throws TypeError when `fatal` is set and the input is malformed. The
   *   decoder state is reset before throwing.
   */
  decode(input?: ArrayBufferView | ArrayBuffer | null, options: TextDecodeOptions = {}): string {
    const streaming = Boolean(options.stream);
    let bytes = TextDecoderPolyfill.toBytes(input);

    // Streaming: put the bytes carried over from the previous call in front.
    if (this.pendingBytes.length > 0) {
      const merged = new Uint8Array(this.pendingBytes.length + bytes.length);
      merged.set(this.pendingBytes, 0);
      merged.set(bytes, this.pendingBytes.length);
      bytes = merged;
      this.pendingBytes = [];
    }

    const end = bytes.length;
    let result = '';
    let i = 0;

    while (i < end) {
      const lead = bytes[i];

      // 1-byte sequence (0xxxxxxx) - ASCII
      if (lead < 0x80) {
        result += this.encodeCodePoint(lead);
        i += 1;
        continue;
      }

      // Number of continuation bytes, payload bits of the lead byte, and the
      // allowed range of the FIRST continuation byte. The narrowed ranges are
      // what exclude overlong forms (E0, F0), surrogates (ED) and code points
      // above U+10FFFF (F4).
      let trailing: number;
      let codePoint: number;
      let lower = 0x80;
      let upper = 0xbf;
      if (lead >= 0xc2 && lead <= 0xdf) {
        // 2-byte sequence; C0/C1 could only encode overlong ASCII
        trailing = 1;
        codePoint = lead & 0x1f;
      } else if (lead >= 0xe0 && lead <= 0xef) {
        // 3-byte sequence
        trailing = 2;
        codePoint = lead & 0x0f;
        if (lead === 0xe0) {
          lower = 0xa0;
        } else if (lead === 0xed) {
          upper = 0x9f;
        }
      } else if (lead >= 0xf0 && lead <= 0xf4) {
        // 4-byte sequence; F5..FF would exceed U+10FFFF
        trailing = 3;
        codePoint = lead & 0x07;
        if (lead === 0xf0) {
          lower = 0x90;
        } else if (lead === 0xf4) {
          upper = 0x8f;
        }
      } else {
        // Stray continuation byte or a byte that can never start a sequence
        result += this.encodeCodePoint(
          this.replacementOrThrow(`Invalid UTF-8 sequence start byte: 0x${lead.toString(16)}`),
        );
        i += 1;
        continue;
      }

      // Consume continuation bytes for as long as they are available and valid.
      let consumed = 1; // bytes of this sequence accepted so far, lead included
      let malformed = false;
      while (consumed <= trailing && i + consumed < end) {
        const next = bytes[i + consumed];
        if (next < lower || next > upper) {
          malformed = true;
          break;
        }
        codePoint = (codePoint << 6) | (next & 0x3f);
        // Only the first continuation byte has a narrowed range.
        lower = 0x80;
        upper = 0xbf;
        consumed += 1;
      }

      if (malformed) {
        result += this.encodeCodePoint(this.replacementOrThrow('Invalid UTF-8 continuation byte'));
        // Skip only the bytes accepted so far; the offending byte is decoded
        // again as the potential start of a new sequence.
        i += consumed;
        continue;
      }

      if (consumed <= trailing) {
        // Input ended in the middle of a sequence that is valid so far.
        if (streaming) {
          // Save pending bytes for next call
          this.pendingBytes = Array.from(bytes.subarray(i));
        } else {
          result += this.encodeCodePoint(
            this.replacementOrThrow('Incomplete UTF-8 sequence at end of input'),
          );
        }
        break;
      }

      result += this.encodeCodePoint(codePoint);
      i += consumed;
    }

    if (!streaming) {
      // End of stream: the next call starts a fresh one, BOM handling included.
      this.bomSeen = false;
    }
    return result;
  }

  /** Returns a zero-copy byte view of `input`, or an empty view when it is absent or unsupported. */
  private static toBytes(input: unknown): Uint8Array {
    if (input == null) {
      return new Uint8Array(0);
    }
    if (ArrayBuffer.isView(input)) {
      return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    if (input instanceof ArrayBuffer) {
      return new Uint8Array(input);
    }
    if (Array.isArray(input)) {
      return Uint8Array.from(input as number[]);
    }
    return new Uint8Array(0);
  }

  /** Returns U+FFFD for malformed input, or resets the decoder and throws when `fatal` is set. */
  private replacementOrThrow(message: string): number {
    if (this.fatal) {
      this.pendingBytes = [];
      this.bomSeen = false;
      throw new TypeError(message);
    }
    return 0xfffd;
  }

  /** Converts one code point to UTF-16, dropping a byte order mark that opens the stream. */
  private encodeCodePoint(codePoint: number): string {
    const isFirst = !this.bomSeen;
    this.bomSeen = true;
    if (isFirst && !this.ignoreBOM && codePoint === 0xfeff) {
      return '';
    }
    if (codePoint < 0x10000) {
      // BMP character
      return String.fromCharCode(codePoint);
    }
    // Supplementary character - use surrogate pair
    const offset = codePoint - 0x10000;
    return String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
  }
}