Initial commit: Text encoding component with UTF-8 polyfills
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
187
TextDecoderPolyfill.ts
Normal file
187
TextDecoderPolyfill.ts
Normal file
@@ -0,0 +1,187 @@
|
||||
/* eslint-disable no-bitwise */
|
||||
/**
|
||||
* TextDecoder Polyfill Implementation
|
||||
*
|
||||
* React Native compatible UTF-8 text decoder
|
||||
* Based on the Encoding Standard: https://encoding.spec.whatwg.org/
|
||||
*
|
||||
* @module text-encoding@1.0.0
|
||||
*/
|
||||
|
||||
import type {
|
||||
ITextDecoder,
|
||||
TextDecodeOptions,
|
||||
TextDecoderOptions,
|
||||
} from './interfaces';
|
||||
|
||||
/**
 * UTF-8 only `TextDecoder` replacement for runtimes that lack the native class.
 *
 * Decoding follows the WHATWG Encoding Standard's UTF-8 decoder:
 * - every malformed "maximal subpart" becomes exactly one U+FFFD (or a
 *   `TypeError` when `fatal` is set) and decoding resumes at the offending byte;
 * - overlong forms, UTF-16 surrogates and code points above U+10FFFF are
 *   rejected at the byte level via per-lead-byte continuation ranges;
 * - a leading byte order mark is removed once per stream unless `ignoreBOM`
 *   is set;
 * - with `{ stream: true }` an incomplete trailing sequence is carried over to
 *   the next `decode()` call.
 */
export class TextDecoderPolyfill implements ITextDecoder {
  /** Labels the Encoding Standard maps to UTF-8 (compared after trimming and lower-casing). */
  private static readonly UTF8_LABELS: readonly string[] = [
    'unicode-1-1-utf-8',
    'unicode11utf8',
    'unicode20utf8',
    'utf-8',
    'utf8',
    'x-unicode20utf8',
  ];

  /** Always `'utf-8'`; no other encoding is implemented. */
  readonly encoding: string;
  /** When true, malformed input throws a `TypeError` instead of yielding U+FFFD. */
  readonly fatal: boolean;
  /** When true, a leading byte order mark is kept in the output as U+FEFF. */
  readonly ignoreBOM: boolean;

  /** Bytes of an incomplete trailing sequence carried over from a streaming call. */
  private pendingBytes: number[] = [];
  /** True once the first code point of the current stream has been handled. */
  private bomSeen = false;
  /** True while the previous call used `{ stream: true }`, i.e. state must be kept. */
  private doNotFlush = false;

  /**
   * @param label - Encoding label. Any UTF-8 label from the Encoding Standard
   *   is accepted, case-insensitively and with surrounding ASCII whitespace.
   * @param options - `fatal` and `ignoreBOM` flags; both default to false.
   * @throws RangeError if `label` does not name UTF-8.
   */
  constructor(label: string = 'utf-8', options: TextDecoderOptions = {}) {
    // The standard strips ASCII whitespace only, hence the explicit character class.
    const normalized = String(label)
      .replace(/^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '')
      .toLowerCase();

    if (TextDecoderPolyfill.UTF8_LABELS.indexOf(normalized) === -1) {
      throw new RangeError(`Unsupported encoding: ${label}. Only UTF-8 is supported.`);
    }

    this.encoding = 'utf-8';
    this.fatal = Boolean(options.fatal);
    this.ignoreBOM = Boolean(options.ignoreBOM);
  }

  /**
   * Decode UTF-8 bytes into a string.
   *
   * @param input - Bytes to decode. Any `ArrayBufferView` (typed array or
   *   `DataView`) or an `ArrayBuffer`; `null`/`undefined` means "no new bytes".
   * @param options - Pass `{ stream: true }` when more chunks will follow, so
   *   an incomplete trailing sequence is buffered instead of being treated as
   *   malformed. A later call without `stream` flushes the decoder.
   * @returns The decoded text for the bytes consumed by this call.
   * @throws TypeError if `fatal` is set and the input is not valid UTF-8.
   */
  decode(input?: ArrayBufferView | ArrayBuffer | null, options: TextDecodeOptions = {}): string {
    // A previous non-streaming call ended its stream: start from a clean state.
    if (!this.doNotFlush) {
      this.pendingBytes = [];
      this.bomSeen = false;
    }
    const streaming = Boolean(options.stream);
    this.doNotFlush = streaming;

    const incoming = TextDecoderPolyfill.toBytes(input);
    let bytes: Uint8Array = incoming;
    if (this.pendingBytes.length > 0) {
      // Re-decode the carried-over partial sequence together with the new chunk.
      bytes = new Uint8Array(this.pendingBytes.length + incoming.length);
      bytes.set(this.pendingBytes, 0);
      bytes.set(incoming, this.pendingBytes.length);
      this.pendingBytes = [];
    }

    let result = '';

    const emit = (codePoint: number): void => {
      if (!this.bomSeen) {
        // Only the very first code point of a stream can be a byte order mark.
        this.bomSeen = true;
        if (!this.ignoreBOM && codePoint === 0xfeff) {
          return;
        }
      }
      if (codePoint < 0x10000) {
        result += String.fromCharCode(codePoint);
      } else {
        // Supplementary plane: split into a UTF-16 surrogate pair.
        const offset = codePoint - 0x10000;
        result += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
      }
    };

    const fail = (message: string): void => {
      if (this.fatal) {
        throw new TypeError(message);
      }
      emit(0xfffd);
    };

    let i = 0;
    while (i < bytes.length) {
      const lead = bytes[i];

      if (lead <= 0x7f) {
        // 1-byte sequence (0xxxxxxx) - ASCII
        emit(lead);
        i++;
        continue;
      }

      let needed: number;
      let codePoint: number;
      // Allowed range for the NEXT continuation byte. Narrowing it for the
      // first continuation byte is what rejects overlong forms (E0, F0),
      // surrogates (ED) and values above U+10FFFF (F4).
      let lower = 0x80;
      let upper = 0xbf;

      if (lead >= 0xc2 && lead <= 0xdf) {
        // 2-byte sequence; C0 and C1 could only start overlong forms.
        needed = 1;
        codePoint = lead & 0x1f;
      } else if (lead >= 0xe0 && lead <= 0xef) {
        // 3-byte sequence
        needed = 2;
        codePoint = lead & 0x0f;
        if (lead === 0xe0) {
          lower = 0xa0;
        } else if (lead === 0xed) {
          upper = 0x9f;
        }
      } else if (lead >= 0xf0 && lead <= 0xf4) {
        // 4-byte sequence; F5..FF would exceed U+10FFFF.
        needed = 3;
        codePoint = lead & 0x07;
        if (lead === 0xf0) {
          lower = 0x90;
        } else if (lead === 0xf4) {
          upper = 0x8f;
        }
      } else {
        // Stray continuation byte or a byte that never appears in UTF-8.
        fail(`Invalid UTF-8 sequence start byte: 0x${lead.toString(16)}`);
        i++;
        continue;
      }

      // Consume as many valid continuation bytes as are available.
      let seen = 0;
      while (seen < needed && i + 1 + seen < bytes.length) {
        const next = bytes[i + 1 + seen];
        if (next < lower || next > upper) {
          break;
        }
        codePoint = (codePoint << 6) | (next & 0x3f);
        lower = 0x80;
        upper = 0xbf;
        seen++;
      }

      if (seen === needed) {
        emit(codePoint);
        i += needed + 1;
        continue;
      }

      if (i + 1 + seen >= bytes.length) {
        // Input ended in the middle of an otherwise valid sequence.
        if (streaming) {
          this.pendingBytes = Array.from(bytes.subarray(i));
        } else {
          fail('Incomplete UTF-8 sequence at end of input');
        }
        break;
      }

      // Invalid continuation byte: replace the bytes consumed so far with a
      // single U+FFFD and resume AT the offending byte so it is not lost.
      fail('Invalid UTF-8 continuation byte');
      i += 1 + seen;
    }

    return result;
  }

  /**
   * View the caller's input as bytes without copying where possible.
   * Unsupported values yield an empty array, matching the lenient handling
   * this polyfill has always had.
   */
  private static toBytes(input: unknown): Uint8Array {
    if (input == null) {
      return new Uint8Array(0);
    }
    if (ArrayBuffer.isView(input)) {
      // Honour byteOffset/byteLength so sub-views and DataViews decode correctly.
      return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    if (input instanceof ArrayBuffer) {
      return new Uint8Array(input);
    }
    if (Array.isArray(input)) {
      // Plain number arrays are outside the declared signature but were
      // accepted historically; values are coerced to bytes.
      return Uint8Array.from(input as number[]);
    }
    return new Uint8Array(0);
  }
}
|
||||
Reference in New Issue
Block a user