🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
180 lines
5.9 KiB
TypeScript
180 lines
5.9 KiB
TypeScript
/* eslint-disable no-bitwise */
|
|
/**
 * TextEncoder Polyfill Implementation
 *
 * React Native compatible UTF-8 text encoder
 * Based on the Encoding Standard: https://encoding.spec.whatwg.org/
 *
 * @module text-encoding@1.0.0
 */
|
|
|
|
import type { ITextEncoder, TextEncoderEncodeIntoResult } from './interfaces';
|
|
|
|
/**
 * UTF-8 text encoder exposing `encoding`, `encode` and `encodeInto`.
 *
 * Strings are walked one UTF-16 code unit at a time. A high surrogate
 * immediately followed by a low surrogate is combined into a single
 * supplementary code point; any other surrogate (unpaired high, or lone low)
 * is encoded as U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD).
 */
export class TextEncoderPolyfill implements ITextEncoder {
  /** Name of the encoding produced by this encoder; always `'utf-8'`. */
  readonly encoding = 'utf-8';

  /**
   * Encode a string to UTF-8 bytes.
   *
   * @param input - Text to encode. Defaults to the empty string.
   * @returns A newly allocated `Uint8Array` holding the UTF-8 bytes of `input`.
   */
  encode(input: string = ''): Uint8Array {
    const bytes: number[] = [];

    for (let i = 0; i < input.length; i++) {
      let codePoint = input.charCodeAt(i);

      if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
        // Only a high surrogate followed by another code unit can start a pair.
        // `0` is a sentinel that fails the low-surrogate range check below.
        const low =
          codePoint <= 0xdbff && i + 1 < input.length ? input.charCodeAt(i + 1) : 0;

        if (low >= 0xdc00 && low <= 0xdfff) {
          codePoint = 0x10000 + ((codePoint - 0xd800) << 10) + (low - 0xdc00);
          i++; // The low surrogate has been consumed as part of the pair.
        } else {
          // Unpaired surrogate: substitute U+FFFD. The following code unit (if
          // any) is NOT consumed and is encoded on the next iteration.
          codePoint = 0xfffd;
        }
      }

      // At this point codePoint is a scalar value in [0, 0x10FFFF].
      if (codePoint < 0x80) {
        bytes.push(codePoint);
      } else if (codePoint < 0x800) {
        bytes.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
      } else if (codePoint < 0x10000) {
        bytes.push(
          0xe0 | (codePoint >> 12),
          0x80 | ((codePoint >> 6) & 0x3f),
          0x80 | (codePoint & 0x3f),
        );
      } else {
        bytes.push(
          0xf0 | (codePoint >> 18),
          0x80 | ((codePoint >> 12) & 0x3f),
          0x80 | ((codePoint >> 6) & 0x3f),
          0x80 | (codePoint & 0x3f),
        );
      }
    }

    return new Uint8Array(bytes);
  }

  /**
   * Encode a string into an existing `Uint8Array`, starting at index 0.
   *
   * Encoding stops at the first code point whose complete UTF-8 sequence does
   * not fit in the remaining space, so a multi-byte sequence is never written
   * partially.
   *
   * @param source - Text to encode.
   * @param destination - Buffer that receives the UTF-8 bytes.
   * @returns `read`: number of UTF-16 code units of `source` consumed (a valid
   *   surrogate pair counts as 2); `written`: number of bytes stored in
   *   `destination`.
   */
  encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
    const capacity = destination.length;
    let read = 0;
    let written = 0;

    while (read < source.length) {
      let codePoint = source.charCodeAt(read);
      let unitsConsumed = 1;

      if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
        // Only a high surrogate followed by another code unit can start a pair.
        // `0` is a sentinel that fails the low-surrogate range check below.
        const low =
          codePoint <= 0xdbff && read + 1 < source.length ? source.charCodeAt(read + 1) : 0;

        if (low >= 0xdc00 && low <= 0xdfff) {
          codePoint = 0x10000 + ((codePoint - 0xd800) << 10) + (low - 0xdc00);
          // Consume both halves together so the low surrogate is never
          // revisited and mis-encoded as a lone surrogate.
          unitsConsumed = 2;
        } else {
          // Unpaired surrogate: substitute U+FFFD (3 bytes), consume one unit.
          codePoint = 0xfffd;
        }
      }

      // Size of the UTF-8 sequence for a scalar value in [0, 0x10FFFF].
      let bytesNeeded = 4;
      if (codePoint < 0x80) {
        bytesNeeded = 1;
      } else if (codePoint < 0x800) {
        bytesNeeded = 2;
      } else if (codePoint < 0x10000) {
        bytesNeeded = 3;
      }

      // Never emit a truncated sequence: stop before the code point that
      // does not fit. This also ends the loop once the buffer is full.
      if (written + bytesNeeded > capacity) {
        break;
      }

      if (bytesNeeded === 1) {
        destination[written++] = codePoint;
      } else if (bytesNeeded === 2) {
        destination[written++] = 0xc0 | (codePoint >> 6);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else if (bytesNeeded === 3) {
        destination[written++] = 0xe0 | (codePoint >> 12);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      } else {
        destination[written++] = 0xf0 | (codePoint >> 18);
        destination[written++] = 0x80 | ((codePoint >> 12) & 0x3f);
        destination[written++] = 0x80 | ((codePoint >> 6) & 0x3f);
        destination[written++] = 0x80 | (codePoint & 0x3f);
      }

      read += unitsConsumed;
    }

    return { read, written };
  }
}