Files
text-encoding/TextEncoderPolyfill.ts
Chris Daßler 3342f7e40b Initial commit: Text encoding component with UTF-8 polyfills
🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-29 14:54:44 +02:00

180 lines
5.9 KiB
TypeScript

/* eslint-disable no-bitwise */
/**
* TextEncoder Polyfill Implementation
*
* React Native compatible UTF-8 text encoder
* Based on the Encoding Standard: https://encoding.spec.whatwg.org/
*
* @module text-encoding@1.0.0
*/
import type { ITextEncoder, TextEncoderEncodeIntoResult } from './interfaces';
/**
 * UTF-8 text encoder exposing the `TextEncoder` surface (`encoding`,
 * `encode`, `encodeInto`) for runtimes that lack a native implementation.
 *
 * Input strings are read as UTF-16 code units. A valid surrogate pair is
 * combined into one supplementary code point; any unpaired surrogate is
 * encoded as U+FFFD (bytes EF BF BD). Both methods share the same decoding
 * and writing helpers, so they always produce the same bytes for the same
 * input.
 */
export class TextEncoderPolyfill implements ITextEncoder {
  /** Name of the only encoding this encoder produces. */
  readonly encoding = 'utf-8';

  /**
   * Encode a string to a freshly allocated UTF-8 byte array.
   *
   * @param input - Text to encode; defaults to the empty string.
   * @returns The UTF-8 bytes of `input` (empty for an empty string).
   */
  encode(input: string = ''): Uint8Array {
    const bytes: number[] = [];
    let length = 0;
    for (let i = 0; i < input.length; i++) {
      const codePoint = scalarValueAt(input, i);
      if (codePoint > 0xffff) {
        // Only a valid surrogate pair yields a value above the BMP, so the
        // low surrogate at i + 1 has already been consumed.
        i++;
      }
      length = writeUtf8(codePoint, bytes, length);
    }
    return new Uint8Array(bytes);
  }

  /**
   * Encode as much of `source` as fits into `destination`, starting at
   * index 0 of `destination`.
   *
   * Encoding stops before the first code point whose complete UTF-8 sequence
   * would not fit, so a multi-byte sequence is never written partially.
   *
   * @param source - Text to encode.
   * @param destination - Buffer that receives the bytes.
   * @returns `read`: number of UTF-16 code units of `source` consumed (a
   *   surrogate pair counts as 2); `written`: number of bytes stored in
   *   `destination`.
   */
  encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
    const capacity = destination.length;
    let read = 0;
    let written = 0;
    while (read < source.length) {
      const codePoint = scalarValueAt(source, read);
      if (written + utf8Length(codePoint) > capacity) {
        break;
      }
      written = writeUtf8(codePoint, destination, written);
      // A supplementary code point came from two code units; consuming both
      // here keeps the low surrogate from being re-read as a lone surrogate.
      read += codePoint > 0xffff ? 2 : 1;
    }
    return { read, written };
  }
}

/**
 * Decode the Unicode scalar value that starts at `index` in `text`.
 *
 * @param text - String to read from.
 * @param index - Position of the code unit to decode; must be within bounds.
 * @returns The code unit itself when it is not a surrogate, the combined
 *   code point (0x10000-0x10FFFF) when `index` starts a valid surrogate pair,
 *   or 0xFFFD for an unpaired high or low surrogate.
 */
function scalarValueAt(text: string, index: number): number {
  const unit = text.charCodeAt(index);
  if (unit < 0xd800 || unit > 0xdfff) {
    return unit;
  }
  // Only a high surrogate followed by a low surrogate forms a valid pair.
  if (unit <= 0xdbff && index + 1 < text.length) {
    const low = text.charCodeAt(index + 1);
    if (low >= 0xdc00 && low <= 0xdfff) {
      return 0x10000 + ((unit - 0xd800) << 10) + (low - 0xdc00);
    }
  }
  // U+FFFD REPLACEMENT CHARACTER; its UTF-8 form is EF BF BD.
  return 0xfffd;
}

/**
 * Number of UTF-8 bytes needed for a scalar value.
 *
 * @param codePoint - Scalar value as returned by `scalarValueAt`.
 * @returns 1, 2, 3 or 4.
 */
function utf8Length(codePoint: number): number {
  if (codePoint < 0x80) {
    return 1;
  }
  if (codePoint < 0x800) {
    return 2;
  }
  return codePoint < 0x10000 ? 3 : 4;
}

/**
 * Store the UTF-8 encoding of `codePoint` into `target` starting at `offset`.
 *
 * `target` is only written by numeric index, so it may be a typed array
 * (the caller must ensure the bytes fit) or a plain array (which grows when
 * `offset` equals its current length).
 *
 * @param codePoint - Scalar value as returned by `scalarValueAt`.
 * @param target - Indexable byte sink.
 * @param offset - Index of the first byte to write.
 * @returns The index immediately after the last byte written.
 */
function writeUtf8(
  codePoint: number,
  target: { [index: number]: number },
  offset: number,
): number {
  let at = offset;
  if (codePoint < 0x80) {
    // 1-byte sequence (ASCII)
    target[at++] = codePoint;
  } else if (codePoint < 0x800) {
    // 2-byte sequence
    target[at++] = 0xc0 | (codePoint >> 6);
    target[at++] = 0x80 | (codePoint & 0x3f);
  } else if (codePoint < 0x10000) {
    // 3-byte sequence
    target[at++] = 0xe0 | (codePoint >> 12);
    target[at++] = 0x80 | ((codePoint >> 6) & 0x3f);
    target[at++] = 0x80 | (codePoint & 0x3f);
  } else {
    // 4-byte sequence (supplementary planes, up to 0x10FFFF)
    target[at++] = 0xf0 | (codePoint >> 18);
    target[at++] = 0x80 | ((codePoint >> 12) & 0x3f);
    target[at++] = 0x80 | ((codePoint >> 6) & 0x3f);
    target[at++] = 0x80 | (codePoint & 0x3f);
  }
  return at;
}