Initial commit: Text encoding component with UTF-8 polyfills
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
180
TextEncoderPolyfill.ts
Normal file
180
TextEncoderPolyfill.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
/* eslint-disable no-bitwise */
|
||||
/**
|
||||
* TextEncoder Polyfill Implementation
|
||||
*
|
||||
* React Native compatible UTF-8 text encoder
|
||||
* Based on the Encoding Standard: https://encoding.spec.whatwg.org/
|
||||
*
|
||||
* @module text-encoding@1.0.0
|
||||
*/
|
||||
|
||||
import type { ITextEncoder, TextEncoderEncodeIntoResult } from './interfaces';
|
||||
|
||||
/**
 * UTF-8 `TextEncoder` polyfill.
 *
 * Converts JavaScript (UTF-16) strings to UTF-8 bytes. Unpaired surrogates are
 * encoded as U+FFFD REPLACEMENT CHARACTER (bytes `EF BF BD`).
 *
 * `encode` and `encodeInto` share the same decoding and byte-layout helpers so
 * that both always produce identical bytes for the same input.
 */
export class TextEncoderPolyfill implements ITextEncoder {
  readonly encoding = 'utf-8';

  /**
   * Encode a string to a newly allocated array of UTF-8 bytes.
   *
   * @param input - Text to encode; defaults to the empty string.
   * @returns The UTF-8 bytes of `input`.
   */
  encode(input: string = ''): Uint8Array {
    const bytes: number[] = [];
    let written = 0;
    let index = 0;

    while (index < input.length) {
      const codePoint = TextEncoderPolyfill.scalarValueAt(input, index);
      written = TextEncoderPolyfill.writeUtf8(codePoint, bytes, written);
      index += TextEncoderPolyfill.unitsConsumed(codePoint);
    }

    return new Uint8Array(bytes);
  }

  /**
   * Encode as much of `source` as fits into `destination`, starting at
   * offset 0.
   *
   * A character is written only if all of its bytes fit, so the output never
   * ends in a truncated UTF-8 sequence.
   *
   * @param source - Text to encode.
   * @param destination - Buffer that receives the UTF-8 bytes.
   * @returns `read`: number of UTF-16 code units of `source` consumed (a
   *   surrogate pair counts as 2); `written`: number of bytes stored in
   *   `destination`.
   */
  encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
    const capacity = destination.length;
    let read = 0;
    let written = 0;

    while (read < source.length) {
      const codePoint = TextEncoderPolyfill.scalarValueAt(source, read);

      // Stop before a character that would not fit completely.
      if (written + TextEncoderPolyfill.utf8Length(codePoint) > capacity) {
        break;
      }

      written = TextEncoderPolyfill.writeUtf8(codePoint, destination, written);
      // Advance past BOTH halves of a surrogate pair; otherwise the low
      // surrogate would be re-read as a lone surrogate and emitted as U+FFFD.
      read += TextEncoderPolyfill.unitsConsumed(codePoint);
    }

    return { read, written };
  }

  /**
   * Decode the Unicode scalar value that starts at `index`.
   *
   * Returns the combined code point for a valid surrogate pair, U+FFFD for an
   * unpaired surrogate, and the code unit itself otherwise.
   */
  private static scalarValueAt(text: string, index: number): number {
    const unit = text.charCodeAt(index);

    if (unit < 0xd800 || unit > 0xdfff) {
      return unit; // Not a surrogate: the code unit is the code point.
    }

    // Only a high surrogate immediately followed by a low surrogate is valid.
    if (unit <= 0xdbff && index + 1 < text.length) {
      const next = text.charCodeAt(index + 1);
      if (next >= 0xdc00 && next <= 0xdfff) {
        return 0x10000 + ((unit - 0xd800) << 10) + (next - 0xdc00);
      }
    }

    return 0xfffd; // Unpaired high or low surrogate.
  }

  /**
   * Number of UTF-16 code units that produced `codePoint`: 2 for a surrogate
   * pair (code point above U+FFFF), 1 for everything else, including the
   * U+FFFD substituted for an unpaired surrogate.
   */
  private static unitsConsumed(codePoint: number): number {
    return codePoint > 0xffff ? 2 : 1;
  }

  /** Number of UTF-8 bytes needed to encode `codePoint`. */
  private static utf8Length(codePoint: number): number {
    if (codePoint < 0x80) {
      return 1;
    }
    if (codePoint < 0x800) {
      return 2;
    }
    return codePoint < 0x10000 ? 3 : 4;
  }

  /**
   * Store the UTF-8 encoding of `codePoint` in `out` starting at `offset`.
   *
   * The caller must ensure a `Uint8Array` target has enough room; a plain
   * array grows as needed when `offset` equals its length.
   *
   * @returns The offset just past the last byte written.
   */
  private static writeUtf8(codePoint: number, out: number[] | Uint8Array, offset: number): number {
    let pos = offset;

    if (codePoint < 0x80) {
      out[pos++] = codePoint;
    } else if (codePoint < 0x800) {
      out[pos++] = 0xc0 | (codePoint >> 6);
      out[pos++] = 0x80 | (codePoint & 0x3f);
    } else if (codePoint < 0x10000) {
      out[pos++] = 0xe0 | (codePoint >> 12);
      out[pos++] = 0x80 | ((codePoint >> 6) & 0x3f);
      out[pos++] = 0x80 | (codePoint & 0x3f);
    } else {
      out[pos++] = 0xf0 | (codePoint >> 18);
      out[pos++] = 0x80 | ((codePoint >> 12) & 0x3f);
      out[pos++] = 0x80 | ((codePoint >> 6) & 0x3f);
      out[pos++] = 0x80 | (codePoint & 0x3f);
    }

    return pos;
  }
}
|
||||
Reference in New Issue
Block a user