[libc][wctype] Create generation script for classification lookup tables (#172042)

[#172040](https://github.com/llvm/llvm-project/issues/172040)

This patch implements the scripts for generating the lookup tables and
associated utils for the wctype classification functions. Not all Unicode
properties are covered, because not all of them need a lookup table; the rest
will be hardcoded. The size of the generated tables is 47.8 KB.
This commit is contained in:
Marcell Leleszi
2026-01-06 12:13:28 +01:00
committed by GitHub
parent 2bfb984a7c
commit 9373dbdc00
11 changed files with 4630 additions and 0 deletions

3
libc/.gitignore vendored
View File

@@ -1,3 +1,6 @@
# Sphinx documentation
docs/_build/
build/
# Unicode data used for wctype functions
UnicodeData.txt

View File

@@ -119,6 +119,10 @@ function(_get_compile_options_from_config output_var)
list(APPEND config_options "-DLIBC_TRAP_ON_RAISE_FP_EXCEPT")
endif()
if(LIBC_CONF_WCTYPE_MODE)
list(APPEND config_options "-DLIBC_CONF_WCTYPE_MODE=${LIBC_CONF_WCTYPE_MODE}")
endif()
if(LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT)
list(APPEND config_options "-DLIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT=${LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT}")
endif()

View File

@@ -413,6 +413,7 @@ add_subdirectory(time)
# Therefore, cannot currently build this on macos in overlay mode
if(NOT (LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
add_subdirectory(wctype)
endif()
add_subdirectory(math)

View File

@@ -0,0 +1,12 @@
# Header-only library exposing wctype_classification_utils.h. That header is
# produced by the generator scripts under libc/utils/wctype_utils and should
# not be edited by hand. The DEPENDS list mirrors the headers it includes.
add_header_library(
  wctype_classification_utils
  HDRS
    wctype_classification_utils.h
  DEPENDS
    libc.hdr.types.wchar_t
    libc.hdr.stdint_proxy
    libc.src.__support.macros.attributes
    libc.src.__support.macros.config
    libc.src.__support.CPP.limits
    libc.src.__support.libc_assert
)

File diff suppressed because it is too large Load Diff

View File

@@ -293,4 +293,5 @@ add_subdirectory(threads)
# Cannot currently build this on MacOS in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
add_subdirectory(wctype)
endif()

View File

@@ -0,0 +1,13 @@
# Umbrella target for the wctype support tests.
# NOTE(review): the test below is registered in the `libc-support-tests` suite,
# not in `libc-support-wctype-tests`, so this custom target currently has no
# tests attached to it -- confirm which suite is intended.
add_custom_target(libc-support-wctype-tests)

# Unit test for the generated wctype classification lookup tables.
add_libc_test(
  wctype_classification_utils_test
  SUITE
    libc-support-tests
  SRCS
    wctype_classification_utils_test.cpp
  DEPENDS
    libc.hdr.stdint_proxy
    libc.src.__support.wctype.wctype_classification_utils
)

View File

@@ -0,0 +1,540 @@
//===-- Unittests for wctype classification utils -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "hdr/stdint_proxy.h"
#include "src/__support/wctype/wctype_classification_utils.h"
#include "test/UnitTest/Test.h"
namespace {
// Some platforms (like Windows) have a 16-bit wchar_t. We guard the cases that
// do not fit within 16 bits to prevent narrowing conversions and incorrect test
// results.
// One table-driven expectation: whether the property bit examined by a test is
// set for the code point `wc`.
struct TestCase {
  uint32_t wc;      // Code point under test (cast to wchar_t before lookup).
  const char *name; // Character name, streamed into the failure message.
  bool expected;    // Expected state of the property bit being tested.
};
// Checks that the LOWER property bit is reported only for lowercase letters.
TEST(LlvmLibcWctypeClassificationUtilsTest, Lower) {
  const TestCase kCases[] = {
      // ASCII lowercase
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x007A, "LATIN SMALL LETTER Z", true},
      // ASCII uppercase
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x005A, "LATIN CAPITAL LETTER Z", false},
      // ASCII non-letters
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      // Latin Extended lowercase
      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
      // Latin Extended uppercase
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", false},
      // Greek lowercase
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x03C9, "GREEK SMALL LETTER OMEGA", true},
      // Greek uppercase
      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
      {0x03A9, "GREEK CAPITAL LETTER OMEGA", false},
      // Cyrillic lowercase
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x044F, "CYRILLIC SMALL LETTER YA", true},
      // Cyrillic uppercase
      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
      {0x042F, "CYRILLIC CAPITAL LETTER YA", false},
      // Caseless scripts
      {0x05D0, "HEBREW LETTER ALEF", false},
      {0x0627, "ARABIC LETTER ALEF", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &test_case : kCases) {
    const uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
    const bool is_lower = (props & LIBC_NAMESPACE::PropertyFlag::LOWER) != 0;
    EXPECT_EQ(is_lower, test_case.expected) << test_case.name << "\n";
  }
}
// Checks that the UPPER property bit is reported for uppercase and titlecase
// letters and for nothing else.
TEST(LlvmLibcWctypeClassificationUtilsTest, Upper) {
  const TestCase kCases[] = {
      // ASCII lowercase
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x007A, "LATIN SMALL LETTER Z", false},
      // ASCII uppercase
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x005A, "LATIN CAPITAL LETTER Z", true},
      // ASCII non-letters
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      // Titlecase
      {0x01C5, "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON", true},
      // Latin Extended lowercase
      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", false},
      // Latin Extended uppercase
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", true},
      // Greek lowercase
      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
      {0x03C9, "GREEK SMALL LETTER OMEGA", false},
      // Greek uppercase
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03A9, "GREEK CAPITAL LETTER OMEGA", true},
      // Cyrillic lowercase
      {0x0430, "CYRILLIC SMALL LETTER A", false},
      {0x044F, "CYRILLIC SMALL LETTER YA", false},
      // Cyrillic uppercase
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x042F, "CYRILLIC CAPITAL LETTER YA", true},
      // Caseless scripts
      {0x05D0, "HEBREW LETTER ALEF", false},
      {0x0627, "ARABIC LETTER ALEF", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &test_case : kCases) {
    const uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
    const bool is_upper = (props & LIBC_NAMESPACE::PropertyFlag::UPPER) != 0;
    EXPECT_EQ(is_upper, test_case.expected) << test_case.name << "\n";
  }
}
// Checks the ALPHA property bit: letters of any script, letter-like numbers
// (e.g. Roman numerals) and non-ASCII decimal digits are alphabetic; ASCII
// digits, punctuation, symbols, combining marks and controls are not.
TEST(LlvmLibcWctypeClassificationUtilsTest, Alpha) {
  TestCase cases[] = {
      // ASCII letters
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x005A, "LATIN CAPITAL LETTER Z", true},
      {0x007A, "LATIN SMALL LETTER Z", true},
      // ASCII non-letters (ASCII digits are never alphabetic)
      {0x0030, "DIGIT ZERO", false},
      {0x0031, "DIGIT ONE", false},
      {0x0039, "DIGIT NINE", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x007E, "TILDE", false},
      // Modifier letters
      {0x02B0, "MODIFIER LETTER SMALL H", true},
      // Latin Extended
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
      // Greek
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x03C9, "GREEK SMALL LETTER OMEGA", true},
      // Cyrillic
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x044F, "CYRILLIC SMALL LETTER YA", true},
      // Arabic
      {0x0627, "ARABIC LETTER ALEF", true},
      {0x0628, "ARABIC LETTER BEH", true},
      // CJK
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00 (first)", true},
      {0x4E01, "CJK UNIFIED IDEOGRAPH-4E01", true},
      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF (last in BMP)", true},
      // Emoji and symbols
      {0x2764, "HEAVY BLACK HEART", false},
      // Special cases
      {0x0000, "NULL", false},
      {0xFFFD, "REPLACEMENT CHARACTER", false},
      // Roman numerals
      {0x2160, "ROMAN NUMERAL ONE", true},
      {0x2161, "ROMAN NUMERAL TWO", true},
      {0x2162, "ROMAN NUMERAL THREE", true},
      {0x2169, "ROMAN NUMERAL TEN", true},
      {0x216C, "ROMAN NUMERAL FIFTY", true},
      {0x216D, "ROMAN NUMERAL ONE HUNDRED", true},
      {0x216E, "ROMAN NUMERAL FIVE HUNDRED", true},
      {0x216F, "ROMAN NUMERAL ONE THOUSAND", true},
      // Non ASCII digits
      {0x0660, "ARABIC-INDIC DIGIT ZERO", true},
      {0x09e6, "BENGALI DIGIT ZERO", true},
      // Combining marks
      {0x0300, "COMBINING GRAVE ACCENT", false},
#if WCHAR_MAX > 0xFFFF
      // Code points that only fit in a wchar_t wider than 16 bits
      {0x1F600, "GRINNING FACE", false},
      {0x20000, "CJK UNIFIED IDEOGRAPH-20000", true},
#endif
  };
  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::ALPHA;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
// Checks the PUNCT property bit. Symbols (including math and currency symbols)
// are expected to classify as punctuation, matching C.UTF-8.
TEST(LlvmLibcWctypeClassificationUtilsTest, Punct) {
  const TestCase kCases[] = {
      // ASCII punctuation
      {0x0021, "EXCLAMATION MARK", true},
      {0x0022, "QUOTATION MARK", true},
      {0x0023, "NUMBER SIGN", true},
      {0x002C, "COMMA", true},
      {0x002E, "FULL STOP", true},
      {0x002F, "SOLIDUS", true},
      {0x003A, "COLON", true},
      {0x003B, "SEMICOLON", true},
      {0x003F, "QUESTION MARK", true},
      {0x0040, "COMMERCIAL AT", true},
      {0x005B, "LEFT SQUARE BRACKET", true},
      {0x005D, "RIGHT SQUARE BRACKET", true},
      {0x007B, "LEFT CURLY BRACKET", true},
      {0x007D, "RIGHT CURLY BRACKET", true},
      // ASCII non-punctuation
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},
      // Unicode punctuation
      {0x00A1, "INVERTED EXCLAMATION MARK", true},
      {0x00BF, "INVERTED QUESTION MARK", true},
      {0x2013, "EN DASH", true},
      {0x2014, "EM DASH", true},
      {0x2018, "LEFT SINGLE QUOTATION MARK", true},
      {0x2019, "RIGHT SINGLE QUOTATION MARK", true},
      {0x201C, "LEFT DOUBLE QUOTATION MARK", true},
      {0x201D, "RIGHT DOUBLE QUOTATION MARK", true},
      {0x2026, "HORIZONTAL ELLIPSIS", true},
      {0x2030, "PER MILLE SIGN", true},
      {0x3001, "IDEOGRAPHIC COMMA", true},
      {0x3002, "IDEOGRAPHIC FULL STOP", true},
      {0xFF01, "FULLWIDTH EXCLAMATION MARK", true},
      {0xFF1F, "FULLWIDTH QUESTION MARK", true},
      // Symbols (treated as punct in C.UTF-8)
      {0x00A9, "COPYRIGHT SIGN", true},
      {0x20AC, "EURO SIGN", true},
      {0x2764, "HEAVY BLACK HEART", true},
      {0x002B, "PLUS SIGN", true},
      {0x00B6, "PILCROW SIGN", true},
      {0x00A7, "SECTION SIGN", true},
      {0x2022, "BULLET", true},
      {0x2023, "TRIANGULAR BULLET", true},
      {0x2020, "DAGGER", true},
      {0x2021, "DOUBLE DAGGER", true},
      // Math symbols (treated as punct in C.UTF-8)
      {0x00D7, "MULTIPLICATION SIGN", true},
      {0x00F7, "DIVISION SIGN", true},
      {0x2212, "MINUS SIGN", true},
      {0x221E, "INFINITY", true},
  };

  for (const TestCase &test_case : kCases) {
    const uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
    const bool is_punct = (props & LIBC_NAMESPACE::PropertyFlag::PUNCT) != 0;
    EXPECT_EQ(is_punct, test_case.expected) << test_case.name << "\n";
  }
}
// Checks the PRINT property bit: every assigned character is printable except
// control and format characters (surrogates and unassigned code points are
// covered by separate tests).
TEST(LlvmLibcWctypeClassificationUtilsTest, Print) {
  TestCase cases[] = {
      // ASCII printable characters
      {0x0020, "SPACE", true},
      {0x0021, "EXCLAMATION MARK", true},
      {0x0030, "DIGIT ZERO", true},
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x007E, "TILDE", true},
      // ASCII control characters
      {0x0000, "NULL", false},
      {0x0009, "TAB", false},
      {0x000A, "LINE FEED", false},
      {0x000D, "CARRIAGE RETURN", false},
      {0x001F, "UNIT SEPARATOR", false},
      {0x007F, "DELETE", false},
      // Non ASCII printable
      {0x00A0, "NO-BREAK SPACE", true},
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x0627, "ARABIC LETTER ALEF", true},
      {0x05D0, "HEBREW LETTER ALEF", true},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", true},
      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF", true},
      {0x3042, "HIRAGANA LETTER A", true},
      {0x30A2, "KATAKANA LETTER A", true},
      {0xAC00, "HANGUL SYLLABLE GA", true},
      // Emoji and symbols
      {0x2764, "HEAVY BLACK HEART", true},
      // Punctuation
      {0x002E, "FULL STOP", true},
      {0x002C, "COMMA", true},
      {0x003A, "COLON", true},
      // C1 control characters
      {0x0080, "PADDING CHARACTER", false},
      {0x009F, "APPLICATION PROGRAM COMMAND", false},
      // Specials
      {0xFFFD, "REPLACEMENT CHARACTER", true},
      // Format characters
      {0x00AD, "SOFT HYPHEN", false},
      {0x200C, "ZERO WIDTH NON-JOINER", false},
      // Combining marks
      {0x0300, "COMBINING GRAVE ACCENT", true},
      // Private use area (U+E000..U+F8FF). The range is only described by its
      // first and last code points in UnicodeData.txt, so the interior and
      // both ends are checked.
      {0xE000, "PRIVATE USE AREA (first)", true},
      {0xF000, "PRIVATE USE AREA (middle)", true},
      {0xF8FF, "PRIVATE USE AREA (last)", true},
#if WCHAR_MAX > 0xFFFF
      // Code points that only fit in a wchar_t wider than 16 bits
      {0x10FFFD, "SUPPLEMENTARY PRIVATE USE AREA B", true},
      {0x1F600, "GRINNING FACE", true},
#endif
  };
  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::PRINT;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
// Checks the CNTRL property bit: C0 controls, DELETE and C1 controls are
// control characters; everything printable is not.
TEST(LlvmLibcWctypeClassificationUtilsTest, Control) {
  const TestCase kCases[] = {
      // ASCII control characters
      {0x0000, "NULL", true},
      {0x0001, "START OF HEADING", true},
      {0x0009, "TAB", true},
      {0x000A, "LINE FEED", true},
      {0x000D, "CARRIAGE RETURN", true},
      {0x001B, "ESCAPE", true},
      {0x001F, "UNIT SEPARATOR", true},
      // ASCII printable characters
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x007E, "TILDE", false},
      // DELETE character
      {0x007F, "DELETE", true},
      // C1 control characters
      {0x0080, "PADDING CHARACTER", true},
      {0x0081, "HIGH OCTET PRESET", true},
      {0x0090, "DEVICE CONTROL STRING", true},
      {0x009F, "APPLICATION PROGRAM COMMAND", true},
      // Non-control characters after C1 range
      {0x00A0, "NO-BREAK SPACE", false},
      {0x00A1, "INVERTED EXCLAMATION MARK", false},
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
      // Letters
      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &test_case : kCases) {
    const uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
    const bool is_control = (props & LIBC_NAMESPACE::PropertyFlag::CNTRL) != 0;
    EXPECT_EQ(is_control, test_case.expected) << test_case.name << "\n";
  }
}
// Checks the SPACE property bit: ASCII whitespace and Unicode separators are
// spaces, while the no-break/figure spaces and NEXT LINE are not.
TEST(LlvmLibcWctypeClassificationUtilsTest, Space) {
  const TestCase kCases[] = {
      // ASCII whitespace
      {0x0020, "SPACE", true},
      {0x0009, "CHARACTER TABULATION (TAB)", true},
      {0x000A, "LINE FEED", true},
      {0x000B, "LINE TABULATION", true},
      {0x000C, "FORM FEED", true},
      {0x000D, "CARRIAGE RETURN", true},
      // ASCII non-whitespace
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0021, "EXCLAMATION MARK", false},
      // Unicode whitespace
      {0x1680, "OGHAM SPACE MARK", true},
      {0x2000, "EN QUAD", true},
      {0x2001, "EM QUAD", true},
      {0x2002, "EN SPACE", true},
      {0x2003, "EM SPACE", true},
      {0x2004, "THREE-PER-EM SPACE", true},
      {0x2005, "FOUR-PER-EM SPACE", true},
      {0x2006, "SIX-PER-EM SPACE", true},
      {0x2008, "PUNCTUATION SPACE", true},
      {0x2009, "THIN SPACE", true},
      {0x200A, "HAIR SPACE", true},
      {0x2028, "LINE SEPARATOR", true},
      {0x2029, "PARAGRAPH SEPARATOR", true},
      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
      {0x3000, "IDEOGRAPHIC SPACE", true},
      // Unicode non-whitespace
      {0x202F, "NARROW NO-BREAK SPACE", false},
      {0x0085, "NEXT LINE", false},
      {0x00A0, "NO-BREAK SPACE", false},
      {0x2007, "FIGURE SPACE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
      {0x2764, "HEAVY BLACK HEART", false},
  };

  for (const TestCase &test_case : kCases) {
    const uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
    const bool is_space = (props & LIBC_NAMESPACE::PropertyFlag::SPACE) != 0;
    EXPECT_EQ(is_space, test_case.expected) << test_case.name << "\n";
  }
}
// Checks the BLANK property bit: SPACE, TAB and the Unicode space separators
// are blank, while line-breaking whitespace and the no-break/figure spaces are
// not.
TEST(LlvmLibcWctypeClassificationUtilsTest, Blank) {
  TestCase cases[] = {
      // Blank characters
      {0x0020, "SPACE", true},
      {0x0009, "CHARACTER TABULATION (TAB)", true},
      // Non-blank whitespace
      {0x000A, "LINE FEED", false},
      {0x000D, "CARRIAGE RETURN", false},
      {0x000B, "LINE TABULATION", false},
      {0x000C, "FORM FEED", false},
      // Unicode blank characters
      {0x1680, "OGHAM SPACE MARK", true},
      {0x2000, "EN QUAD", true},
      {0x2001, "EM QUAD", true},
      {0x2002, "EN SPACE", true},
      {0x2003, "EM SPACE", true},
      {0x2004, "THREE-PER-EM SPACE", true},
      {0x2005, "FOUR-PER-EM SPACE", true},
      {0x2006, "SIX-PER-EM SPACE", true},
      {0x2008, "PUNCTUATION SPACE", true},
      {0x2009, "THIN SPACE", true},
      {0x200A, "HAIR SPACE", true},
      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
      {0x3000, "IDEOGRAPHIC SPACE", true},
      // Non-blank characters
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x00A0, "NO-BREAK SPACE", false},
      {0x2007, "FIGURE SPACE", false},
      {0x202F, "NARROW NO-BREAK SPACE", false},
      {0x2028, "LINE SEPARATOR", false},
  };
  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::BLANK;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
// Checks that surrogates and values past U+10FFFF carry no property bits.
TEST(LlvmLibcWctypeClassificationUtilsTest, InvalidCodepoints) {
  // Only the code point and a label are needed: the expected property mask is
  // always zero.
  struct InvalidTestCase {
    uint32_t wc;
    const char *name;
  };

  const InvalidTestCase kCases[] = {
      // Surrogate pair range
      {0xD800, "HIGH SURROGATE START"},
      {0xD900, "HIGH SURROGATE MIDDLE"},
      {0xDBFF, "HIGH SURROGATE END"},
      {0xDC00, "LOW SURROGATE START"},
      {0xDD00, "LOW SURROGATE MIDDLE"},
      {0xDFFF, "LOW SURROGATE END"},
#if WCHAR_MAX > 0xFFFF
      {0x110000, "Beyond max Unicode"},
#endif
  };

  for (const InvalidTestCase &test_case : kCases) {
    const wchar_t wide_char = static_cast<wchar_t>(test_case.wc);
    uint8_t props = LIBC_NAMESPACE::lookup_properties(wide_char);
    EXPECT_EQ(props, uint8_t{0}) << test_case.name << "\n";
  }
}
// Checks that Unicode noncharacters carry no property bits.
TEST(LlvmLibcWctypeClassificationUtilsTest, Noncharacters) {
  // Only the code point and a label are needed: the expected property mask is
  // always zero.
  struct NoncharacterTestCase {
    uint32_t wc;
    const char *name;
  };

  const NoncharacterTestCase kCases[] = {
      // BMP noncharacters
      {0xFFFE, "BMP NONCHARACTER U+FFFE"},
      {0xFFFF, "BMP NONCHARACTER U+FFFF"},
      // Arabic Presentation Forms noncharacters
      {0xFDD0, "NONCHARACTER U+FDD0"},
      {0xFDD5, "NONCHARACTER U+FDD5"},
#if WCHAR_MAX > 0xFFFF
      // Supplementary plane noncharacters
      {0x1FFFE, "PLANE 1 NONCHARACTER"},
      {0x2FFFE, "PLANE 2 NONCHARACTER"},
      {0x3FFFE, "PLANE 3 NONCHARACTER"},
      {0x10FFFE, "PLANE 16 NONCHARACTER"},
      {0x10FFFF, "PLANE 16 NONCHARACTER"},
#endif
  };

  for (const NoncharacterTestCase &test_case : kCases) {
    const wchar_t wide_char = static_cast<wchar_t>(test_case.wc);
    uint8_t props = LIBC_NAMESPACE::lookup_properties(wide_char);
    EXPECT_EQ(props, uint8_t{0}) << test_case.name << "\n";
  }
}
} // namespace

View File

@@ -0,0 +1,3 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

View File

@@ -0,0 +1,308 @@
# ===- Generate classification tables for wctype utils -----*- python -*----==#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ==------------------------------------------------------------------------==#
from enum import IntFlag
from dataclasses import dataclass
from collections import defaultdict
from sys import argv
# WARNING: If you modify this enum, you must update the generated C++ enum
# in generate_code as well
class PropertyFlag(IntFlag):
    """Bit mask of the character classes stored for each code point."""

    UPPER = 0x01  # bit 0
    LOWER = 0x02  # bit 1
    ALPHA = 0x04  # bit 2
    SPACE = 0x08  # bit 3
    PRINT = 0x10  # bit 4
    BLANK = 0x20  # bit 5
    CNTRL = 0x40  # bit 6
    PUNCT = 0x80  # bit 7
@dataclass
class UnicodeEntry:
    """One record read from UnicodeData.txt (first three fields only)."""

    # Code point value, parsed from the hexadecimal first field.
    codepoint: int
    # Second field; range markers contain ", First>" or ", Last>".
    name: str
    # Third field: the general category abbreviation, e.g. "Lu" or "Zs".
    category: str
def read_unicode_data(filename: str) -> list[UnicodeEntry]:
    """Parses a UnicodeData.txt-style file into a list of UnicodeEntry.

    Empty lines, lines starting with "#", and lines with fewer than three
    ";"-separated fields are ignored. The first three fields are taken as the
    hexadecimal code point, the name and the category.

    Raises:
        RuntimeError: if the file does not exist.
    """
    parsed: list[UnicodeEntry] = []
    try:
        with open(filename, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                record = raw_line.strip()
                if record == "" or record.startswith("#"):
                    continue

                columns = record.split(";")
                if len(columns) < 3:
                    continue

                codepoint_hex, name, category = (
                    column.strip() for column in columns[:3]
                )
                parsed.append(UnicodeEntry(int(codepoint_hex, 16), name, category))
    except FileNotFoundError:
        raise RuntimeError(f"Cannot open file: {filename}")
    return parsed
from dataclasses import dataclass
# Space separators that C.UTF-8 does not treat as whitespace:
# NO-BREAK SPACE, FIGURE SPACE and NARROW NO-BREAK SPACE.
NON_WHITESPACE_SPACES = {0x00A0, 0x2007, 0x202F}
# The ASCII digits "0" through "9".
ASCII_DIGITS = set(range(0x30, 0x3A))
def handle_ranges(
    properties: defaultdict[int, int], entries: list[UnicodeEntry]
) -> None:
    """Handles Unicode ranges defined by <First> and <Last>.

    UnicodeData.txt lists large ranges only by their two end points, whose
    names contain ", First>" and ", Last>". Every code point from the First
    entry through the Last entry (inclusive) is given the property mask that
    `properties` holds for the First code point.

    Args:
        properties: codepoint -> property mask mapping, updated in place.
        entries: entries in file order, so each Last follows its First.
    """
    range_start: int | None = None
    range_props: int | None = None
    for entry in entries:
        if ", First>" in entry.name:
            range_start = entry.codepoint
            # Indexing the defaultdict yields (and stores) 0 when the First
            # code point was never assigned any properties.
            range_props = properties[entry.codepoint]
        elif (
            ", Last>" in entry.name
            and range_start is not None
            and range_props is not None
        ):
            # Compare against None rather than relying on truthiness: a start
            # of 0 or an empty property mask is still a valid open range, and
            # the range state must be cleared so that a later Last marker
            # cannot pair with a stale First.
            for cp in range(range_start, entry.codepoint + 1):
                properties[cp] = range_props
            range_start = None
            range_props = None
def get_props(entry: UnicodeEntry) -> int:
    """Computes the PropertyFlag mask for one UnicodeEntry.

    The mask is derived from the entry's general category, with code-point
    exceptions for ASCII digits and the non-whitespace spaces.
    """
    cp = entry.codepoint
    general_category = entry.category
    major_class = general_category[0]

    flags = 0
    if major_class == "L":
        flags |= PropertyFlag.ALPHA
        if general_category == "Ll":
            flags |= PropertyFlag.LOWER
        elif general_category in ("Lu", "Lt"):
            flags |= PropertyFlag.UPPER
    elif major_class == "N":
        # In C.UTF8, non-ASCII digits/letter-numbers are alpha
        if cp not in ASCII_DIGITS and general_category in ("Nd", "Nl"):
            flags |= PropertyFlag.ALPHA
    elif major_class in ("P", "S"):
        # Symbols are considered punctuation in C.UTF8
        flags |= PropertyFlag.PUNCT
    elif major_class == "Z":
        if cp not in NON_WHITESPACE_SPACES:
            flags |= PropertyFlag.SPACE
            # Only space separators are blank; line and paragraph separators
            # are whitespace but not blank.
            if general_category == "Zs":
                flags |= PropertyFlag.BLANK
    elif major_class == "C" and general_category == "Cc":
        flags |= PropertyFlag.CNTRL

    # Print = all except control, unassigned, surrogate, format
    if general_category not in ("Cc", "Cs", "Cn", "Cf"):
        flags |= PropertyFlag.PRINT

    return flags
def handle_special_cases(properties: defaultdict[int, int]) -> None:
    """Adds the flags that cannot be derived from UnicodeData.txt categories.

    The ASCII whitespace characters are marked as SPACE, and SPACE and TAB are
    additionally marked as BLANK. `properties` is updated in place.
    """
    ascii_whitespace = (
        0x0020,  # SPACE
        0x0009,  # TAB
        0x000A,  # LINE FEED
        0x000D,  # CARRIAGE RETURN
        0x000B,  # VERTICAL TAB
        0x000C,  # FORM FEED
    )
    for cp in ascii_whitespace:
        properties[cp] |= PropertyFlag.SPACE

    ascii_blank = (
        0x0020,  # SPACE
        0x0009,  # TAB
    )
    for cp in ascii_blank:
        properties[cp] |= PropertyFlag.BLANK
def parse_unicode_data(entries: list[UnicodeEntry]) -> defaultdict[int, int]:
    """Builds the codepoint -> property mask mapping for all entries.

    Surrogate code points (U+D800..U+DFFF) are left out. First/Last ranges are
    expanded and the hard-coded special cases are applied afterwards.
    """
    flags_by_codepoint: defaultdict[int, int] = defaultdict(int)

    for record in entries:
        is_surrogate = 0xD800 <= record.codepoint <= 0xDFFF
        if not is_surrogate:
            flags_by_codepoint[record.codepoint] = get_props(record)

    handle_ranges(flags_by_codepoint, entries)
    handle_special_cases(flags_by_codepoint)
    return flags_by_codepoint
@dataclass
class StagedLookupTable:
    """Two-level lookup table returned by build_lookup_tables.

    A code point is resolved as level2[level1[codepoint >> 8] + (codepoint & 0xFF)].
    """

    level1: list[int]  # Maps codepoint >> 8 to level2 offset
    level2: list[int]  # Actual properties
def build_lookup_tables(properties: defaultdict[int, int]) -> StagedLookupTable:
    """Builds the two-level lookup table from per-codepoint properties.

    The code space is cut into 256-entry blocks. Each distinct block is stored
    once in level2, and level1 records, for every block number, the offset of
    that block's content in level2. Code points missing from `properties`
    count as 0. Table statistics are printed to stdout.
    """
    UNICODE_MAX = 0x110000
    BLOCK_SIZE = 256
    NUM_BLOCKS = UNICODE_MAX // BLOCK_SIZE

    # Block content -> offset of that content in level2
    offset_by_content: dict[tuple[int, ...], int] = {}
    level1: list[int] = []
    level2: list[int] = []

    for block_num in range(NUM_BLOCKS):
        base = block_num * BLOCK_SIZE
        content = tuple(
            properties.get(base + offset, 0) for offset in range(BLOCK_SIZE)
        )
        level2_offset = offset_by_content.get(content)
        if level2_offset is None:
            # First time this content is seen: append it to level2.
            level2_offset = len(level2)
            offset_by_content[content] = level2_offset
            level2.extend(content)
        level1.append(level2_offset)

    level1_count = len(level1)
    level2_count = len(level2)
    print("Table statistics:")
    print(f" Level 1 entries: {level1_count}")
    print(f" Level 2 entries: {level2_count}")
    print(f" Size: {level1_count * 2 + level2_count} bytes")

    return StagedLookupTable(level1, level2)
def generate_code(lookup_table: StagedLookupTable, llvm_project_root_path: str) -> None:
    """Generates C++ header with lookup tables.

    Writes (overwriting any existing file)
    <llvm_project_root_path>/libc/src/__support/wctype/wctype_classification_utils.h.
    The header contains the PropertyFlag enum, the level1 and level2 arrays
    and the lookup_properties() function that indexes them.

    Args:
        lookup_table: tables whose level1/level2 lists are emitted as the C++
            arrays of the same names.
        llvm_project_root_path: path prefix under which the header is written.
    """
    level1 = lookup_table.level1
    level2 = lookup_table.level2
    with open(
        f"{llvm_project_root_path}/libc/src/__support/wctype/wctype_classification_utils.h",
        "w",
    ) as f:
        # Prologue: license banner, include guard, includes, the PropertyFlag
        # enum, size checks and the opening of the level1 array. Doubled braces
        # ({{ and }}) are f-string escapes for literal C++ braces.
        f.write(
            f"""//===-- Utils for wctype classification functions ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// DO NOT EDIT MANUALLY.
// This file is generated by libc/utils/wctype_utils scripts.
#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
#include "hdr/stdint_proxy.h"
#include "hdr/types/wchar_t.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/libc_assert.h"
#include "src/__support/CPP/limits.h"
namespace LIBC_NAMESPACE_DECL {{
// Property flags for Unicode categories
enum PropertyFlag : uint8_t {{
UPPER = 1 << 0,
LOWER = 1 << 1,
ALPHA = 1 << 2,
SPACE = 1 << 3,
PRINT = 1 << 4,
BLANK = 1 << 5,
CNTRL = 1 << 6,
PUNCT = 1 << 7,
}};
static_assert({len(level1)} <= cpp::numeric_limits<unsigned short>::max());
static_assert({len(level2)} <= cpp::numeric_limits<unsigned short>::max());
LIBC_INLINE_VAR constexpr uint16_t LEVEL1_SIZE = {len(level1)};
LIBC_INLINE_VAR constexpr uint16_t LEVEL2_SIZE = {len(level2)};
// Level 1 table: indexed by (codepoint >> 8), stores level2 block offsets
LIBC_INLINE_VAR constexpr uint16_t level1[LEVEL1_SIZE] = {{
"""
        )
        # Level 1 values: 11 per row, each right-aligned to width 7, with a
        # comma after every value except the very last one.
        for i in range(0, len(level1), 11):
            f.write(" ")
            for j in range(i, min(i + 11, len(level1))):
                f.write(f"{level1[j]:7d}")
                if j + 1 < len(level1):
                    f.write(",")
            f.write("\n")
        # Close the level1 array and open the level2 array.
        f.write(
            f"""}};
// Level 2 table: blocks of 256 property flags
LIBC_INLINE_VAR constexpr uint8_t level2[LEVEL2_SIZE] = {{
"""
        )
        # Level 2 values: 11 per row as two-digit hex literals, with ", "
        # after every value except the very last one.
        for i in range(0, len(level2), 11):
            f.write(" ")
            for j in range(i, min(i + 11, len(level2))):
                f.write(f"0x{level2[j]:02x}")
                if j + 1 < len(level2):
                    f.write(", ")
            f.write("\n")
        # Epilogue: close the level2 array, emit lookup_properties() and close
        # the namespace and the include guard.
        f.write(
            f"""}};
// Returns the Unicode property flag for a given wide character.
LIBC_INLINE constexpr uint8_t lookup_properties(const wchar_t wc) {{
// Out of Unicode range
if (static_cast<uint32_t>(wc) > 0x10FFFF) {{
return 0;
}}
uint16_t l1_idx = static_cast<uint16_t>(wc >> 8);
LIBC_ASSERT(l1_idx < LEVEL1_SIZE);
uint16_t l2_offset = level1[l1_idx];
uint16_t l2_idx = l2_offset + (wc & 0xFF);
LIBC_ASSERT(l2_idx < LEVEL2_SIZE);
return level2[l2_idx];
}}
}} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
"""
        )

View File

@@ -10,6 +10,12 @@
from conversion.gen_conversion_data import extract_maps_from_unicode_file
from conversion.hex_writer import write_hex_conversions
from classification.gen_classification_data import (
read_unicode_data,
parse_unicode_data,
build_lookup_tables,
generate_code,
)
from sys import argv
from sys import exit
@@ -31,6 +37,16 @@ def write_wctype_conversion_data(
)
def write_wctype_classification_data(
    llvm_project_root_path: str, unicode_data_folder_path: str
) -> None:
    """Generates wctype classification utils

    Reads UnicodeData.txt from `unicode_data_folder_path`, derives the
    per-codepoint properties, builds the lookup tables and writes the
    generated header under `llvm_project_root_path`.
    """
    unicode_data_file = f"{unicode_data_folder_path}/UnicodeData.txt"
    lookup_tables = build_lookup_tables(
        parse_unicode_data(read_unicode_data(unicode_data_file))
    )
    generate_code(lookup_tables, llvm_project_root_path)
def main() -> None:
if len(argv) != 3:
print("Codegen: wctype data generator script")
@@ -45,6 +61,9 @@ def main() -> None:
write_wctype_conversion_data(
llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
)
write_wctype_classification_data(
llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
)
print(f"wctype conversion data is written to {argv[1]}/libc/src/__support/wctype/")