[libc][wctype] Create generation script for classification lookup tables (#172042)
[#172040](https://github.com/llvm/llvm-project/issues/172040) This patch implements the scripts that generate the lookup tables and associated utilities for the wctype classification functions. Not all Unicode properties are covered, because not all of them need a lookup table; the rest will be hardcoded. The generated tables are 47.8 KB in size.
This commit is contained in:
3
libc/.gitignore
vendored
3
libc/.gitignore
vendored
@@ -1,3 +1,6 @@
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
build/
|
||||
|
||||
# Unicode data used for wctype functions
|
||||
UnicodeData.txt
|
||||
|
||||
@@ -119,6 +119,10 @@ function(_get_compile_options_from_config output_var)
|
||||
list(APPEND config_options "-DLIBC_TRAP_ON_RAISE_FP_EXCEPT")
|
||||
endif()
|
||||
|
||||
if(LIBC_CONF_WCTYPE_MODE)
|
||||
list(APPEND config_options "-DLIBC_CONF_WCTYPE_MODE=${LIBC_CONF_WCTYPE_MODE}")
|
||||
endif()
|
||||
|
||||
if(LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT)
|
||||
list(APPEND config_options "-DLIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT=${LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT}")
|
||||
endif()
|
||||
|
||||
@@ -413,6 +413,7 @@ add_subdirectory(time)
|
||||
# Therefore, cannot currently build this on macos in overlay mode
|
||||
if(NOT (LIBC_TARGET_OS_IS_DARWIN))
|
||||
add_subdirectory(wchar)
|
||||
add_subdirectory(wctype)
|
||||
endif()
|
||||
|
||||
add_subdirectory(math)
|
||||
|
||||
12
libc/src/__support/wctype/CMakeLists.txt
Normal file
12
libc/src/__support/wctype/CMakeLists.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# Header-only library exposing the generated wctype classification lookup
# tables and the lookup helper built on top of them.
add_header_library(
  wctype_classification_utils
  HDRS
    wctype_classification_utils.h
  DEPENDS
    libc.hdr.stdint_proxy
    libc.hdr.types.wchar_t
    libc.src.__support.CPP.limits
    libc.src.__support.libc_assert
    libc.src.__support.macros.attributes
    libc.src.__support.macros.config
)
|
||||
3726
libc/src/__support/wctype/wctype_classification_utils.h
Normal file
3726
libc/src/__support/wctype/wctype_classification_utils.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -293,4 +293,5 @@ add_subdirectory(threads)
|
||||
# Cannot currently build this on MacOS in overlay mode
|
||||
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
|
||||
add_subdirectory(wchar)
|
||||
add_subdirectory(wctype)
|
||||
endif()
|
||||
|
||||
13
libc/test/src/__support/wctype/CMakeLists.txt
Normal file
13
libc/test/src/__support/wctype/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# Dedicated suite for the wctype support tests. It is chained to
# libc-support-tests so that building the umbrella suite still builds and runs
# everything registered here.
add_custom_target(libc-support-wctype-tests)
add_dependencies(libc-support-tests libc-support-wctype-tests)

add_libc_test(
  wctype_classification_utils_test
  SUITE
    libc-support-wctype-tests
  SRCS
    wctype_classification_utils_test.cpp
  DEPENDS
    libc.hdr.stdint_proxy
    libc.src.__support.wctype.wctype_classification_utils
)
|
||||
|
||||
@@ -0,0 +1,540 @@
|
||||
//===-- Unittests for wctype classification utils -------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "hdr/stdint_proxy.h"
|
||||
#include "src/__support/wctype/wctype_classification_utils.h"
|
||||
#include "test/UnitTest/Test.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// Some platforms (like Windows) have a 16-bit wchar_t. We guard the cases that
|
||||
// do not fit within 16 bits to prevent narrowing conversion and incorrect test
|
||||
// results.
|
||||
// One classification expectation: a code point, a human-readable label that is
// printed when the check fails, and whether the property under test is
// expected to be set for that code point.
struct TestCase {
  uint32_t wc;      // Code point, kept as 32 bits regardless of wchar_t width.
  const char *name; // Label streamed into the failure message.
  bool expected;    // Expected state of the property flag under test.
};
|
||||
|
||||
// Checks the LOWER flag: set for lowercase letters only, clear for uppercase
// letters, non-letters and letters of caseless scripts.
TEST(LlvmLibcWctypeClassificationUtilsTest, Lower) {
  const TestCase expectations[] = {
      // ASCII lowercase
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x007A, "LATIN SMALL LETTER Z", true},

      // ASCII uppercase
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x005A, "LATIN CAPITAL LETTER Z", false},

      // ASCII non-letters
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},

      // Latin Extended lowercase
      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},

      // Latin Extended uppercase
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", false},

      // Greek lowercase
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x03C9, "GREEK SMALL LETTER OMEGA", true},

      // Greek uppercase
      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
      {0x03A9, "GREEK CAPITAL LETTER OMEGA", false},

      // Cyrillic lowercase
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x044F, "CYRILLIC SMALL LETTER YA", true},

      // Cyrillic uppercase
      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
      {0x042F, "CYRILLIC CAPITAL LETTER YA", false},

      // Caseless scripts
      {0x05D0, "HEBREW LETTER ALEF", false},
      {0x0627, "ARABIC LETTER ALEF", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &entry : expectations) {
    const uint8_t flags =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
    const bool is_lower = (flags & LIBC_NAMESPACE::PropertyFlag::LOWER) != 0;
    EXPECT_EQ(is_lower, entry.expected) << entry.name << "\n";
  }
}
|
||||
|
||||
// Checks the UPPER flag: set for uppercase and titlecase letters, clear for
// lowercase letters, non-letters and letters of caseless scripts.
TEST(LlvmLibcWctypeClassificationUtilsTest, Upper) {
  const TestCase expectations[] = {
      // ASCII lowercase
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x007A, "LATIN SMALL LETTER Z", false},

      // ASCII uppercase
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x005A, "LATIN CAPITAL LETTER Z", true},

      // ASCII non-letters
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},

      // Titlecase
      {0x01C5, "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON", true},

      // Latin Extended lowercase
      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", false},

      // Latin Extended uppercase
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", true},

      // Greek lowercase
      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
      {0x03C9, "GREEK SMALL LETTER OMEGA", false},

      // Greek uppercase
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03A9, "GREEK CAPITAL LETTER OMEGA", true},

      // Cyrillic lowercase
      {0x0430, "CYRILLIC SMALL LETTER A", false},
      {0x044F, "CYRILLIC SMALL LETTER YA", false},

      // Cyrillic uppercase
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x042F, "CYRILLIC CAPITAL LETTER YA", true},

      // Caseless scripts
      {0x05D0, "HEBREW LETTER ALEF", false},
      {0x0627, "ARABIC LETTER ALEF", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &entry : expectations) {
    const uint8_t flags =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
    const bool is_upper = (flags & LIBC_NAMESPACE::PropertyFlag::UPPER) != 0;
    EXPECT_EQ(is_upper, entry.expected) << entry.name << "\n";
  }
}
|
||||
|
||||
// Checks the ALPHA flag. Besides letters, the expectations below treat
// non-ASCII decimal digits and letter-numbers (e.g. Roman numerals) as
// alphabetic, while ASCII digits, symbols and combining marks are not.
TEST(LlvmLibcWctypeClassificationUtilsTest, Alpha) {
  TestCase cases[] = {
      // ASCII letters
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x005A, "LATIN CAPITAL LETTER Z", true},
      {0x007A, "LATIN SMALL LETTER Z", true},

      // ASCII non-letters (including ASCII digits)
      {0x0030, "DIGIT ZERO", false},
      {0x0031, "DIGIT ONE", false},
      {0x0039, "DIGIT NINE", false},
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x007E, "TILDE", false},

      // Modifier letters
      {0x02B0, "MODIFIER LETTER SMALL H", true},

      // Latin Extended
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},

      // Greek
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x03C9, "GREEK SMALL LETTER OMEGA", true},

      // Cyrillic
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x044F, "CYRILLIC SMALL LETTER YA", true},

      // Arabic
      {0x0627, "ARABIC LETTER ALEF", true},
      {0x0628, "ARABIC LETTER BEH", true},

      // CJK
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00 (first)", true},
      {0x4E01, "CJK UNIFIED IDEOGRAPH-4E01", true},
      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF (last in BMP)", true},

      // Emoji and symbols
      {0x2764, "HEAVY BLACK HEART", false},

      // Special cases
      {0x0000, "NULL", false},
      {0xFFFD, "REPLACEMENT CHARACTER", false},

      // Roman numerals
      {0x2160, "ROMAN NUMERAL ONE", true},
      {0x2161, "ROMAN NUMERAL TWO", true},
      {0x2162, "ROMAN NUMERAL THREE", true},
      {0x2169, "ROMAN NUMERAL TEN", true},
      {0x216C, "ROMAN NUMERAL FIFTY", true},
      {0x216D, "ROMAN NUMERAL ONE HUNDRED", true},
      {0x216E, "ROMAN NUMERAL FIVE HUNDRED", true},
      {0x216F, "ROMAN NUMERAL ONE THOUSAND", true},

      // Non-ASCII digits
      {0x0660, "ARABIC-INDIC DIGIT ZERO", true},
      {0x09E6, "BENGALI DIGIT ZERO", true},

      // Combining marks
      {0x0300, "COMBINING GRAVE ACCENT", false},

#if WCHAR_MAX > 0xFFFF
      // Code points that only fit in a wchar_t wider than 16 bits
      {0x1F600, "GRINNING FACE", false},
      {0x20000, "CJK UNIFIED IDEOGRAPH-20000", true},
#endif
  };

  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::ALPHA;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
|
||||
|
||||
// Checks the PUNCT flag: set for punctuation and, following the C.UTF-8
// convention noted below, for symbols as well; clear for letters, digits and
// the space character.
TEST(LlvmLibcWctypeClassificationUtilsTest, Punct) {
  const TestCase expectations[] = {
      // ASCII punctuation
      {0x0021, "EXCLAMATION MARK", true},
      {0x0022, "QUOTATION MARK", true},
      {0x0023, "NUMBER SIGN", true},
      {0x002C, "COMMA", true},
      {0x002E, "FULL STOP", true},
      {0x002F, "SOLIDUS", true},
      {0x003A, "COLON", true},
      {0x003B, "SEMICOLON", true},
      {0x003F, "QUESTION MARK", true},
      {0x0040, "COMMERCIAL AT", true},
      {0x005B, "LEFT SQUARE BRACKET", true},
      {0x005D, "RIGHT SQUARE BRACKET", true},
      {0x007B, "LEFT CURLY BRACKET", true},
      {0x007D, "RIGHT CURLY BRACKET", true},

      // ASCII non-punctuation
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0020, "SPACE", false},

      // Unicode punctuation
      {0x00A1, "INVERTED EXCLAMATION MARK", true},
      {0x00BF, "INVERTED QUESTION MARK", true},
      {0x2013, "EN DASH", true},
      {0x2014, "EM DASH", true},
      {0x2018, "LEFT SINGLE QUOTATION MARK", true},
      {0x2019, "RIGHT SINGLE QUOTATION MARK", true},
      {0x201C, "LEFT DOUBLE QUOTATION MARK", true},
      {0x201D, "RIGHT DOUBLE QUOTATION MARK", true},
      {0x2026, "HORIZONTAL ELLIPSIS", true},
      {0x2030, "PER MILLE SIGN", true},
      {0x3001, "IDEOGRAPHIC COMMA", true},
      {0x3002, "IDEOGRAPHIC FULL STOP", true},
      {0xFF01, "FULLWIDTH EXCLAMATION MARK", true},
      {0xFF1F, "FULLWIDTH QUESTION MARK", true},

      // Symbols (treated as punct in C.UTF-8)
      {0x00A9, "COPYRIGHT SIGN", true},
      {0x20AC, "EURO SIGN", true},
      {0x2764, "HEAVY BLACK HEART", true},
      {0x002B, "PLUS SIGN", true},
      {0x00B6, "PILCROW SIGN", true},
      {0x00A7, "SECTION SIGN", true},
      {0x2022, "BULLET", true},
      {0x2023, "TRIANGULAR BULLET", true},
      {0x2020, "DAGGER", true},
      {0x2021, "DOUBLE DAGGER", true},

      // Math symbols (treated as punct in C.UTF-8)
      {0x00D7, "MULTIPLICATION SIGN", true},
      {0x00F7, "DIVISION SIGN", true},
      {0x2212, "MINUS SIGN", true},
      {0x221E, "INFINITY", true},
  };

  for (const TestCase &entry : expectations) {
    const uint8_t flags =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
    const bool is_punct = (flags & LIBC_NAMESPACE::PropertyFlag::PUNCT) != 0;
    EXPECT_EQ(is_punct, entry.expected) << entry.name << "\n";
  }
}
|
||||
|
||||
// Checks the PRINT flag: set for graphic characters, spaces, combining marks
// and private-use code points; clear for control and format characters.
TEST(LlvmLibcWctypeClassificationUtilsTest, Print) {
  TestCase cases[] = {
      // ASCII printable characters
      {0x0020, "SPACE", true},
      {0x0021, "EXCLAMATION MARK", true},
      {0x0030, "DIGIT ZERO", true},
      {0x0041, "LATIN CAPITAL LETTER A", true},
      {0x0061, "LATIN SMALL LETTER A", true},
      {0x007E, "TILDE", true},

      // ASCII control characters
      {0x0000, "NULL", false},
      {0x0009, "TAB", false},
      {0x000A, "LINE FEED", false},
      {0x000D, "CARRIAGE RETURN", false},
      {0x001F, "UNIT SEPARATOR", false},
      {0x007F, "DELETE", false},

      // Non-ASCII printable
      {0x00A0, "NO-BREAK SPACE", true},
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
      {0x0430, "CYRILLIC SMALL LETTER A", true},
      {0x0627, "ARABIC LETTER ALEF", true},
      {0x05D0, "HEBREW LETTER ALEF", true},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", true},
      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF", true},
      {0x3042, "HIRAGANA LETTER A", true},
      {0x30A2, "KATAKANA LETTER A", true},
      {0xAC00, "HANGUL SYLLABLE GA", true},

      // Emoji and symbols
      {0x2764, "HEAVY BLACK HEART", true},

      // Punctuation
      {0x002E, "FULL STOP", true},
      {0x002C, "COMMA", true},
      {0x003A, "COLON", true},

      // C1 control characters
      {0x0080, "PADDING CHARACTER", false},
      {0x009F, "APPLICATION PROGRAM COMMAND", false},

      // Specials
      {0xFFFD, "REPLACEMENT CHARACTER", true},

      // Format characters
      {0x00AD, "SOFT HYPHEN", false},
      {0x200C, "ZERO WIDTH NON-JOINER", false},

      // Combining marks
      {0x0300, "COMBINING GRAVE ACCENT", true},

      // Private use area (U+E000..U+F8FF in the BMP)
      {0xE000, "PRIVATE USE AREA (first)", true},
      {0xF000, "PRIVATE USE AREA (middle)", true},
      {0xF8FF, "PRIVATE USE AREA (last)", true},

#if WCHAR_MAX > 0xFFFF
      // Code points that only fit in a wchar_t wider than 16 bits
      {0x10FFFD, "SUPPLEMENTARY PRIVATE USE AREA B", true},
      {0x1F600, "GRINNING FACE", true},
#endif
  };

  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::PRINT;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
|
||||
|
||||
// Checks the CNTRL flag: set for the C0 range, DELETE and the C1 range; clear
// for everything printable.
TEST(LlvmLibcWctypeClassificationUtilsTest, Control) {
  const TestCase expectations[] = {
      // ASCII control characters
      {0x0000, "NULL", true},
      {0x0001, "START OF HEADING", true},
      {0x0009, "TAB", true},
      {0x000A, "LINE FEED", true},
      {0x000D, "CARRIAGE RETURN", true},
      {0x001B, "ESCAPE", true},
      {0x001F, "UNIT SEPARATOR", true},

      // ASCII printable characters
      {0x0020, "SPACE", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0061, "LATIN SMALL LETTER A", false},
      {0x007E, "TILDE", false},

      // DELETE character
      {0x007F, "DELETE", true},

      // C1 control characters
      {0x0080, "PADDING CHARACTER", true},
      {0x0081, "HIGH OCTET PRESET", true},
      {0x0090, "DEVICE CONTROL STRING", true},
      {0x009F, "APPLICATION PROGRAM COMMAND", true},

      // Non-control characters after C1 range
      {0x00A0, "NO-BREAK SPACE", false},
      {0x00A1, "INVERTED EXCLAMATION MARK", false},
      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},

      // Letters
      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
  };

  for (const TestCase &entry : expectations) {
    const uint8_t flags =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
    const bool is_cntrl = (flags & LIBC_NAMESPACE::PropertyFlag::CNTRL) != 0;
    EXPECT_EQ(is_cntrl, entry.expected) << entry.name << "\n";
  }
}
|
||||
|
||||
// Checks the SPACE flag: set for ASCII whitespace and Unicode separators,
// except the no-break/figure spaces, which are expected to be non-whitespace.
TEST(LlvmLibcWctypeClassificationUtilsTest, Space) {
  const TestCase expectations[] = {
      // ASCII whitespace
      {0x0020, "SPACE", true},
      {0x0009, "CHARACTER TABULATION (TAB)", true},
      {0x000A, "LINE FEED", true},
      {0x000B, "LINE TABULATION", true},
      {0x000C, "FORM FEED", true},
      {0x000D, "CARRIAGE RETURN", true},

      // ASCII non-whitespace
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0021, "EXCLAMATION MARK", false},

      // Unicode whitespace
      {0x1680, "OGHAM SPACE MARK", true},
      {0x2000, "EN QUAD", true},
      {0x2001, "EM QUAD", true},
      {0x2002, "EN SPACE", true},
      {0x2003, "EM SPACE", true},
      {0x2004, "THREE-PER-EM SPACE", true},
      {0x2005, "FOUR-PER-EM SPACE", true},
      {0x2006, "SIX-PER-EM SPACE", true},
      {0x2008, "PUNCTUATION SPACE", true},
      {0x2009, "THIN SPACE", true},
      {0x200A, "HAIR SPACE", true},
      {0x2028, "LINE SEPARATOR", true},
      {0x2029, "PARAGRAPH SEPARATOR", true},
      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
      {0x3000, "IDEOGRAPHIC SPACE", true},

      // Unicode non-whitespace
      {0x202F, "NARROW NO-BREAK SPACE", false},
      {0x0085, "NEXT LINE", false},
      {0x00A0, "NO-BREAK SPACE", false},
      {0x2007, "FIGURE SPACE", false},
      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
      {0x2764, "HEAVY BLACK HEART", false},
  };

  for (const TestCase &entry : expectations) {
    const uint8_t flags =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
    const bool is_space = (flags & LIBC_NAMESPACE::PropertyFlag::SPACE) != 0;
    EXPECT_EQ(is_space, entry.expected) << entry.name << "\n";
  }
}
|
||||
|
||||
// Checks the BLANK flag: set for SPACE, TAB and the Unicode space separators
// that also count as whitespace; clear for line-breaking whitespace, the
// no-break/figure spaces and ordinary characters.
TEST(LlvmLibcWctypeClassificationUtilsTest, Blank) {
  TestCase cases[] = {
      // Blank characters
      {0x0020, "SPACE", true},
      {0x0009, "CHARACTER TABULATION (TAB)", true},

      // Non-blank whitespace
      {0x000A, "LINE FEED", false},
      {0x000D, "CARRIAGE RETURN", false},
      {0x000B, "LINE TABULATION", false},
      {0x000C, "FORM FEED", false},

      // Unicode blank characters
      {0x1680, "OGHAM SPACE MARK", true},
      {0x2000, "EN QUAD", true},
      {0x2001, "EM QUAD", true},
      {0x2002, "EN SPACE", true},
      {0x2003, "EM SPACE", true},
      {0x2004, "THREE-PER-EM SPACE", true},
      {0x2005, "FOUR-PER-EM SPACE", true},
      {0x2006, "SIX-PER-EM SPACE", true},
      {0x2008, "PUNCTUATION SPACE", true},
      {0x2009, "THIN SPACE", true},
      {0x200A, "HAIR SPACE", true},
      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
      {0x3000, "IDEOGRAPHIC SPACE", true},

      // Non-blank characters
      {0x0041, "LATIN CAPITAL LETTER A", false},
      {0x0030, "DIGIT ZERO", false},
      {0x0021, "EXCLAMATION MARK", false},
      {0x00A0, "NO-BREAK SPACE", false},
      {0x2007, "FIGURE SPACE", false},
      {0x202F, "NARROW NO-BREAK SPACE", false},
      {0x2028, "LINE SEPARATOR", false},
  };

  for (const auto &tc : cases) {
    bool res = LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc)) &
               LIBC_NAMESPACE::PropertyFlag::BLANK;
    EXPECT_EQ(res, tc.expected) << tc.name << "\n";
  }
}
|
||||
|
||||
// Surrogates and values beyond U+10FFFF are not valid code points; the lookup
// is expected to report no property at all for them.
TEST(LlvmLibcWctypeClassificationUtilsTest, InvalidCodepoints) {
  struct InvalidTestCase {
    uint32_t wc;
    const char *name;
  };

  const InvalidTestCase invalid_inputs[] = {
      // Surrogate pair range
      {0xD800, "HIGH SURROGATE START"},
      {0xD900, "HIGH SURROGATE MIDDLE"},
      {0xDBFF, "HIGH SURROGATE END"},
      {0xDC00, "LOW SURROGATE START"},
      {0xDD00, "LOW SURROGATE MIDDLE"},
      {0xDFFF, "LOW SURROGATE END"},

#if WCHAR_MAX > 0xFFFF
      {0x110000, "Beyond max Unicode"},
#endif
  };

  for (const InvalidTestCase &input : invalid_inputs) {
    const wchar_t wide = static_cast<wchar_t>(input.wc);
    uint8_t flags = LIBC_NAMESPACE::lookup_properties(wide);
    EXPECT_EQ(flags, uint8_t{0}) << input.name << "\n";
  }
}
|
||||
|
||||
// Unicode noncharacters are expected to carry no property flags. Every case
// has a unique name so that a failure message identifies the exact code point.
TEST(LlvmLibcWctypeClassificationUtilsTest, Noncharacters) {
  struct NoncharacterTestCase {
    uint32_t wc;
    const char *name;
  };

  NoncharacterTestCase cases[] = {
      // BMP noncharacters
      {0xFFFE, "BMP NONCHARACTER U+FFFE"},
      {0xFFFF, "BMP NONCHARACTER U+FFFF"},

      // Arabic Presentation Forms noncharacters
      {0xFDD0, "NONCHARACTER U+FDD0"},
      {0xFDD5, "NONCHARACTER U+FDD5"},

#if WCHAR_MAX > 0xFFFF
      // Supplementary plane noncharacters
      {0x1FFFE, "PLANE 1 NONCHARACTER U+1FFFE"},
      {0x2FFFE, "PLANE 2 NONCHARACTER U+2FFFE"},
      {0x3FFFE, "PLANE 3 NONCHARACTER U+3FFFE"},
      {0x10FFFE, "PLANE 16 NONCHARACTER U+10FFFE"},
      {0x10FFFF, "PLANE 16 NONCHARACTER U+10FFFF"},
#endif
  };

  for (const auto &tc : cases) {
    uint8_t props =
        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(tc.wc));
    EXPECT_EQ(props, uint8_t{0}) << tc.name << "\n";
  }
}
|
||||
|
||||
} // namespace
|
||||
3
libc/utils/wctype_utils/classification/__init__.py
Normal file
3
libc/utils/wctype_utils/classification/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
@@ -0,0 +1,308 @@
|
||||
# ===- Generate classification tables for wctype utils -----*- python -*----==#
|
||||
#
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
#
|
||||
# ==------------------------------------------------------------------------==#
|
||||
|
||||
|
||||
from enum import IntFlag
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
from sys import argv
|
||||
|
||||
|
||||
# WARNING: If you modify this enum, you must update the generated C++ enum
# in generate_code as well
class PropertyFlag(IntFlag):
    """Bit flags describing the wctype classes a code point belongs to.

    Each member occupies one bit, so all eight flags fit in a single byte.
    """

    UPPER = 0x01
    LOWER = 0x02
    ALPHA = 0x04
    SPACE = 0x08
    PRINT = 0x10
    BLANK = 0x20
    CNTRL = 0x40
    PUNCT = 0x80
|
||||
|
||||
|
||||
@dataclass
class UnicodeEntry:
    """The first three fields of one UnicodeData.txt record."""

    # Code point, parsed from the hexadecimal first field.
    codepoint: int
    # Second field: the character name (or a "<..., First>"/"<..., Last>" marker).
    name: str
    # Third field: the general category abbreviation, e.g. "Lu" or "Zs".
    category: str


def read_unicode_data(filename: str) -> list[UnicodeEntry]:
    """Parses a semicolon-separated Unicode data file into entries.

    Blank lines, lines starting with "#", and lines with fewer than three
    fields are ignored. Only the first three fields of a record are used.

    Args:
        filename: Path of the file to read (decoded as UTF-8).

    Returns:
        The parsed entries, in file order.

    Raises:
        RuntimeError: If the file does not exist.
        ValueError: If the first field of a record is not hexadecimal.
    """
    parsed: list[UnicodeEntry] = []

    try:
        with open(filename, "r", encoding="utf-8") as unicode_file:
            for raw_line in unicode_file:
                record = raw_line.strip()
                if record and not record.startswith("#"):
                    columns = record.split(";")
                    if len(columns) >= 3:
                        cp_hex, label, general_category = (
                            column.strip() for column in columns[:3]
                        )
                        parsed.append(
                            UnicodeEntry(int(cp_hex, 16), label, general_category)
                        )
    except FileNotFoundError:
        raise RuntimeError(f"Cannot open file: {filename}")

    return parsed
|
||||
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Space separators that are not classified as whitespace in C.UTF-8
NON_WHITESPACE_SPACES = {
    0x00A0,  # NO-BREAK SPACE
    0x2007,  # FIGURE SPACE
    0x202F,  # NARROW NO-BREAK SPACE
}

# Code points of the ASCII digits "0" through "9"
ASCII_DIGITS = set(range(0x30, 0x3A))
|
||||
|
||||
|
||||
def handle_ranges(
    properties: defaultdict[int, int], entries: list[UnicodeEntry]
) -> None:
    """Expands ranges that are listed only by their <..., First> and <..., Last> rows.

    Every code point from a "First" entry up to and including the next "Last"
    entry receives the property value recorded for the "First" code point.

    Args:
        properties: Code point -> property flags mapping; updated in place.
        entries: Parsed entries, in file order.
    """
    range_start: int | None = None
    range_props: int | None = None

    for entry in entries:
        if ", First>" in entry.name:
            range_start = entry.codepoint
            # .get() avoids requiring a defaultdict and does not insert a key.
            range_props = properties.get(entry.codepoint, 0)
        elif (
            ", Last>" in entry.name
            and range_start is not None
            and range_props is not None
        ):
            # Compare against None explicitly: a start code point of 0 or an
            # empty property set (0) is still a valid open range.
            for cp in range(range_start, entry.codepoint + 1):
                properties[cp] = range_props
            range_start = None
            range_props = None
|
||||
|
||||
|
||||
def get_props(entry: UnicodeEntry) -> int:
    """Derives the property flags of one entry from its general category.

    Args:
        entry: The entry to classify.

    Returns:
        The bitwise OR of every PropertyFlag that applies (0 if none does).
    """
    codepoint = entry.codepoint
    category = entry.category
    major = category[0]
    props = 0

    if major == "L":
        props |= PropertyFlag.ALPHA
        if category in ("Lu", "Lt"):
            # Titlecase letters are flagged as uppercase as well.
            props |= PropertyFlag.UPPER
        elif category == "Ll":
            props |= PropertyFlag.LOWER
    elif major == "N":
        # In C.UTF-8, non-ASCII digits/letter-numbers are alpha
        if category in ("Nd", "Nl") and codepoint not in ASCII_DIGITS:
            props |= PropertyFlag.ALPHA
    elif major in ("P", "S"):
        # Symbols are considered punctuation in C.UTF-8
        props |= PropertyFlag.PUNCT
    elif major == "Z":
        if codepoint not in NON_WHITESPACE_SPACES:
            props |= PropertyFlag.SPACE
            # Only space separators are blank; line and paragraph separators
            # are whitespace but not blank.
            if category == "Zs":
                props |= PropertyFlag.BLANK
    elif major == "C":
        if category == "Cc":
            props |= PropertyFlag.CNTRL

    # Print = all except control, unassigned, surrogate, format
    if category not in ("Cc", "Cs", "Cn", "Cf"):
        props |= PropertyFlag.PRINT

    return props
|
||||
|
||||
|
||||
def handle_special_cases(properties: defaultdict[int, int]) -> None:
    """Adds the flags that cannot be derived from UnicodeData.txt categories.

    Args:
        properties: Code point -> property flags mapping; updated in place.
    """
    # ASCII whitespace characters: SPACE, TAB, LINE FEED, CARRIAGE RETURN,
    # VERTICAL TAB, FORM FEED
    for whitespace_cp in (0x0020, 0x0009, 0x000A, 0x000D, 0x000B, 0x000C):
        properties[whitespace_cp] |= PropertyFlag.SPACE

    # Blank: SPACE and TAB
    for blank_cp in (0x0020, 0x0009):
        properties[blank_cp] |= PropertyFlag.BLANK
|
||||
|
||||
|
||||
def parse_unicode_data(entries: list[UnicodeEntry]) -> defaultdict[int, int]:
    """Builds the code point -> property flags mapping for all entries.

    Args:
        entries: Parsed entries, in file order.

    Returns:
        A defaultdict whose missing keys read as 0 (no properties).
    """
    properties: defaultdict[int, int] = defaultdict(int)

    for entry in entries:
        # Surrogate code points are never classified.
        is_surrogate = 0xD800 <= entry.codepoint <= 0xDFFF
        if not is_surrogate:
            properties[entry.codepoint] = get_props(entry)

    # Post-processing: fill First/Last ranges, then apply the fixed overrides.
    handle_ranges(properties, entries)
    handle_special_cases(properties)

    return properties
|
||||
|
||||
|
||||
@dataclass
class StagedLookupTable:
    """Two-stage lookup table over the whole Unicode code space."""

    level1: list[int]  # Maps codepoint >> 8 to level2 offset
    level2: list[int]  # Actual properties


def build_lookup_tables(properties: defaultdict[int, int]) -> StagedLookupTable:
    """Builds two-level lookup tables.

    The code space is cut into blocks of BLOCK_SIZE code points. Identical
    blocks are stored once in level2; level1 holds, for every block number,
    the offset of that block's first element in level2.

    Args:
        properties: Code point -> property flags; missing code points count
            as 0.

    Returns:
        The staged table. Table statistics are also printed to stdout.
    """
    UNICODE_MAX = 0x110000  # One past the last code point (U+10FFFF)
    BLOCK_SIZE = 256
    NUM_BLOCKS = UNICODE_MAX // BLOCK_SIZE

    # Maps block content -> offset of that block in level2
    blocks: dict[tuple[int, ...], int] = {}
    level1: list[int] = []
    level2: list[int] = []

    for block_num in range(NUM_BLOCKS):
        # Derive the base from BLOCK_SIZE so the two can never disagree.
        base = block_num * BLOCK_SIZE
        block_content = tuple(
            properties.get(base + offset, 0) for offset in range(BLOCK_SIZE)
        )

        block_index = blocks.get(block_content)
        if block_index is None:
            # New block - add to level2
            block_index = len(level2)
            blocks[block_content] = block_index
            level2.extend(block_content)

        level1.append(block_index)

    print("Table statistics:")
    print(f" Level 1 entries: {len(level1)}")
    print(f" Level 2 entries: {len(level2)}")
    # Level 1 entries are counted as two bytes each, level 2 entries as one.
    print(f" Size: {len(level1) * 2 + len(level2)} bytes")

    return StagedLookupTable(level1, level2)
|
||||
|
||||
|
||||
def generate_code(lookup_table: StagedLookupTable, llvm_project_root_path: str) -> None:
    """Generates C++ header with lookup tables.

    Writes
    <llvm_project_root_path>/libc/src/__support/wctype/wctype_classification_utils.h,
    replacing any existing file.

    Args:
        lookup_table: The staged tables to serialise.
        llvm_project_root_path: Root directory of the llvm-project checkout.
    """
    level1 = lookup_table.level1
    level2 = lookup_table.level2

    def write_rows(out, values, render, separator) -> None:
        # Emits `values` eleven per line; `separator` follows every element
        # except the very last one of the table.
        for row_start in range(0, len(values), 11):
            out.write(" ")
            for idx in range(row_start, min(row_start + 11, len(values))):
                out.write(render(values[idx]))
                if idx + 1 < len(values):
                    out.write(separator)
            out.write("\n")

    # NOTE(review): the whitespace inside the string templates below is kept
    # exactly as found; confirm it against the desired header formatting.
    # The encoding is explicit so the output does not depend on the platform
    # default, matching how the input file is read.
    with open(
        f"{llvm_project_root_path}/libc/src/__support/wctype/wctype_classification_utils.h",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(
            f"""//===-- Utils for wctype classification functions ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// DO NOT EDIT MANUALLY.
// This file is generated by libc/utils/wctype_utils scripts.

#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H

#include "hdr/stdint_proxy.h"
#include "hdr/types/wchar_t.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/libc_assert.h"
#include "src/__support/CPP/limits.h"

namespace LIBC_NAMESPACE_DECL {{

// Property flags for Unicode categories
enum PropertyFlag : uint8_t {{
UPPER = 1 << 0,
LOWER = 1 << 1,
ALPHA = 1 << 2,
SPACE = 1 << 3,
PRINT = 1 << 4,
BLANK = 1 << 5,
CNTRL = 1 << 6,
PUNCT = 1 << 7,
}};

static_assert({len(level1)} <= cpp::numeric_limits<unsigned short>::max());
static_assert({len(level2)} <= cpp::numeric_limits<unsigned short>::max());

LIBC_INLINE_VAR constexpr uint16_t LEVEL1_SIZE = {len(level1)};
LIBC_INLINE_VAR constexpr uint16_t LEVEL2_SIZE = {len(level2)};

// Level 1 table: indexed by (codepoint >> 8), stores level2 block offsets
LIBC_INLINE_VAR constexpr uint16_t level1[LEVEL1_SIZE] = {{
"""
        )
        # Level 1 offsets: right-aligned decimals, seven characters wide.
        write_rows(f, level1, lambda value: f"{value:7d}", ",")
        f.write(
            f"""}};

// Level 2 table: blocks of 256 property flags
LIBC_INLINE_VAR constexpr uint8_t level2[LEVEL2_SIZE] = {{
"""
        )
        # Level 2 flags: two-digit hexadecimal bytes.
        write_rows(f, level2, lambda value: f"0x{value:02x}", ", ")
        f.write(
            f"""}};

// Returns the Unicode property flag for a given wide character.
LIBC_INLINE constexpr uint8_t lookup_properties(const wchar_t wc) {{
// Out of Unicode range
if (static_cast<uint32_t>(wc) > 0x10FFFF) {{
return 0;
}}

uint16_t l1_idx = static_cast<uint16_t>(wc >> 8);
LIBC_ASSERT(l1_idx < LEVEL1_SIZE);

uint16_t l2_offset = level1[l1_idx];
uint16_t l2_idx = l2_offset + (wc & 0xFF);
LIBC_ASSERT(l2_idx < LEVEL2_SIZE);

return level2[l2_idx];
}}

}} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H

"""
        )
|
||||
@@ -10,6 +10,12 @@
|
||||
|
||||
from conversion.gen_conversion_data import extract_maps_from_unicode_file
|
||||
from conversion.hex_writer import write_hex_conversions
|
||||
from classification.gen_classification_data import (
|
||||
read_unicode_data,
|
||||
parse_unicode_data,
|
||||
build_lookup_tables,
|
||||
generate_code,
|
||||
)
|
||||
from sys import argv
|
||||
from sys import exit
|
||||
|
||||
@@ -31,6 +37,16 @@ def write_wctype_conversion_data(
|
||||
)
|
||||
|
||||
|
||||
def write_wctype_classification_data(
    llvm_project_root_path: str, unicode_data_folder_path: str
) -> None:
    """Generates wctype classification utils

    Reads UnicodeData.txt from `unicode_data_folder_path`, derives the
    property flags, builds the staged lookup tables and hands them, together
    with `llvm_project_root_path`, to generate_code.
    """
    unicode_data_file = f"{unicode_data_folder_path}/UnicodeData.txt"
    lookup_tables = build_lookup_tables(
        parse_unicode_data(read_unicode_data(unicode_data_file))
    )
    generate_code(lookup_tables, llvm_project_root_path)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if len(argv) != 3:
|
||||
print("Codegen: wctype data generator script")
|
||||
@@ -45,6 +61,9 @@ def main() -> None:
|
||||
write_wctype_conversion_data(
|
||||
llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
|
||||
)
|
||||
write_wctype_classification_data(
|
||||
llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
|
||||
)
|
||||
print(f"wctype conversion data is written to {argv[1]}/libc/src/__support/wctype/")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user