[libc][wctype] Create generation script for classification lookup tables (#172042)

[#172040](https://github.com/llvm/llvm-project/issues/172040) This patch implements the scripts for generating the lookup tables and associated utils for the wctype classification functions. Not all Unicode properties are covered, because not all of them need a lookup table; the rest will be hardcoded. The size of the generated tables is 47.8 KB.
2026-01-06 12:13:28 +01:00
parent 2bfb984a7c
commit 9373dbdc00
11 changed files with 4630 additions and 0 deletions
--- a/libc/.gitignore
+++ b/libc/.gitignore
@@ -1,3 +1,6 @@
 # Sphinx documentation
 docs/_build/
 build/
+
+# Unicode data used for wctype functions
+UnicodeData.txt
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -119,6 +119,10 @@ function(_get_compile_options_from_config output_var)
    list(APPEND config_options "-DLIBC_TRAP_ON_RAISE_FP_EXCEPT")
  endif()

+  if(LIBC_CONF_WCTYPE_MODE)
+    list(APPEND config_options "-DLIBC_CONF_WCTYPE_MODE=${LIBC_CONF_WCTYPE_MODE}")
+  endif()
+
  if(LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT)
    list(APPEND config_options "-DLIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT=${LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT}")
  endif()
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -413,6 +413,7 @@ add_subdirectory(time)
 # Therefore, cannot currently build this on macos in overlay mode
 if(NOT (LIBC_TARGET_OS_IS_DARWIN))
  add_subdirectory(wchar)
+  add_subdirectory(wctype)
 endif()

 add_subdirectory(math)
--- a/libc/src/__support/wctype/CMakeLists.txt
+++ b/libc/src/__support/wctype/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_header_library(
+  wctype_classification_utils
+  HDRS
+    wctype_classification_utils.h
+  DEPENDS
+    libc.hdr.types.wchar_t
+    libc.hdr.stdint_proxy
+    libc.src.__support.macros.attributes
+    libc.src.__support.macros.config
+    libc.src.__support.CPP.limits
+    libc.src.__support.libc_assert
+)
--- a/libc/src/__support/wctype/wctype_classification_utils.h
+++ b/libc/src/__support/wctype/wctype_classification_utils.h
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -293,4 +293,5 @@ add_subdirectory(threads)
 # Cannot currently build this on MacOS in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
  add_subdirectory(wchar)
+  add_subdirectory(wctype)
 endif()
--- a/libc/test/src/__support/wctype/CMakeLists.txt
+++ b/libc/test/src/__support/wctype/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_custom_target(libc-support-wctype-tests)
+
+add_libc_test(
+  wctype_classification_utils_test
+  SUITE
+    libc-support-tests
+  SRCS
+    wctype_classification_utils_test.cpp
+  DEPENDS
+    libc.hdr.stdint_proxy
+    libc.src.__support.wctype.wctype_classification_utils
+)
+
--- a/libc/test/src/__support/wctype/wctype_classification_utils_test.cpp
+++ b/libc/test/src/__support/wctype/wctype_classification_utils_test.cpp
@@ -0,0 +1,540 @@
+//===-- Unittests for wctype classification utils -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/stdint_proxy.h"
+#include "src/__support/wctype/wctype_classification_utils.h"
+#include "test/UnitTest/Test.h"
+
+namespace {
+
+// Some platforms (like Windows) have a 16-bit wchar_t. We guard the cases that
+// do not fit within 16 bits to prevent narrowing conversion and incorrect test
+// results.
+struct TestCase {
+  uint32_t wc;      // Codepoint under test (cast to wchar_t before lookup)
+  const char *name; // Character name, streamed into the failure message
+  bool expected;    // Whether the property flag being tested should be set
+};
+
+// Checks the LOWER bit returned by lookup_properties.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Lower) {
+  // Rows: codepoint, character name (printed on failure), expected LOWER bit.
+  TestCase cases[] = {
+      // ASCII lowercase
+      {0x0061, "LATIN SMALL LETTER A", true},
+      {0x007A, "LATIN SMALL LETTER Z", true},
+
+      // ASCII uppercase
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x005A, "LATIN CAPITAL LETTER Z", false},
+
+      // ASCII non-letters
+      {0x0030, "DIGIT ZERO", false},
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+
+      // Latin Extended lowercase
+      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", true},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
+
+      // Latin Extended uppercase
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", false},
+
+      // Greek lowercase
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x03C9, "GREEK SMALL LETTER OMEGA", true},
+
+      // Greek uppercase
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
+      {0x03A9, "GREEK CAPITAL LETTER OMEGA", false},
+
+      // Cyrillic lowercase
+      {0x0430, "CYRILLIC SMALL LETTER A", true},
+      {0x044F, "CYRILLIC SMALL LETTER YA", true},
+
+      // Cyrillic uppercase
+      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
+      {0x042F, "CYRILLIC CAPITAL LETTER YA", false},
+
+      // Caseless scripts
+      {0x05D0, "HEBREW LETTER ALEF", false},
+      {0x0627, "ARABIC LETTER ALEF", false},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_lower = (props & LIBC_NAMESPACE::PropertyFlag::LOWER) != 0;
+    EXPECT_EQ(is_lower, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the UPPER bit returned by lookup_properties.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Upper) {
+  // Rows: codepoint, character name (printed on failure), expected UPPER bit.
+  TestCase cases[] = {
+      // ASCII lowercase
+      {0x0061, "LATIN SMALL LETTER A", false},
+      {0x007A, "LATIN SMALL LETTER Z", false},
+
+      // ASCII uppercase
+      {0x0041, "LATIN CAPITAL LETTER A", true},
+      {0x005A, "LATIN CAPITAL LETTER Z", true},
+
+      // ASCII non-letters
+      {0x0030, "DIGIT ZERO", false},
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+
+      // Titlecase
+      {0x01C5, "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON", true},
+
+      // Latin Extended lowercase
+      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", false},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", false},
+
+      // Latin Extended uppercase
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", true},
+
+      // Greek lowercase
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x03C9, "GREEK SMALL LETTER OMEGA", false},
+
+      // Greek uppercase
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
+      {0x03A9, "GREEK CAPITAL LETTER OMEGA", true},
+
+      // Cyrillic lowercase
+      {0x0430, "CYRILLIC SMALL LETTER A", false},
+      {0x044F, "CYRILLIC SMALL LETTER YA", false},
+
+      // Cyrillic uppercase
+      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
+      {0x042F, "CYRILLIC CAPITAL LETTER YA", true},
+
+      // Caseless scripts
+      {0x05D0, "HEBREW LETTER ALEF", false},
+      {0x0627, "ARABIC LETTER ALEF", false},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_upper = (props & LIBC_NAMESPACE::PropertyFlag::UPPER) != 0;
+    EXPECT_EQ(is_upper, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the ALPHA bit returned by lookup_properties. Besides letters, the
+// table expects non-ASCII decimal digits and letter-numbers (e.g. Roman
+// numerals) to be alphabetic, while ASCII digits are not.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Alpha) {
+  // Rows: codepoint, character name (printed on failure), expected ALPHA bit.
+  TestCase cases[] = {
+      // ASCII letters
+      {0x0041, "LATIN CAPITAL LETTER A", true},
+      {0x0061, "LATIN SMALL LETTER A", true},
+      {0x005A, "LATIN CAPITAL LETTER Z", true},
+      {0x007A, "LATIN SMALL LETTER Z", true},
+
+      // ASCII non-letters
+      {0x0030, "DIGIT ZERO", false},
+      {0x0039, "DIGIT NINE", false},
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+      {0x007E, "TILDE", false},
+
+      // Modifier letters
+      {0x02B0, "MODIFIER LETTER SMALL H", true},
+
+      // Latin Extended
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
+
+      // Greek
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x03C9, "GREEK SMALL LETTER OMEGA", true},
+
+      // Cyrillic
+      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
+      {0x0430, "CYRILLIC SMALL LETTER A", true},
+      {0x044F, "CYRILLIC SMALL LETTER YA", true},
+
+      // Arabic
+      {0x0627, "ARABIC LETTER ALEF", true},
+      {0x0628, "ARABIC LETTER BEH", true},
+
+      // CJK
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00 (first)", true},
+      {0x4E01, "CJK UNIFIED IDEOGRAPH-4E01", true},
+      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF (last in BMP)", true},
+
+      // Emoji and symbols
+      {0x2764, "HEAVY BLACK HEART", false},
+
+      // Special cases
+      {0x0000, "NULL", false},
+      {0xFFFD, "REPLACEMENT CHARACTER", false},
+
+      // Roman numerals
+      {0x2160, "ROMAN NUMERAL ONE", true},
+      {0x2161, "ROMAN NUMERAL TWO", true},
+      {0x2162, "ROMAN NUMERAL THREE", true},
+      {0x2169, "ROMAN NUMERAL TEN", true},
+      {0x216C, "ROMAN NUMERAL FIFTY", true},
+      {0x216D, "ROMAN NUMERAL ONE HUNDRED", true},
+      {0x216E, "ROMAN NUMERAL FIVE HUNDRED", true},
+      {0x216F, "ROMAN NUMERAL ONE THOUSAND", true},
+
+      // ASCII digits (DIGIT ZERO and DIGIT NINE are covered above)
+      {0x0031, "DIGIT ONE", false},
+
+      // Non ASCII digits
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", true},
+      {0x09e6, "BENGALI DIGIT ZERO", true},
+
+      // Combining marks
+      {0x0300, "COMBINING GRAVE ACCENT", false},
+
+#if WCHAR_MAX > 0xFFFF
+      // Only representable when wchar_t is wider than 16 bits.
+      {0x1F600, "GRINNING FACE", false},
+      {0x20000, "CJK UNIFIED IDEOGRAPH-20000", true},
+#endif
+
+  };
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_alpha = (props & LIBC_NAMESPACE::PropertyFlag::ALPHA) != 0;
+    EXPECT_EQ(is_alpha, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the PUNCT bit returned by lookup_properties. Symbols and math
+// symbols are expected to count as punctuation.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Punct) {
+  // Rows: codepoint, character name (printed on failure), expected PUNCT bit.
+  TestCase cases[] = {
+      // ASCII punctuation
+      {0x0021, "EXCLAMATION MARK", true},
+      {0x0022, "QUOTATION MARK", true},
+      {0x0023, "NUMBER SIGN", true},
+      {0x002C, "COMMA", true},
+      {0x002E, "FULL STOP", true},
+      {0x002F, "SOLIDUS", true},
+      {0x003A, "COLON", true},
+      {0x003B, "SEMICOLON", true},
+      {0x003F, "QUESTION MARK", true},
+      {0x0040, "COMMERCIAL AT", true},
+      {0x005B, "LEFT SQUARE BRACKET", true},
+      {0x005D, "RIGHT SQUARE BRACKET", true},
+      {0x007B, "LEFT CURLY BRACKET", true},
+      {0x007D, "RIGHT CURLY BRACKET", true},
+
+      // ASCII non-punctuation
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x0061, "LATIN SMALL LETTER A", false},
+      {0x0030, "DIGIT ZERO", false},
+      {0x0020, "SPACE", false},
+
+      // Unicode punctuation
+      {0x00A1, "INVERTED EXCLAMATION MARK", true},
+      {0x00BF, "INVERTED QUESTION MARK", true},
+      {0x2013, "EN DASH", true},
+      {0x2014, "EM DASH", true},
+      {0x2018, "LEFT SINGLE QUOTATION MARK", true},
+      {0x2019, "RIGHT SINGLE QUOTATION MARK", true},
+      {0x201C, "LEFT DOUBLE QUOTATION MARK", true},
+      {0x201D, "RIGHT DOUBLE QUOTATION MARK", true},
+      {0x2026, "HORIZONTAL ELLIPSIS", true},
+      {0x2030, "PER MILLE SIGN", true},
+      {0x3001, "IDEOGRAPHIC COMMA", true},
+      {0x3002, "IDEOGRAPHIC FULL STOP", true},
+      {0xFF01, "FULLWIDTH EXCLAMATION MARK", true},
+      {0xFF1F, "FULLWIDTH QUESTION MARK", true},
+
+      // Symbols (treated as punct in C.UTF-8)
+      {0x00A9, "COPYRIGHT SIGN", true},
+      {0x20AC, "EURO SIGN", true},
+      {0x2764, "HEAVY BLACK HEART", true},
+      {0x002B, "PLUS SIGN", true},
+      {0x00B6, "PILCROW SIGN", true},
+      {0x00A7, "SECTION SIGN", true},
+      {0x2022, "BULLET", true},
+      {0x2023, "TRIANGULAR BULLET", true},
+      {0x2020, "DAGGER", true},
+      {0x2021, "DOUBLE DAGGER", true},
+
+      // Math symbols (treated as punct in C.UTF-8)
+      {0x00D7, "MULTIPLICATION SIGN", true},
+      {0x00F7, "DIVISION SIGN", true},
+      {0x2212, "MINUS SIGN", true},
+      {0x221E, "INFINITY", true}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_punct = (props & LIBC_NAMESPACE::PropertyFlag::PUNCT) != 0;
+    EXPECT_EQ(is_punct, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the PRINT bit returned by lookup_properties. Control and format
+// characters are expected to be non-printable; private use codepoints and
+// combining marks are expected to be printable.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Print) {
+  // Rows: codepoint, character name (printed on failure), expected PRINT bit.
+  TestCase cases[] = {
+      // ASCII printable characters
+      {0x0020, "SPACE", true},
+      {0x0021, "EXCLAMATION MARK", true},
+      {0x0030, "DIGIT ZERO", true},
+      {0x0041, "LATIN CAPITAL LETTER A", true},
+      {0x0061, "LATIN SMALL LETTER A", true},
+      {0x007E, "TILDE", true},
+
+      // ASCII control characters
+      {0x0000, "NULL", false},
+      {0x0009, "TAB", false},
+      {0x000A, "LINE FEED", false},
+      {0x000D, "CARRIAGE RETURN", false},
+      {0x001F, "UNIT SEPARATOR", false},
+      {0x007F, "DELETE", false},
+
+      // Non ASCII printable
+      {0x00A0, "NO-BREAK SPACE", true},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
+      {0x0430, "CYRILLIC SMALL LETTER A", true},
+      {0x0627, "ARABIC LETTER ALEF", true},
+      {0x05D0, "HEBREW LETTER ALEF", true},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", true},
+      {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF", true},
+      {0x3042, "HIRAGANA LETTER A", true},
+      {0x30A2, "KATAKANA LETTER A", true},
+      {0xAC00, "HANGUL SYLLABLE GA", true},
+
+      // Emoji and symbols
+      {0x2764, "HEAVY BLACK HEART", true},
+
+      // Punctuation
+      {0x002E, "FULL STOP", true},
+      {0x002C, "COMMA", true},
+      {0x003A, "COLON", true},
+
+      // C1 control characters
+      {0x0080, "PADDING CHARACTER", false},
+      {0x009F, "APPLICATION PROGRAM COMMAND", false},
+
+      {0xFFFD, "REPLACEMENT CHARACTER", true},
+
+      // Format characters
+      {0x00AD, "SOFT HYPHEN", false},
+      {0x200C, "ZERO WIDTH NON-JOINER", false},
+
+      // Combining marks
+      {0x0300, "COMBINING GRAVE ACCENT", true},
+
+      // Private use area (U+E000..U+F8FF). The interior codepoint has no
+      // entry of its own in UnicodeData.txt, so it exercises the
+      // <First>/<Last> range expansion done by the generator.
+      {0xE000, "PRIVATE USE AREA (first)", true},
+      {0xF000, "PRIVATE USE AREA (interior)", true},
+      {0xF8FF, "PRIVATE USE AREA (last)", true},
+
+#if WCHAR_MAX > 0xFFFF
+      // Only representable when wchar_t is wider than 16 bits.
+      {0x10FFFD, "SUPPLEMENTARY PRIVATE USE AREA B", true},
+      {0x1F600, "GRINNING FACE", true},
+#endif
+  };
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_print = (props & LIBC_NAMESPACE::PropertyFlag::PRINT) != 0;
+    EXPECT_EQ(is_print, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the CNTRL bit returned by lookup_properties for the C0 range, DELETE
+// and the C1 range, plus a selection of non-control characters.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Control) {
+  // Rows: codepoint, character name (printed on failure), expected CNTRL bit.
+  TestCase cases[] = {
+      // ASCII control characters
+      {0x0000, "NULL", true},
+      {0x0001, "START OF HEADING", true},
+      {0x0009, "TAB", true},
+      {0x000A, "LINE FEED", true},
+      {0x000D, "CARRIAGE RETURN", true},
+      {0x001B, "ESCAPE", true},
+      {0x001F, "UNIT SEPARATOR", true},
+
+      // ASCII printable characters
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+      {0x0030, "DIGIT ZERO", false},
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x0061, "LATIN SMALL LETTER A", false},
+      {0x007E, "TILDE", false},
+
+      // DELETE character
+      {0x007F, "DELETE", true},
+
+      // C1 control characters
+      {0x0080, "PADDING CHARACTER", true},
+      {0x0081, "HIGH OCTET PRESET", true},
+      {0x0090, "DEVICE CONTROL STRING", true},
+      {0x009F, "APPLICATION PROGRAM COMMAND", true},
+
+      // Non-control characters after C1 range
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x00A1, "INVERTED EXCLAMATION MARK", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+
+      // Letters
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
+      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_cntrl = (props & LIBC_NAMESPACE::PropertyFlag::CNTRL) != 0;
+    EXPECT_EQ(is_cntrl, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the SPACE bit returned by lookup_properties. The no-break spaces
+// (U+00A0, U+2007, U+202F) and NEXT LINE are expected not to be whitespace.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Space) {
+  // Rows: codepoint, character name (printed on failure), expected SPACE bit.
+  TestCase cases[] = {
+      // ASCII whitespace
+      {0x0020, "SPACE", true},
+      {0x0009, "CHARACTER TABULATION (TAB)", true},
+      {0x000A, "LINE FEED", true},
+      {0x000B, "LINE TABULATION", true},
+      {0x000C, "FORM FEED", true},
+      {0x000D, "CARRIAGE RETURN", true},
+
+      // ASCII non-whitespace
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x0030, "DIGIT ZERO", false},
+      {0x0021, "EXCLAMATION MARK", false},
+
+      // Unicode whitespace
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+      {0x2001, "EM QUAD", true},
+      {0x2002, "EN SPACE", true},
+      {0x2003, "EM SPACE", true},
+      {0x2004, "THREE-PER-EM SPACE", true},
+      {0x2005, "FOUR-PER-EM SPACE", true},
+      {0x2006, "SIX-PER-EM SPACE", true},
+      {0x2008, "PUNCTUATION SPACE", true},
+      {0x2009, "THIN SPACE", true},
+      {0x200A, "HAIR SPACE", true},
+      {0x2028, "LINE SEPARATOR", true},
+      {0x2029, "PARAGRAPH SEPARATOR", true},
+      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
+      {0x3000, "IDEOGRAPHIC SPACE", true},
+
+      // Unicode non-whitespace
+      {0x202F, "NARROW NO-BREAK SPACE", false},
+      {0x0085, "NEXT LINE", false},
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2007, "FIGURE SPACE", false},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x2764, "HEAVY BLACK HEART", false}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_space = (props & LIBC_NAMESPACE::PropertyFlag::SPACE) != 0;
+    EXPECT_EQ(is_space, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Checks the BLANK bit returned by lookup_properties. Line-breaking
+// whitespace and the no-break spaces are expected not to be blank.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Blank) {
+  // Rows: codepoint, character name (printed on failure), expected BLANK bit.
+  TestCase cases[] = {
+      // Blank characters
+      {0x0020, "SPACE", true},
+      {0x0009, "CHARACTER TABULATION (TAB)", true},
+
+      // Non-blank whitespace
+      {0x000A, "LINE FEED", false},
+      {0x000D, "CARRIAGE RETURN", false},
+      {0x000B, "LINE TABULATION", false},
+      {0x000C, "FORM FEED", false},
+
+      // Unicode blank characters
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+      {0x2001, "EM QUAD", true},
+      {0x2002, "EN SPACE", true},
+      {0x2003, "EM SPACE", true},
+      {0x2004, "THREE-PER-EM SPACE", true},
+      {0x2005, "FOUR-PER-EM SPACE", true},
+      {0x2006, "SIX-PER-EM SPACE", true},
+      {0x2008, "PUNCTUATION SPACE", true},
+      {0x2009, "THIN SPACE", true},
+      {0x200A, "HAIR SPACE", true},
+      {0x205F, "MEDIUM MATHEMATICAL SPACE", true},
+      {0x3000, "IDEOGRAPHIC SPACE", true},
+
+      // Non-blank characters
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x0030, "DIGIT ZERO", false},
+      {0x0021, "EXCLAMATION MARK", false},
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2007, "FIGURE SPACE", false},
+      {0x202F, "NARROW NO-BREAK SPACE", false},
+      {0x2028, "LINE SEPARATOR", false}};
+
+  for (const TestCase &test_case : cases) {
+    const uint8_t props =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(test_case.wc));
+    const bool is_blank = (props & LIBC_NAMESPACE::PropertyFlag::BLANK) != 0;
+    EXPECT_EQ(is_blank, test_case.expected) << test_case.name << "\n";
+  }
+}
+
+// Surrogates and values beyond U+10FFFF must report no properties at all.
+TEST(LlvmLibcWctypeClassificationUtilsTest, InvalidCodepoints) {
+  struct InvalidTestCase {
+    uint32_t wc;      // Codepoint under test
+    const char *name; // Label streamed into the failure message
+  };
+
+  InvalidTestCase cases[] = {
+      // Surrogate pair range
+      {0xD800, "HIGH SURROGATE START"},
+      {0xD900, "HIGH SURROGATE MIDDLE"},
+      {0xDBFF, "HIGH SURROGATE END"},
+      {0xDC00, "LOW SURROGATE START"},
+      {0xDD00, "LOW SURROGATE MIDDLE"},
+      {0xDFFF, "LOW SURROGATE END"},
+
+#if WCHAR_MAX > 0xFFFF
+      {0x110000, "Beyond max Unicode"},
+#endif
+  };
+
+  for (const InvalidTestCase &invalid : cases) {
+    const wchar_t wide = static_cast<wchar_t>(invalid.wc);
+    uint8_t props = LIBC_NAMESPACE::lookup_properties(wide);
+    EXPECT_EQ(props, uint8_t{0}) << invalid.name << "\n";
+  }
+}
+
+// Unicode noncharacters must report no properties at all.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Noncharacters) {
+  struct NoncharacterTestCase {
+    uint32_t wc;      // Codepoint under test
+    const char *name; // Label streamed into the failure message
+  };
+
+  NoncharacterTestCase cases[] = {
+      // BMP noncharacters
+      {0xFFFE, "BMP NONCHARACTER U+FFFE"},
+      {0xFFFF, "BMP NONCHARACTER U+FFFF"},
+
+      // Arabic Presentation Forms noncharacters
+      {0xFDD0, "NONCHARACTER U+FDD0"},
+      {0xFDD5, "NONCHARACTER U+FDD5"},
+
+#if WCHAR_MAX > 0xFFFF
+      // Supplementary plane noncharacters
+      {0x1FFFE, "PLANE 1 NONCHARACTER"},
+      {0x2FFFE, "PLANE 2 NONCHARACTER"},
+      {0x3FFFE, "PLANE 3 NONCHARACTER"},
+      {0x10FFFE, "PLANE 16 NONCHARACTER"},
+      {0x10FFFF, "PLANE 16 NONCHARACTER"},
+#endif
+  };
+
+  for (const NoncharacterTestCase &nonchar : cases) {
+    const wchar_t wide = static_cast<wchar_t>(nonchar.wc);
+    uint8_t props = LIBC_NAMESPACE::lookup_properties(wide);
+    EXPECT_EQ(props, uint8_t{0}) << nonchar.name << "\n";
+  }
+}
+
+} // namespace
--- a/libc/utils/wctype_utils/classification/init.py
+++ b/libc/utils/wctype_utils/classification/init.py
@@ -0,0 +1,3 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
--- a/libc/utils/wctype_utils/classification/gen_classification_data.py
+++ b/libc/utils/wctype_utils/classification/gen_classification_data.py
@@ -0,0 +1,308 @@
+# ===- Generate classification tables for wctype utils -----*- python -*----==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+
+from enum import IntFlag
+from dataclasses import dataclass
+from collections import defaultdict
+from sys import argv
+
+
+# WARNING: If you modify this enum, you must update the generated C++ enum
+# in generate_code as well
+class PropertyFlag(IntFlag):
+    UPPER = 1 << 0
+    LOWER = 1 << 1
+    ALPHA = 1 << 2
+    SPACE = 1 << 3
+    PRINT = 1 << 4
+    BLANK = 1 << 5
+    CNTRL = 1 << 6
+    PUNCT = 1 << 7
+
+
+@dataclass
+class UnicodeEntry:
+    """One record read from the Unicode data file by read_unicode_data."""
+
+    codepoint: int  # First ';'-separated field, parsed as hexadecimal
+    name: str  # Second field; range markers look like "<..., First>"
+    category: str  # Third field: two-letter general category, e.g. "Lu"
+
+
+def read_unicode_data(filename: str) -> list[UnicodeEntry]:
+    """Reads Unicode data from file and returns list of entries."""
+    entries: list[UnicodeEntry] = []
+
+    try:
+        with open(filename, "r", encoding="utf-8") as file:
+            for line in file:
+                line = line.strip()
+
+                if not line or line.startswith("#"):
+                    continue
+
+                fields = line.split(";")
+
+                if len(fields) < 3:
+                    continue
+
+                codepoint_str = fields[0].strip()
+                name = fields[1].strip()
+                category = fields[2].strip()
+
+                codepoint = int(codepoint_str, 16)
+
+                entries.append(UnicodeEntry(codepoint, name, category))
+
+    except FileNotFoundError:
+        raise RuntimeError(f"Cannot open file: {filename}")
+
+    return entries
+
+
+from dataclasses import dataclass
+
+# Non-whitespace spaces in C.UTF-8
+NON_WHITESPACE_SPACES = {0x00A0, 0x2007, 0x202F}
+
+ASCII_DIGITS = {0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39}
+
+
+def handle_ranges(
+    properties: defaultdict[int, int], entries: list[UnicodeEntry]
+) -> None:
+    """Handles Unicode ranges defined by <First> and <Last>."""
+    range_start: int | None = None
+    range_props: int | None = None
+
+    for entry in entries:
+        if ", First>" in entry.name:
+            range_start = entry.codepoint
+            range_props = properties[entry.codepoint]
+        elif ", Last>" in entry.name and range_start and range_props:
+            for cp in range(range_start, entry.codepoint + 1):
+                properties[cp] = range_props
+            range_start = None
+            range_props = None
+
+
+def get_props(entry: UnicodeEntry) -> int:
+    """Creates the property flag for a given UnicodeEntry."""
+    codepoint = entry.codepoint
+    category = entry.category
+    props = 0
+
+    match category[0]:
+        case "L":
+            props |= PropertyFlag.ALPHA
+            if category in ("Lu", "Lt"):
+                props |= PropertyFlag.UPPER
+            elif category == "Ll":
+                props |= PropertyFlag.LOWER
+
+        case "N":
+            # In C.UTF8, non-ASCII digits/letter-numbers are alpha
+            if category in ("Nd", "Nl") and codepoint not in ASCII_DIGITS:
+                props |= PropertyFlag.ALPHA
+
+        case "P" | "S":
+            # Symbols are considered punctuation in C.UTF8
+            props |= PropertyFlag.PUNCT
+
+        case "Z":
+            if codepoint not in NON_WHITESPACE_SPACES:
+                props |= PropertyFlag.SPACE
+                if category == "Zs":
+                    props |= PropertyFlag.BLANK
+
+        case "C":
+            if category == "Cc":
+                props |= PropertyFlag.CNTRL
+
+    # Print = all except control, unassigned, surrogate, format
+    if category not in ("Cc", "Cs", "Cn", "Cf"):
+        props |= PropertyFlag.PRINT
+
+    return props
+
+
+def handle_special_cases(properties: defaultdict[int, int]) -> None:
+    """Handles special cases not parseable from UnicodeData.txt."""
+    # ASCII whitespace characters
+    properties[0x0020] |= PropertyFlag.SPACE  # SPACE
+    properties[0x0009] |= PropertyFlag.SPACE  # TAB
+    properties[0x000A] |= PropertyFlag.SPACE  # LINE FEED
+    properties[0x000D] |= PropertyFlag.SPACE  # CARRIAGE RETURN
+    properties[0x000B] |= PropertyFlag.SPACE  # VERTICAL TAB
+    properties[0x000C] |= PropertyFlag.SPACE  # FORM FEED
+
+    # Blank
+    properties[0x0020] |= PropertyFlag.BLANK  # SPACE
+    properties[0x0009] |= PropertyFlag.BLANK  # TAB
+
+
+def parse_unicode_data(entries: list[UnicodeEntry]) -> defaultdict[int, int]:
+    """Returns codepoint -> property flag mappings."""
+    properties: defaultdict[int, int] = defaultdict(int)
+
+    for entry in entries:
+        codepoint = entry.codepoint
+
+        # Skip surrogate pairs
+        if 0xD800 <= codepoint <= 0xDFFF:
+            continue
+
+        properties[codepoint] = get_props(entry)
+
+    handle_ranges(properties, entries)
+    handle_special_cases(properties)
+
+    return properties
+
+
+@dataclass
+class StagedLookupTable:
+    """Two-level lookup table produced by build_lookup_tables."""
+
+    level1: list[int]  # Maps codepoint >> 8 to level2 offset
+    level2: list[int]  # Actual properties
+
+
+def build_lookup_tables(properties: defaultdict[int, int]) -> StagedLookupTable:
+    """Builds two-level lookup tables."""
+    UNICODE_MAX = 0x110000
+    BLOCK_SIZE = 256
+    NUM_BLOCKS = UNICODE_MAX // BLOCK_SIZE
+
+    # Maps block content -> block index in level2
+    blocks: defaultdict[tuple[int, ...], int] = defaultdict(int)
+    level1: list[int] = []
+    level2: list[int] = []
+
+    for block_num in range(NUM_BLOCKS):
+        block_content = tuple(
+            properties.get((block_num << 8) | offset, 0) for offset in range(BLOCK_SIZE)
+        )
+
+        if block_content in blocks:
+            # Reuse existing block
+            level1.append(blocks[block_content])
+        else:
+            # New block - add to level2
+            block_index = len(level2)
+            blocks[block_content] = block_index
+
+            level2.extend(block_content)
+            level1.append(block_index)
+
+    print("Table statistics:")
+    print(f"  Level 1 entries: {len(level1)}")
+    print(f"  Level 2 entries: {len(level2)}")
+    print(f"  Size: {len(level1) * 2 + len(level2)} bytes")
+
+    return StagedLookupTable(level1, level2)
+
+
+def generate_code(lookup_table: StagedLookupTable, llvm_project_root_path: str) -> None:
+    """Generates C++ header with lookup tables.
+
+    Writes (overwriting) the file
+    `<llvm_project_root_path>/libc/src/__support/wctype/wctype_classification_utils.h`.
+    The header contains the PropertyFlag enum, the level1 and level2 arrays
+    and the `lookup_properties` function.
+
+    Args:
+        lookup_table: tables to serialize.
+        llvm_project_root_path: path to the root of the llvm-project checkout.
+    """
+    level1 = lookup_table.level1
+    level2 = lookup_table.level2
+
+    with open(
+        f"{llvm_project_root_path}/libc/src/__support/wctype/wctype_classification_utils.h",
+        "w",
+    ) as f:
+        # Header preamble. Doubled braces are literal braces in the f-string.
+        f.write(
+            f"""//===-- Utils for wctype classification functions ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// DO NOT EDIT MANUALLY.
+// This file is generated by libc/utils/wctype_utils scripts.
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
+
+#include "hdr/stdint_proxy.h" 
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/libc_assert.h"
+#include "src/__support/CPP/limits.h"
+
+namespace LIBC_NAMESPACE_DECL {{
+
+// Property flags for Unicode categories
+enum PropertyFlag : uint8_t {{
+  UPPER = 1 << 0,
+  LOWER = 1 << 1,
+  ALPHA = 1 << 2,
+  SPACE = 1 << 3,
+  PRINT = 1 << 4,
+  BLANK = 1 << 5,
+  CNTRL = 1 << 6,
+  PUNCT = 1 << 7,
+}};
+
+static_assert({len(level1)} <= cpp::numeric_limits<unsigned short>::max());
+static_assert({len(level2)} <= cpp::numeric_limits<unsigned short>::max());
+
+LIBC_INLINE_VAR constexpr uint16_t LEVEL1_SIZE = {len(level1)};
+LIBC_INLINE_VAR constexpr uint16_t LEVEL2_SIZE = {len(level2)};
+
+// Level 1 table: indexed by (codepoint >> 8), stores level2 block offsets
+LIBC_INLINE_VAR constexpr uint16_t level1[LEVEL1_SIZE] = {{
+"""
+        )
+        # level1 body: 11 right-aligned decimal values per row, with a comma
+        # after every value except the very last one.
+        for i in range(0, len(level1), 11):
+            f.write("  ")
+            for j in range(i, min(i + 11, len(level1))):
+                f.write(f"{level1[j]:7d}")
+                if j + 1 < len(level1):
+                    f.write(",")
+            f.write("\n")
+        f.write(
+            f"""}};
+
+// Level 2 table: blocks of 256 property flags
+LIBC_INLINE_VAR constexpr uint8_t level2[LEVEL2_SIZE] = {{
+"""
+        )
+        # level2 body: 11 two-digit hex values per row, separated by ", ".
+        for i in range(0, len(level2), 11):
+            f.write("  ")
+            for j in range(i, min(i + 11, len(level2))):
+                f.write(f"0x{level2[j]:02x}")
+                if j + 1 < len(level2):
+                    f.write(", ")
+            f.write("\n")
+        # Close level2, then emit the lookup function and the header footer.
+        f.write(
+            f"""}};
+
+// Returns the Unicode property flag for a given wide character.
+LIBC_INLINE constexpr uint8_t lookup_properties(const wchar_t wc) {{
+  // Out of Unicode range
+  if (static_cast<uint32_t>(wc) > 0x10FFFF) {{
+    return 0;
+  }}
+
+  uint16_t l1_idx = static_cast<uint16_t>(wc >> 8);
+  LIBC_ASSERT(l1_idx < LEVEL1_SIZE);
+
+  uint16_t l2_offset = level1[l1_idx];
+  uint16_t l2_idx = l2_offset + (wc & 0xFF);
+  LIBC_ASSERT(l2_idx < LEVEL2_SIZE);
+
+  return level2[l2_idx];
+}}
+
+}} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_WCTYPE_CLASSIFICATION_UTILS_H
+
+"""
+        )
--- a/libc/utils/wctype_utils/gen.py
+++ b/libc/utils/wctype_utils/gen.py
@@ -10,6 +10,12 @@

 from conversion.gen_conversion_data import extract_maps_from_unicode_file
 from conversion.hex_writer import write_hex_conversions
+from classification.gen_classification_data import (
+    read_unicode_data,
+    parse_unicode_data,
+    build_lookup_tables,
+    generate_code,
+)
 from sys import argv
 from sys import exit

@@ -31,6 +37,16 @@ def write_wctype_conversion_data(
    )


+def write_wctype_classification_data(
+    llvm_project_root_path: str, unicode_data_folder_path: str
+) -> None:
+    """Generates wctype classification utils"""
+    entries = read_unicode_data(f"{unicode_data_folder_path}/UnicodeData.txt")
+    properties = parse_unicode_data(entries)
+    tables = build_lookup_tables(properties)
+    generate_code(tables, llvm_project_root_path)
+
+
 def main() -> None:
    if len(argv) != 3:
        print("Codegen: wctype data generator script")
@@ -45,6 +61,9 @@ def main() -> None:
    write_wctype_conversion_data(
        llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
    )
+    write_wctype_classification_data(
+        llvm_project_root_path=argv[1], unicode_data_folder_path=argv[2]
+    )
    print(f"wctype conversion data is written to {argv[1]}/libc/src/__support/wctype/")