diff mbox series

[v2,12/13] vt: update gen_ucs_width_table.py to make tables more space efficient

Message ID 20250415192212.33949-13-nico@fluxnic.net
State New
Headers show
Series vt: implement proper Unicode handling | expand

Commit Message

Nicolas Pitre April 15, 2025, 7:18 p.m. UTC
From: Nicolas Pitre <npitre@baylibre.com>

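Split the generated table ranges into BMP (code points up to U+FFFF,
which fit in 16 bits) and non-BMP (code points U+10000 and above), and
emit them as separate tables that use 16-bit and 32-bit interval
structures respectively. A range that straddles the boundary is split
at U+FFFF/U+10000.
This reduces the corresponding text size by 20-25%.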

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 drivers/tty/vt/gen_ucs_width_table.py | 55 ++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/drivers/tty/vt/gen_ucs_width_table.py b/drivers/tty/vt/gen_ucs_width_table.py
index 00510444a7..059ed9a8ba 100755
--- a/drivers/tty/vt/gen_ucs_width_table.py
+++ b/drivers/tty/vt/gen_ucs_width_table.py
@@ -194,6 +194,27 @@  def write_tables(zero_width_ranges, double_width_ranges):
         double_width_ranges: List of (start, end) ranges for double-width characters
     """
 
+    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
+    def split_ranges_by_size(ranges):
+        bmp_ranges = []
+        non_bmp_ranges = []
+
+        for start, end in ranges:
+            if end <= 0xFFFF:
+                bmp_ranges.append((start, end))
+            elif start > 0xFFFF:
+                non_bmp_ranges.append((start, end))
+            else:
+                # Split the range at 0xFFFF
+                bmp_ranges.append((start, 0xFFFF))
+                non_bmp_ranges.append((0x10000, end))
+
+        return bmp_ranges, non_bmp_ranges
+
+    # Split ranges into BMP and non-BMP
+    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
+    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)
+
     # Function to generate code point description comments
     def get_code_point_comment(start, end):
         try:
@@ -221,22 +242,44 @@  def write_tables(zero_width_ranges, double_width_ranges):
  * Unicode Version: {unicodedata.unidata_version}
  */
 
-/* Zero-width character ranges */
-static const struct ucs_interval ucs_zero_width_ranges[] = {{
+/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
+static const struct ucs_interval16 ucs_zero_width_bmp_ranges[] = {{
+""")
+
+        for start, end in zero_width_bmp:
+            comment = get_code_point_comment(start, end)
+            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")
+
+        f.write("""\
+};
+
+/* Zero-width character ranges (non-BMP, U+10000 and above) */
+static const struct ucs_interval32 ucs_zero_width_non_bmp_ranges[] = {
 """)
 
-        for start, end in zero_width_ranges:
+        for start, end in zero_width_non_bmp:
             comment = get_code_point_comment(start, end)
             f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")
 
         f.write("""\
 };
 
-/* Double-width character ranges */
-static const struct ucs_interval ucs_double_width_ranges[] = {
+/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
+static const struct ucs_interval16 ucs_double_width_bmp_ranges[] = {
+""")
+
+        for start, end in double_width_bmp:
+            comment = get_code_point_comment(start, end)
+            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")
+
+        f.write("""\
+};
+
+/* Double-width character ranges (non-BMP, U+10000 and above) */
+static const struct ucs_interval32 ucs_double_width_non_bmp_ranges[] = {
 """)
 
-        for start, end in double_width_ranges:
+        for start, end in double_width_non_bmp:
             comment = get_code_point_comment(start, end)
             f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")