diff mbox series

[09/11] vt: update gen_ucs_width.py to produce more space efficient tables

Message ID 20250410011839.64418-10-nico@fluxnic.net
State New
Headers show
Series vt: implement proper Unicode handling | expand

Commit Message

Nicolas Pitre April 10, 2025, 1:14 a.m. UTC
From: Nicolas Pitre <npitre@baylibre.com>

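Split the generated range tables into BMP entries (code points up to
U+FFFF, stored as 16-bit values) and non-BMP entries (code points above
U+FFFF, stored as 32-bit values).
This reduces the corresponding text size by 20-25%.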

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 drivers/tty/vt/gen_ucs_width.py | 154 +++++++++++++++++++++++---------
 1 file changed, 113 insertions(+), 41 deletions(-)
diff mbox series

Patch

diff --git a/drivers/tty/vt/gen_ucs_width.py b/drivers/tty/vt/gen_ucs_width.py
index 41997fe001..c6cbc93e83 100755
--- a/drivers/tty/vt/gen_ucs_width.py
+++ b/drivers/tty/vt/gen_ucs_width.py
@@ -132,13 +132,49 @@  def generate_ucs_width():
         ranges.append((start, prev))
         return ranges
 
+    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
+    def split_ranges_by_size(ranges):
+        bmp_ranges = []
+        non_bmp_ranges = []
+
+        for start, end in ranges:
+            if end <= 0xFFFF:
+                bmp_ranges.append((start, end))
+            elif start > 0xFFFF:
+                non_bmp_ranges.append((start, end))
+            else:
+                # Split the range at 0xFFFF
+                bmp_ranges.append((start, 0xFFFF))
+                non_bmp_ranges.append((0x10000, end))
+
+        return bmp_ranges, non_bmp_ranges
+
     # Extract ranges for each width
     zero_width_ranges = ranges_optimize(width_map, 0)
     double_width_ranges = ranges_optimize(width_map, 2)
 
+    # Split ranges into BMP and non-BMP
+    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
+    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)
+
     # Get Unicode version information
     unicode_version = unicodedata.unidata_version
 
+    # Function to generate code point description comments
+    def get_code_point_comment(start, end):
+        try:
+            start_char_desc = unicodedata.name(chr(start))
+            if start == end:
+                return f"/* {start_char_desc} */"
+            else:
+                end_char_desc = unicodedata.name(chr(end))
+                return f"/* {start_char_desc} - {end_char_desc} */"
+        except:
+            if start == end:
+                return f"/* U+{start:04X} */"
+            else:
+                return f"/* U+{start:04X} - U+{end:04X} */"
+
     # Generate C implementation file
     with open(c_file, 'w') as f:
         f.write(f"""\
@@ -156,62 +192,77 @@  def generate_ucs_width():
 #include <linux/bsearch.h>
 #include <linux/consolemap.h>
 
-struct interval {{
+struct interval16 {{
+	uint16_t first;
+	uint16_t last;
+}};
+
+struct interval32 {{
 	uint32_t first;
 	uint32_t last;
 }};
 
-/* Zero-width character ranges */
-static const struct interval zero_width_ranges[] = {{
+/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
+static const struct interval16 zero_width_bmp[] = {{
 """)
 
-        for start, end in zero_width_ranges:
-            try:
-                start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
-                if start == end:
-                    comment = f"/* {start_char_desc} */"
-                else:
-                    end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
-                    comment = f"/* {start_char_desc} - {end_char_desc} */"
-            except:
-                if start == end:
-                    comment = f"/* U+{start:05X} */"
-                else:
-                    comment = f"/* U+{start:05X} - U+{end:05X} */"
+        for start, end in zero_width_bmp:
+            comment = get_code_point_comment(start, end)
+            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")
+
+        f.write("""\
+};
 
+/* Zero-width character ranges (non-BMP, U+10000 and above) */
+static const struct interval32 zero_width_non_bmp[] = {
+""")
+
+        for start, end in zero_width_non_bmp:
+            comment = get_code_point_comment(start, end)
             f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")
 
         f.write("""\
 };
 
-/* Double-width character ranges */
-static const struct interval double_width_ranges[] = {
+/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
+static const struct interval16 double_width_bmp[] = {
 """)
 
-        for start, end in double_width_ranges:
-            try:
-                start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
-                if start == end:
-                    comment = f"/* {start_char_desc} */"
-                else:
-                    end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
-                    comment = f"/* {start_char_desc} - {end_char_desc} */"
-            except:
-                if start == end:
-                    comment = f"/* U+{start:05X} */"
-                else:
-                    comment = f"/* U+{start:05X} - U+{end:05X} */"
+        for start, end in double_width_bmp:
+            comment = get_code_point_comment(start, end)
+            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")
+
+        f.write("""\
+};
 
+/* Double-width character ranges (non-BMP, U+10000 and above) */
+static const struct interval32 double_width_non_bmp[] = {
+""")
+
+        for start, end in double_width_non_bmp:
+            comment = get_code_point_comment(start, end)
             f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")
 
         f.write("""\
 };
 
 
-static int ucs_cmp(const void *key, const void *element)
+static int ucs_cmp16(const void *key, const void *element)
+{
+	uint16_t cp = *(uint16_t *)key;
+	const struct interval16 *e = element;
+
+	if (cp > e->last)
+		return 1;
+	if (cp < e->first)
+		return -1;
+	return 0;
+}
+
+static int ucs_cmp32(const void *key, const void *element)
 {
 	uint32_t cp = *(uint32_t *)key;
-	const struct interval *e = element;
+	const struct interval32 *e = element;
 
 	if (cp > e->last)
 		return 1;
@@ -220,13 +271,22 @@  static int ucs_cmp(const void *key, const void *element)
 	return 0;
 }
 
-static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t count)
+static bool is_in_interval16(uint16_t cp, const struct interval16 *intervals, size_t count)
 {
 	if (cp < intervals[0].first || cp > intervals[count - 1].last)
 		return false;
 
 	return __inline_bsearch(&cp, intervals, count,
-				sizeof(*intervals), ucs_cmp) != NULL;
+				sizeof(*intervals), ucs_cmp16) != NULL;
+}
+
+static bool is_in_interval32(uint32_t cp, const struct interval32 *intervals, size_t count)
+{
+	if (cp < intervals[0].first || cp > intervals[count - 1].last)
+		return false;
+
+	return __inline_bsearch(&cp, intervals, count,
+				sizeof(*intervals), ucs_cmp32) != NULL;
 }
 
 /**
@@ -237,7 +297,9 @@  static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t
  */
 bool ucs_is_zero_width(uint32_t cp)
 {
-	return is_in_interval(cp, zero_width_ranges, ARRAY_SIZE(zero_width_ranges));
+	return (cp <= 0xFFFF)
+	       ? is_in_interval16(cp, zero_width_bmp, ARRAY_SIZE(zero_width_bmp))
+	       : is_in_interval32(cp, zero_width_non_bmp, ARRAY_SIZE(zero_width_non_bmp));
 }
 
 /**
@@ -248,17 +310,27 @@  bool ucs_is_zero_width(uint32_t cp)
  */
 bool ucs_is_double_width(uint32_t cp)
 {
-	return is_in_interval(cp, double_width_ranges, ARRAY_SIZE(double_width_ranges));
+	return (cp <= 0xFFFF)
+	       ? is_in_interval16(cp, double_width_bmp, ARRAY_SIZE(double_width_bmp))
+	       : is_in_interval32(cp, double_width_non_bmp, ARRAY_SIZE(double_width_non_bmp));
 }
 """)
 
     # Print summary
-    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
-    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
+    zero_width_bmp_count = sum(end - start + 1 for start, end in zero_width_bmp)
+    zero_width_non_bmp_count = sum(end - start + 1 for start, end in zero_width_non_bmp)
+    double_width_bmp_count = sum(end - start + 1 for start, end in double_width_bmp)
+    double_width_non_bmp_count = sum(end - start + 1 for start, end in double_width_non_bmp)
+
+    total_zero_width = zero_width_bmp_count + zero_width_non_bmp_count
+    total_double_width = double_width_bmp_count + double_width_non_bmp_count
 
     print(f"Generated {c_file} with:")
-    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
-    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
+    print(f"- {len(zero_width_bmp)} zero-width BMP ranges (16-bit) covering ~{zero_width_bmp_count} code points")
+    print(f"- {len(zero_width_non_bmp)} zero-width non-BMP ranges (32-bit) covering ~{zero_width_non_bmp_count} code points")
+    print(f"- {len(double_width_bmp)} double-width BMP ranges (16-bit) covering ~{double_width_bmp_count} code points")
+    print(f"- {len(double_width_non_bmp)} double-width non-BMP ranges (32-bit) covering ~{double_width_non_bmp_count} code points")
+    print(f"Total: {len(zero_width_bmp) + len(zero_width_non_bmp) + len(double_width_bmp) + len(double_width_non_bmp)} ranges covering ~{total_zero_width + total_double_width} code points")
 
 if __name__ == "__main__":
     generate_ucs_width()