libunistring: update to 1.4.1, remove pregenerated files

2026-05-31 22:30:36 +02:00 · 2026-01-26 22:38:07 +11:00 · 2026-01-26 22:38:07 +11:00 · b14eef084a
commit b14eef084a
parent 3ceae57996
9 changed files with 508 additions and 86 deletions
--- a/steps/libunistring-1.4.1/files/gen-uninames.py
+++ b/steps/libunistring-1.4.1/files/gen-uninames.py
@ -0,0 +1,325 @@
+# SPDX-FileCopyrightText: 2026 Samuel Tyler <samuel@samuelt.me>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+ 
+### Creation of gnulib's uninames.h from the UnicodeData.txt and NameAliases.txt
+### tables.
+
+### Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
+### Translated into Python by Samuel Tyler, 2026-01-31.
+###
+### This program is free software.
+### It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+### You can redistribute it and/or modify it under either
+###   - the terms of the GNU Lesser General Public License as published
+###     by the Free Software Foundation, either version 3, or (at your
+###     option) any later version, or
+###   - the terms of the GNU General Public License as published by the
+###     Free Software Foundation; either version 2, or (at your option)
+###     any later version, or
+###   - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+###
+### This program is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+### Lesser General Public License and the GNU General Public License
+### for more details.
+###
+### You should have received a copy of the GNU Lesser General Public
+### License and of the GNU General Public License along with this
+### program.  If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+
+# When True, main() appends a C comment with the character's name to each
+# row of the generated unicode_names, unicode_name_to_index and
+# unicode_index_to_name tables.
+add_comments = False
+
+class UnicodeChar:
+    """A named character (or an alias of one) destined for the name tables.
+
+    Attributes set at construction:
+        index: Number identifying the character; stored unchanged.
+        name:  The character name (or alias name); stored unchanged.
+
+    Attributes that start out empty and are meant to be filled in later:
+        word_indices:       list, initially empty.
+        word_indices_index: integer, initially 0.
+    """
+
+    def __init__(self, index, name):
+        self.index, self.name = index, name
+        # Every instance gets its own fresh list.
+        self.word_indices = list()
+        self.word_indices_index = 0
+
+class Range:
+    """A run of code points described by its first and last code.
+
+    Attributes:
+        index:      Number associated with the first code of the run.
+        start_code: First code of the run.
+        end_code:   Last code of the run (callers may extend it in place).
+    """
+
+    def __init__(self, index, start_code, end_code):
+        self.index = index
+        self.start_code, self.end_code = start_code, end_code
+
+class WordList:
+    """Container for a group of words plus summary counters.
+
+    Attributes (all start out empty / zero):
+        hashed: dict keyed by word.
+        sorted: list of words.
+        size:   integer counter.
+        length: integer counter.
+    """
+
+    def __init__(self):
+        self.hashed, self.sorted = dict(), list()
+        self.size = self.length = 0
+
+def _split_words(name):
+    """Split a character name into its space-separated words.
+
+    A single trailing empty word (produced by an empty name or a name ending
+    in a space) is dropped; any other empty word is kept.
+    """
+    words = name.split(" ")
+    if words[-1] == "":
+        words.pop()
+    return words
+
+
+def main(inputfile, aliasfile, outputfile):
+    """Generate the C tables of Unicode character names.
+
+    Args:
+        inputfile: Path of a UTF-8 text file with one character per line,
+            as semicolon-separated fields: hexadecimal code first, name
+            second.  Any further fields are ignored.
+        aliasfile: Path of a UTF-8 text file in the same layout that lists
+            alias names; lines starting with '#' are skipped.  A false value
+            (such as an empty string) disables alias processing.
+        outputfile: Path of the header file to write, encoded as ASCII.
+
+    Raises:
+        ValueError: If a data line lacks a ';' separator or its first field
+            is not a hexadecimal number.
+        OSError: If a file cannot be opened.
+    """
+    all_chars = []
+    all_chars_hashed = {}
+    all_aliases = []
+    all_ranges = []
+    name_index = 0
+    current_range = None
+
+    # Read all characters and names from the input file.
+    with open(inputfile, "r", encoding="utf-8") as istream:
+        for line in istream:
+            line = line.strip("\n")
+            if not line:
+                continue
+
+            # Only the first two fields matter; taking exactly the second one
+            # keeps any trailing fields out of the name.
+            code_string, name_string, *_ = line.split(";")
+            code = int(code_string, 16)
+
+            # Ignore characters whose name starts with "<".
+            if name_string.startswith("<"):
+                continue
+
+            # Also ignore Hangul syllables; they are treated specially.
+            if 0xAC00 <= code <= 0xD7A3:
+                continue
+
+            # Also ignore CJK compatibility ideographs; they are treated
+            # specially as well.
+            if (0xF900 <= code <= 0xFA2D) or (0xFA30 <= code <= 0xFA6A) or \
+               (0xFA70 <= code <= 0xFAD9) or (0x2F800 <= code <= 0x2FA1D):
+                continue
+
+            # Also ignore variation selectors; they are treated
+            # specially as well.
+            if (0xFE00 <= code <= 0xFE0F) or (0xE0100 <= code <= 0xE01EF):
+                continue
+
+            uc = UnicodeChar(name_index, name_string)
+            all_chars.append(uc)
+            all_chars_hashed[code] = uc
+
+            # Update the contiguous range, or start a new range.
+            if current_range is not None and current_range.end_code + 1 == code:
+                current_range.end_code = code
+            else:
+                if current_range is not None:
+                    all_ranges.append(current_range)
+                current_range = Range(name_index, code, code)
+            name_index += 1
+
+    if current_range is not None:
+        all_ranges.append(current_range)
+
+    if aliasfile:
+        # Read all characters and names from the alias file.
+        with open(aliasfile, "r", encoding="utf-8") as istream:
+            for line in istream:
+                line = line.strip("\n")
+                if not line or line.startswith("#"):
+                    continue
+
+                code_string, name_string, *_ = line.split(";")
+                code = int(code_string, 16)
+
+                # Aliases of characters that were skipped above are dropped.
+                target = all_chars_hashed.get(code)
+                if target is not None:
+                    all_aliases.append(UnicodeChar(target.index, name_string))
+
+    all_chars_and_aliases = all_chars + all_aliases
+
+    # Split into words, grouped by word length.  The three literal names make
+    # sure the words needed for the #defines below are always present.
+    words_by_length = []
+    seed_names = ["HANGUL SYLLABLE", "CJK COMPATIBILITY", "VARIATION"]
+    for name in seed_names + [c.name for c in all_chars_and_aliases]:
+        for word in _split_words(name):
+            while len(word) >= len(words_by_length):
+                words_by_length.append(WordList())
+
+            word_list = words_by_length[len(word)]
+            if word not in word_list.hashed:
+                word_list.hashed[word] = True
+                word_list.sorted.append(word)
+
+    # Sort the word lists.
+    for word_list in words_by_length:
+        word_list.sorted.sort()
+        word_list.size = sum(len(w) for w in word_list.sorted)
+        word_list.length = len(word_list.sorted)
+
+    # Output the tables.
+    with open(outputfile, "w", encoding="ascii") as ostream:
+        ostream.writelines([
+            "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
+            "/*\n",
+            f" * {outputfile}\n",
+            " *\n",
+            " * Unicode character name table.\n",
+            " * Generated automatically by the gen-uninames utility.\n",
+            " */\n",
+            "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n",
+            "\n",
+            "   This file is free software.\n",
+            '   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".\n',
+            "   You can redistribute it and/or modify it under either\n",
+            "     - the terms of the GNU Lesser General Public License as published\n",
+            "       by the Free Software Foundation, either version 3, or (at your\n",
+            "       option) any later version, or\n",
+            "     - the terms of the GNU General Public License as published by the\n",
+            "       Free Software Foundation; either version 2, or (at your option)\n",
+            "       any later version, or\n",
+            '     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".\n',
+            "\n",
+            "   This file is distributed in the hope that it will be useful,\n",
+            "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
+            "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n",
+            "   Lesser General Public License and the GNU General Public License\n",
+            "   for more details.\n",
+            "\n",
+            "   You should have received a copy of the GNU Lesser General Public\n",
+            "   License and of the GNU General Public License along with this\n",
+            "   program.  If not, see <https://www.gnu.org/licenses/>.  */\n",
+            "\n",
+        ])
+
+        # All words, shortest first, as one flat array of characters.
+        total_size = sum(wl.size for wl in words_by_length)
+        ostream.write(f"static const char unicode_name_words[{total_size}] = {{\n")
+
+        for wl in words_by_length:
+            for word in wl.sorted:
+                # One line per word: each character in single quotes,
+                # followed by a comma.
+                chars = "".join(f"'{c}'," for c in word)
+                ostream.write(f" {chars}\n")
+        ostream.write("};\n")
+
+        total_num_words = sum(wl.length for wl in words_by_length)
+        ostream.write(f"#define UNICODE_CHARNAME_NUM_WORDS {total_num_words}\n")
+
+        # unicode_name_by_length: for every word length, the running offsets
+        # into the character array and into the word numbering, plus one
+        # final entry holding the totals.
+        ostream.write(
+            "static const struct { uint32_t extra_offset; uint16_t ind_offset; } "
+            f"unicode_name_by_length[{len(words_by_length) + 1}] = {{\n"
+        )
+
+        extra_offset = 0
+        ind_offset = 0
+        for wl in words_by_length:
+            ostream.write(f"  {{ {extra_offset}, {ind_offset} }},\n")
+            extra_offset += wl.size
+            ind_offset += wl.length
+        ostream.write(f"  {{ {extra_offset}, {ind_offset} }}\n")
+        ostream.write("};\n")
+
+        # Number the words consecutively in output order; the numbers replace
+        # the placeholder values stored in each hashed dict.
+        current_idx = 0
+        for wl in words_by_length:
+            for word in wl.sorted:
+                wl.hashed[word] = current_idx
+                current_idx += 1
+
+        # Defines for specific words.
+        for word in ["HANGUL", "SYLLABLE", "CJK", "COMPATIBILITY", "VARIATION"]:
+            idx = words_by_length[len(word)].hashed.get(word)
+            ostream.write(f"#define UNICODE_CHARNAME_WORD_{word} {idx}\n")
+
+        # Compute the word indices of every name, stored last word first.
+        for uc in all_chars_and_aliases:
+            indices = [
+                words_by_length[len(word)].hashed[word]
+                for word in _split_words(uc.name)
+            ]
+            indices.reverse()
+            uc.word_indices = indices
+
+        # Sort the list of names by their word indices.
+        all_chars_and_aliases.sort(key=lambda x: x.word_indices)
+
+        # Output the word indices.
+        total_indices = sum(len(uc.word_indices) for uc in all_chars_and_aliases)
+        ostream.write(f"static const uint16_t unicode_names[{total_indices}] = {{\n")
+
+        i = 0
+        for uc in all_chars_and_aliases:
+            wi = uc.word_indices
+            last = len(wi) - 1
+            # Each entry is the word index doubled; the lowest bit marks the
+            # final entry of a name.
+            packed_indices = [
+                2 * val + (1 if k == last else 0) for k, val in enumerate(wi)
+            ]
+
+            indices_str = " " + " ".join(f"{val}," for val in packed_indices)
+            ostream.write(indices_str)
+
+            if add_comments:
+                ostream.write(f"{' ' * (40 - len(indices_str))}/* {uc.name} */")
+            ostream.write("\n")
+
+            uc.word_indices_index = i
+            i += len(wi)
+        ostream.write("};\n")
+
+        ostream.write(
+            "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
+            f"unicode_name_to_index[{len(all_chars_and_aliases)}] = {{\n"
+        )
+        for uc in all_chars_and_aliases:
+            content = f"  {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
+            ostream.write(content)
+            if add_comments:
+                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
+            ostream.write("\n")
+        ostream.write("};\n")
+
+        # Aliases are left out of the reverse table.
+        ostream.write(
+            "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
+            f"unicode_index_to_name[{len(all_chars)}] = {{\n"
+        )
+        for uc in sorted(all_chars, key=lambda c: c.index):
+            content = f"  {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
+            ostream.write(content)
+            if add_comments:
+                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
+            ostream.write("\n")
+        ostream.write("};\n")
+
+        # Max counts; default=0 keeps an input without any usable name from
+        # raising ValueError here.
+        max_len = max((len(uc.name) for uc in all_chars_and_aliases), default=0)
+        ostream.write(f"#define UNICODE_CHARNAME_MAX_LENGTH {max_len}\n")
+
+        max_words = max(
+            (len(uc.word_indices) for uc in all_chars_and_aliases), default=0
+        )
+        ostream.write(f"#define UNICODE_CHARNAME_MAX_WORDS {max_words}\n")
+
+        # Ranges: index of the first character, distance between code and
+        # index, and number of codes in the run.
+        ostream.write(
+            "static const struct { uint16_t index; uint32_t gap; uint16_t length; } "
+            f"unicode_ranges[{len(all_ranges)}] = {{\n"
+        )
+        for r in all_ranges:
+            ostream.write(
+                f"  {{ {r.index}, {r.start_code - r.index}, {1 + r.end_code - r.start_code} }},\n"
+            )
+        ostream.write("};\n")
+
+
+if __name__ == "__main__":
+    # All three paths are required; any additional arguments are ignored.
+    if len(sys.argv) < 4:
+        print("Usage: script.py <inputfile> <aliasfile> <outputfile>", file=sys.stderr)
+        # Exit with a failure status so that a calling build step notices
+        # the misuse instead of carrying on without the generated file.
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2], sys.argv[3])