live-bootstrap/steps/libunistring-1.4.1/files/gen-uninames.py

# SPDX-FileCopyrightText: 2026 Samuel Tyler <samuel@samuelt.me>
#
# SPDX-License-Identifier: GPL-3.0-or-later

### Creation of gnulib's uninames.h from the UnicodeData.txt and NameAliases.txt
### tables.

### Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
### Translated into Python by Samuel Tyler, 2026-01-31.
###
### This program is free software.
### It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
### You can redistribute it and/or modify it under either
###   - the terms of the GNU Lesser General Public License as published
###     by the Free Software Foundation, either version 3, or (at your
###     option) any later version, or
###   - the terms of the GNU General Public License as published by the
###     Free Software Foundation; either version 2, or (at your option)
###     any later version, or
###   - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
### Lesser General Public License and the GNU General Public License
### for more details.
###
### You should have received a copy of the GNU Lesser General Public
### License and of the GNU General Public License along with this
### program.  If not, see <https://www.gnu.org/licenses/>.

import sys

add_comments = False

class UnicodeChar:
    def __init__(self, index, name):
        self.index = index
        self.name = name
        self.word_indices = []
        self.word_indices_index = 0

class Range:
    def __init__(self, index, start_code, end_code):
        self.index = index
        self.start_code = start_code
        self.end_code = end_code

class WordList:
    def __init__(self):
        self.hashed = {}
        self.sorted = []
        self.size = 0
        self.length = 0

def main(inputfile, aliasfile, outputfile):
    # Local variable initialization
    all_chars = []
    all_chars_hashed = {}
    all_aliases = []
    all_chars_and_aliases = []
    all_ranges = []
    name_index = 0
    current_range = None

    # Read all characters and names from the input file.
    with open(inputfile, "r", encoding="utf-8") as istream:
        for line in istream:
            line = line.strip("\n")
            if not line:
                continue

            code_string, name_string = line.split(";", 1)
            code = int(code_string, 16)

            # Ignore characters whose name starts with "<".
            if name_string.startswith('<'):
                continue

            # Also ignore Hangul syllables; they are treated specially.
            if 0xAC00 <= code <= 0xD7A3:
                continue

            # Also ignore CJK compatibility ideographs; they are treated
            # specially as well.
            if (0xF900 <= code <= 0xFA2D) or (0xFA30 <= code <= 0xFA6A) or \
               (0xFA70 <= code <= 0xFAD9) or (0x2F800 <= code <= 0x2FA1D):
                continue

            # Also ignore variationselectors; they are treated
            # specially as well.
            if (0xFE00 <= code <= 0xFE0F) or (0xE0100 <= code <= 0xE01EF):
                continue

            uc = UnicodeChar(name_index, name_string)
            all_chars.insert(0, uc)
            all_chars_hashed[code] = uc

            # Update the contiguous range, or start a new range.
            if current_range and (current_range.end_code + 1 == code):
                current_range.end_code = code
            else:
                if current_range:
                    all_ranges.insert(0, current_range)
                current_range = Range(name_index, code, code)
            name_index += 1

    all_chars.reverse()
    if current_range:
        all_ranges.insert(0, current_range)
    all_ranges.reverse()

    if aliasfile:
        # Read all characters and names from the alias file.
        with open(aliasfile, "r", encoding="utf-8") as istream:
            for line in istream:
                line = line.strip("\n")
                if not line or line == "" or line.startswith('#'):
                    continue

                code_string, name_string = line.split(";", 1)
                code = int(code_string, 16)

                if code in all_chars_hashed:
                    uc_alias = UnicodeChar(all_chars_hashed[code].index, name_string)
                    all_aliases.insert(0, uc_alias)

    all_aliases.reverse()
    all_chars_and_aliases = all_chars + all_aliases

    # Split into words.
    words_by_length = []
    for name in ["HANGUL SYLLABLE", "CJK COMPATIBILITY", "VARIATION"] + \
        [c.name for c in all_chars_and_aliases]:
        i1 = 0
        while i1 < len(name):
            i2 = name.find(' ', i1)
            if i2 == -1:
                i2 = len(name)

            word = name[i1:i2]
            while len(word) >= len(words_by_length):
                words_by_length.append(WordList())

            word_list = words_by_length[len(word)]
            if word not in word_list.hashed:
                word_list.hashed[word] = True
                word_list.sorted.append(word)

            i1 = i2 + 1

    # Sort the word lists.
    for length in range(len(words_by_length)):
        if not words_by_length[length]:
            words_by_length[length] = WordList()

        word_list = words_by_length[length]
        word_list.sorted.sort()
        word_list.size = sum(len(w) for w in word_list.sorted)
        word_list.length = len(word_list.sorted)

    # Output the tables.
    with open(outputfile, 'w', encoding='ascii') as ostream:
        ostream.write("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n")
        ostream.write("/*\n")
        ostream.write(f" * {outputfile}\n")
        ostream.write(" *\n")
        ostream.write(" * Unicode character name table.\n")
        ostream.write(" * Generated automatically by the gen-uninames utility.\n")
        ostream.write(" */\n")
        ostream.write("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n")
        ostream.write("\n")
        ostream.write("   This file is free software.\n")
        ostream.write('   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".\n')
        ostream.write("   You can redistribute it and/or modify it under either\n")
        ostream.write("     - the terms of the GNU Lesser General Public License as published\n")
        ostream.write("       by the Free Software Foundation, either version 3, or (at your\n")
        ostream.write("       option) any later version, or\n")
        ostream.write("     - the terms of the GNU General Public License as published by the\n")
        ostream.write("       Free Software Foundation; either version 2, or (at your option)\n")
        ostream.write("       any later version, or\n")
        ostream.write('     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".\n')
        ostream.write("\n")
        ostream.write("   This file is distributed in the hope that it will be useful,\n")
        ostream.write("   but WITHOUT ANY WARRANTY; without even the implied warranty of\n")
        ostream.write("   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n")
        ostream.write("   Lesser General Public License and the GNU General Public License\n")
        ostream.write("   for more details.\n")
        ostream.write("\n")
        ostream.write("   You should have received a copy of the GNU Lesser General Public\n")
        ostream.write("   License and of the GNU General Public License along with this\n")
        ostream.write("   program.  If not, see <https://www.gnu.org/licenses/>.  */\n")
        ostream.write("\n")

        total_size = sum(wl.size for wl in words_by_length)
        ostream.write(f"static const char unicode_name_words[{total_size}] = {{\n")

        for wl in words_by_length:
            for word in wl.sorted:
                # format " ~{ '~C',~}"
                # space before loop, print each char in single quotes followed by comma
                chars = "".join(f"'{c}'," for c in word)
                ostream.write(f" {chars}\n")
        ostream.write("};\n")

        total_num_words = sum(wl.length for wl in words_by_length)
        ostream.write(f"#define UNICODE_CHARNAME_NUM_WORDS {total_num_words}\n")

        # unicode_name_by_length
        ostream.write(
            "static const struct { uint32_t extra_offset; uint16_t ind_offset; } "
            f"unicode_name_by_length[{len(words_by_length) + 1}] = {{\n"
        )

        extra_offset = 0
        ind_offset = 0
        for wl in words_by_length:
            ostream.write(f"  {{ {extra_offset}, {ind_offset} }},\n")
            extra_offset += wl.size
            ind_offset += wl.length
        ostream.write(f"  {{ {extra_offset}, {ind_offset} }}\n")
        ostream.write("};\n")

        # Assign indices to hashed words
        current_idx = 0
        for wl in words_by_length:
            for word in wl.sorted:
                wl.hashed[word] = current_idx
                current_idx += 1

        # Defines specific words
        for word in ["HANGUL", "SYLLABLE", "CJK", "COMPATIBILITY", "VARIATION"]:
            wlen = len(word)
            idx = words_by_length[wlen].hashed.get(word)
            ostream.write(f"#define UNICODE_CHARNAME_WORD_{word} {idx}\n")

        # Compute word-indices for every unicode-char
        for uc in all_chars_and_aliases:
            indices = []
            i1 = 0
            name = uc.name
            while i1 < len(name):
                i2 = name.find(' ', i1)
                if i2 == -1:
                    i2 = len(name)
                word = name[i1:i2]
                wlen = len(word)
                idx = words_by_length[wlen].hashed[word]
                indices.append(idx)
                i1 = i2 + 1
            uc.word_indices = list(reversed(indices))

        # Sort the list of unicode-chars by word-indices
        all_chars_and_aliases.sort(key=lambda x: x.word_indices)

        # Output the word-indices
        total_indices = sum(len(uc.word_indices) for uc in all_chars_and_aliases)
        ostream.write(f"static const uint16_t unicode_names[{total_indices}] = {{\n")

        i = 0
        for uc in all_chars_and_aliases:
            packed_indices = []
            wi = uc.word_indices
            for k, val in enumerate(wi):
                is_last = (k == len(wi) - 1)
                packed_indices.append(val * 2 + is_last)

            ostream.write(" " + " ".join(f"{val}," for val in packed_indices))

            if add_comments:
                ostream.write(f"{' ' * (40 - len(indices_str))}/* {uc.name} */")
            ostream.write("\n")

            uc.word_indices_index = i
            i += len(uc.word_indices)
        ostream.write("};\n")

        ostream.write(
            "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
            f"unicode_name_to_index[{len(all_chars_and_aliases)}] = {{\n"
        )
        for uc in all_chars_and_aliases:
            content = f"  {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
            ostream.write(content)
            if add_comments:
                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
            ostream.write("\n")
        ostream.write("};\n")

        ostream.write(
            f"static const struct {{ uint16_t index; uint32_t name:24; }} ATTRIBUTE_PACKED "
            f"unicode_index_to_name[{len(all_chars)}] = {{\n"
        )
        for uc in sorted(all_chars, key=lambda c: c.index):
            content = f"  {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
            ostream.write(content)
            if add_comments:
                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
            ostream.write("\n")
        ostream.write("};\n")

        # Max counts
        max_len = max(len(uc.name) for uc in all_chars_and_aliases)
        ostream.write(f"#define UNICODE_CHARNAME_MAX_LENGTH {max_len}\n")

        max_words = max(len(uc.word_indices) for uc in all_chars_and_aliases)
        ostream.write(f"#define UNICODE_CHARNAME_MAX_WORDS {max_words}\n")

        # Ranges
        ostream.write(
            "static const struct { uint16_t index; uint32_t gap; uint16_t length; } "
            f"unicode_ranges[{len(all_ranges)}] = {{\n"
        )
        for r in all_ranges:
            ostream.write(
                f"  {{ {r.index}, {r.start_code - r.index}, {1 + r.end_code - r.start_code} }},\n"
            )
        ostream.write("};\n")


if __name__ == "__main__":
    if len(sys.argv) >= 4:
        main(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        print("Usage: script.py <inputfile> <aliasfile> <outputfile>", file=sys.stderr)