mirror of
https://github.com/fosslinux/live-bootstrap.git
synced 2026-03-02 01:18:08 +01:00
325 lines
12 KiB
Python
325 lines
12 KiB
Python
# SPDX-FileCopyrightText: 2026 Samuel Tyler <samuel@samuelt.me>
|
|
#
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
### Creation of gnulib's uninames.h from the UnicodeData.txt and NameAliases.txt
|
|
### tables.
|
|
|
|
### Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
|
|
### Translated into Python by Samuel Tyler, 2026-01-31.
|
|
###
|
|
### This program is free software.
|
|
### It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
|
|
### You can redistribute it and/or modify it under either
|
|
### - the terms of the GNU Lesser General Public License as published
|
|
### by the Free Software Foundation, either version 3, or (at your
|
|
### option) any later version, or
|
|
### - the terms of the GNU General Public License as published by the
|
|
### Free Software Foundation; either version 2, or (at your option)
|
|
### any later version, or
|
|
### - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
|
|
###
|
|
### This program is distributed in the hope that it will be useful,
|
|
### but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
### Lesser General Public License and the GNU General Public License
|
|
### for more details.
|
|
###
|
|
### You should have received a copy of the GNU Lesser General Public
|
|
### License and of the GNU General Public License along with this
|
|
### program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
import sys
|
|
|
|
# When true, main() appends a "/* NAME */" comment carrying the character
# name to each row of the generated index tables.
add_comments = False
|
|
|
|
class UnicodeChar:
    """One named entry (a character name or an alias) of the name table."""

    def __init__(self, index, name):
        """Store *index* and *name*; the word data starts out empty."""
        # index: the running name index assigned by the reader.
        # name:  the name string exactly as read from the input.
        self.index, self.name = index, name
        # Indices of the words making up the name; filled in later.
        self.word_indices = []
        # Offset of this entry's first word index in the emitted array.
        self.word_indices_index = 0
|
|
|
|
class Range:
    """A run of consecutive code points that all have table entries."""

    def __init__(self, index, start_code, end_code):
        """Record the name index of the first code point and both bounds."""
        self.index = index
        # Both bounds are inclusive.
        self.start_code, self.end_code = start_code, end_code
|
|
|
|
class WordList:
    """The distinct words of one particular length."""

    def __init__(self):
        """Create an empty word list."""
        # Words seen so far, used as a membership/lookup table.
        self.hashed = {}
        # The same words as a list, to be sorted before output.
        self.sorted = []
        # size: total number of characters over all words.
        # length: number of words.
        self.size = self.length = 0
|
|
|
|
def _parse_code_and_name(line):
    """Return ``(code, name)`` for one ``CODE;NAME[;...]`` record.

    The name is the text between the first and the second semicolon (or the
    end of the line when there is no second one), so trailing fields are
    ignored instead of being glued onto the name.

    Raises:
        ValueError: if *line* has no semicolon or CODE is not hexadecimal.
    """
    code_string, name_string = line.split(";", 2)[:2]
    return int(code_string, 16), name_string


def _is_handled_specially(code):
    """Tell whether *code* is in a block that is left out of the tables."""
    return (
        0xAC00 <= code <= 0xD7A3        # Hangul syllables
        or 0xF900 <= code <= 0xFA2D     # CJK compatibility ideographs
        or 0xFA30 <= code <= 0xFA6A
        or 0xFA70 <= code <= 0xFAD9
        or 0x2F800 <= code <= 0x2FA1D
        or 0xFE00 <= code <= 0xFE0F     # variation selectors
        or 0xE0100 <= code <= 0xE01EF
    )


def _split_words(name):
    """Return the space-separated words of *name*, in order.

    An empty final field (empty name, or a name ending in a space) yields no
    word; empty fields elsewhere are kept as empty words.
    """
    words = name.split(" ")
    if not words[-1]:
        words.pop()
    return words


def _write_header(ostream, outputfile):
    """Write the "generated file" banner and the license comment."""
    ostream.writelines([
        "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
        "/*\n",
        f" * {outputfile}\n",
        " *\n",
        " * Unicode character name table.\n",
        " * Generated automatically by the gen-uninames utility.\n",
        " */\n",
        "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n",
        "\n",
        " This file is free software.\n",
        ' It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".\n',
        " You can redistribute it and/or modify it under either\n",
        " - the terms of the GNU Lesser General Public License as published\n",
        " by the Free Software Foundation, either version 3, or (at your\n",
        " option) any later version, or\n",
        " - the terms of the GNU General Public License as published by the\n",
        " Free Software Foundation; either version 2, or (at your option)\n",
        " any later version, or\n",
        ' - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".\n',
        "\n",
        " This file is distributed in the hope that it will be useful,\n",
        " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
        " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
        " Lesser General Public License and the GNU General Public License\n",
        " for more details.\n",
        "\n",
        " You should have received a copy of the GNU Lesser General Public\n",
        " License and of the GNU General Public License along with this\n",
        " program. If not, see <https://www.gnu.org/licenses/>. */\n",
        "\n",
    ])


def _write_index_table(ostream, table_name, chars):
    """Write one ``{ index, offset }`` table named *table_name* for *chars*."""
    ostream.write(
        "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
        f"{table_name}[{len(chars)}] = {{\n"
    )
    for uc in chars:
        content = f" {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
        ostream.write(content)
        if add_comments:
            # Pad to column 21; a negative count simply yields no padding.
            ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
        ostream.write("\n")
    ostream.write("};\n")


def main(inputfile, aliasfile, outputfile):
    """Generate the C name tables (gnulib's uninames.h) from the name files.

    Args:
        inputfile: path of the character list, one ``CODE;NAME`` record per
            line (CODE in hexadecimal); blank lines are skipped.
        aliasfile: path of the alias list in the same record format, where
            blank lines and lines starting with ``#`` are skipped.  A falsy
            value disables alias processing.
        outputfile: path of the C header to write (ASCII).

    Raises:
        ValueError: on a record without a semicolon or with a non-hex code,
            or when no character at all was read.
        OSError: if a file cannot be opened.
    """
    all_chars = []
    all_chars_hashed = {}
    all_ranges = []
    current_range = None

    # Read all characters and names from the input file.
    with open(inputfile, "r", encoding="utf-8") as istream:
        for line in istream:
            line = line.strip("\n")
            if not line:
                continue

            code, name_string = _parse_code_and_name(line)

            # Ignore characters whose name starts with "<", as well as
            # Hangul syllables, CJK compatibility ideographs and variation
            # selectors; they are treated specially.
            if name_string.startswith("<") or _is_handled_specially(code):
                continue

            # Entries are numbered consecutively in reading order.
            name_index = len(all_chars)
            uc = UnicodeChar(name_index, name_string)
            all_chars.append(uc)
            all_chars_hashed[code] = uc

            # Update the contiguous range, or start a new range.
            if current_range is not None and current_range.end_code + 1 == code:
                current_range.end_code = code
            else:
                current_range = Range(name_index, code, code)
                all_ranges.append(current_range)

    all_aliases = []
    if aliasfile:
        # Read all characters and names from the alias file.
        with open(aliasfile, "r", encoding="utf-8") as istream:
            for line in istream:
                line = line.strip("\n")
                if not line or line.startswith("#"):
                    continue

                code, name_string = _parse_code_and_name(line)

                # Aliases of characters that were skipped above are skipped
                # too; an alias shares the index of its character.
                target = all_chars_hashed.get(code)
                if target is not None:
                    all_aliases.append(UnicodeChar(target.index, name_string))

    all_chars_and_aliases = all_chars + all_aliases

    # Split into words, bucketed by word length.  The three extra names make
    # sure the words needed for the UNICODE_CHARNAME_WORD_* macros exist.
    words_by_length = []
    seed_names = ["HANGUL SYLLABLE", "CJK COMPATIBILITY", "VARIATION"]
    for name in seed_names + [c.name for c in all_chars_and_aliases]:
        for word in _split_words(name):
            while len(word) >= len(words_by_length):
                words_by_length.append(WordList())

            word_list = words_by_length[len(word)]
            if word not in word_list.hashed:
                word_list.hashed[word] = True
                word_list.sorted.append(word)

    # Sort the word lists.
    for word_list in words_by_length:
        word_list.sorted.sort()
        word_list.size = sum(len(w) for w in word_list.sorted)
        word_list.length = len(word_list.sorted)

    # Output the tables.
    with open(outputfile, "w", encoding="ascii") as ostream:
        _write_header(ostream, outputfile)

        # All words, shortest first, one character literal per character.
        total_size = sum(wl.size for wl in words_by_length)
        ostream.write(f"static const char unicode_name_words[{total_size}] = {{\n")
        for wl in words_by_length:
            for word in wl.sorted:
                chars = "".join(f"'{c}'," for c in word)
                ostream.write(f" {chars}\n")
        ostream.write("};\n")

        total_num_words = sum(wl.length for wl in words_by_length)
        ostream.write(f"#define UNICODE_CHARNAME_NUM_WORDS {total_num_words}\n")

        # unicode_name_by_length: running character and word offsets at the
        # start of each length bucket, plus one final entry with the totals.
        ostream.write(
            "static const struct { uint32_t extra_offset; uint16_t ind_offset; } "
            f"unicode_name_by_length[{len(words_by_length) + 1}] = {{\n"
        )
        extra_offset = 0
        ind_offset = 0
        for wl in words_by_length:
            ostream.write(f" {{ {extra_offset}, {ind_offset} }},\n")
            extra_offset += wl.size
            ind_offset += wl.length
        ostream.write(f" {{ {extra_offset}, {ind_offset} }}\n")
        ostream.write("};\n")

        # Assign each word its position in the overall output order.
        current_idx = 0
        for wl in words_by_length:
            for word in wl.sorted:
                wl.hashed[word] = current_idx
                current_idx += 1

        # Defines specific words
        for word in ["HANGUL", "SYLLABLE", "CJK", "COMPATIBILITY", "VARIATION"]:
            idx = words_by_length[len(word)].hashed.get(word)
            ostream.write(f"#define UNICODE_CHARNAME_WORD_{word} {idx}\n")

        # Compute word-indices for every unicode-char (stored last word first).
        for uc in all_chars_and_aliases:
            indices = [
                words_by_length[len(word)].hashed[word]
                for word in _split_words(uc.name)
            ]
            indices.reverse()
            uc.word_indices = indices

        # Sort the list of unicode-chars by word-indices
        all_chars_and_aliases.sort(key=lambda x: x.word_indices)

        # Output the word-indices
        total_indices = sum(len(uc.word_indices) for uc in all_chars_and_aliases)
        ostream.write(f"static const uint16_t unicode_names[{total_indices}] = {{\n")

        offset = 0
        for uc in all_chars_and_aliases:
            wi = uc.word_indices
            last = len(wi) - 1
            # Each value is the word index doubled, with the low bit set on
            # the final element of the entry.
            packed_indices = [val * 2 + (k == last) for k, val in enumerate(wi)]

            indices_str = " " + " ".join(f"{val}," for val in packed_indices)
            ostream.write(indices_str)
            if add_comments:
                # Pad to column 40; a negative count simply yields no padding.
                ostream.write(f"{' ' * (40 - len(indices_str))}/* {uc.name} */")
            ostream.write("\n")

            uc.word_indices_index = offset
            offset += len(wi)
        ostream.write("};\n")

        # Name -> index table in sorted-name order, index -> name table in
        # index order (aliases only appear in the former).
        _write_index_table(ostream, "unicode_name_to_index", all_chars_and_aliases)
        _write_index_table(
            ostream,
            "unicode_index_to_name",
            sorted(all_chars, key=lambda c: c.index),
        )

        # Max counts
        max_len = max(len(uc.name) for uc in all_chars_and_aliases)
        ostream.write(f"#define UNICODE_CHARNAME_MAX_LENGTH {max_len}\n")

        max_words = max(len(uc.word_indices) for uc in all_chars_and_aliases)
        ostream.write(f"#define UNICODE_CHARNAME_MAX_WORDS {max_words}\n")

        # Ranges: first name index, distance from name index to code point,
        # and number of code points in the run.
        ostream.write(
            "static const struct { uint16_t index; uint32_t gap; uint16_t length; } "
            f"unicode_ranges[{len(all_ranges)}] = {{\n"
        )
        for r in all_ranges:
            ostream.write(
                f" {{ {r.index}, {r.start_code - r.index}, {1 + r.end_code - r.start_code} }},\n"
            )
        ostream.write("};\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line: <inputfile> <aliasfile> <outputfile>.  Any further
    # arguments are ignored.
    if len(sys.argv) >= 4:
        main(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        print("Usage: script.py <inputfile> <aliasfile> <outputfile>", file=sys.stderr)
        # Report the usage error through the exit status as well, so that a
        # calling build script does not mistake it for success.
        sys.exit(2)
|