mirror of
https://github.com/fosslinux/live-bootstrap.git
synced 2026-03-22 11:06:32 +01:00
libunistring: update to 1.4.1, remove pregenerated files
This commit is contained in:
parent
3ceae57996
commit
b14eef084a
9 changed files with 508 additions and 86 deletions
325
steps/libunistring-1.4.1/files/gen-uninames.py
Normal file
325
steps/libunistring-1.4.1/files/gen-uninames.py
Normal file
|
|
@ -0,0 +1,325 @@
|
|||
# SPDX-FileCopyrightText: 2026 Samuel Tyler <samuel@samuelt.me>
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
### Creation of gnulib's uninames.h from the UnicodeData.txt and NameAliases.txt
|
||||
### tables.
|
||||
|
||||
### Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
|
||||
### Translated into Python by Samuel Tyler, 2026-01-31.
|
||||
###
|
||||
### This program is free software.
|
||||
### It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
### You can redistribute it and/or modify it under either
|
||||
### - the terms of the GNU Lesser General Public License as published
|
||||
### by the Free Software Foundation, either version 3, or (at your
|
||||
### option) any later version, or
|
||||
### - the terms of the GNU General Public License as published by the
|
||||
### Free Software Foundation; either version 2, or (at your option)
|
||||
### any later version, or
|
||||
### - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
###
|
||||
### This program is distributed in the hope that it will be useful,
|
||||
### but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
### Lesser General Public License and the GNU General Public License
|
||||
### for more details.
|
||||
###
|
||||
### You should have received a copy of the GNU Lesser General Public
|
||||
### License and of the GNU General Public License along with this
|
||||
### program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
|
||||
add_comments = False
|
||||
|
||||
class UnicodeChar:
    """One named entry (a character or one of its aliases) of the name table.

    Instances are plain mutable records; ``main`` fills in the two
    ``word_indices*`` fields after construction.
    """

    def __init__(self, index, name):
        # Caller-supplied identity of the entry.
        self.index, self.name = index, name
        # Filled in later: the per-word indices of ``name`` and the offset at
        # which they were written.  Every instance gets its own fresh list.
        self.word_indices, self.word_indices_index = [], 0
|
||||
|
||||
class Range:
    """A run of consecutive code points, described by its two end points.

    ``end_code`` is inclusive and is advanced in place by ``main`` while the
    run keeps growing.
    """

    def __init__(self, index, start_code, end_code):
        # ``index`` is whatever position the caller associates with the first
        # code point of the run.
        self.index = index
        self.start_code, self.end_code = start_code, end_code
|
||||
|
||||
class WordList:
    """Bucket of distinct words, starting out empty.

    ``hashed`` maps a word to a value, ``sorted`` holds the words as a list,
    and ``size`` / ``length`` are counters that start at zero and are set by
    the code that populates the bucket.
    """

    def __init__(self):
        self.hashed, self.sorted = {}, []
        self.size = self.length = 0
|
||||
|
||||
def _split_words(name):
    """Split *name* at every single space.

    Consecutive or leading spaces yield empty words; one trailing empty word
    (from a trailing space or an empty name) is dropped.
    """
    words = name.split(" ")
    if words[-1] == "":
        words.pop()
    return words


def _write_file_header(ostream, outputfile):
    """Write the "generated file" banner and the license comment."""
    ostream.writelines([
        "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
        "/*\n",
        f" * {outputfile}\n",
        " *\n",
        " * Unicode character name table.\n",
        " * Generated automatically by the gen-uninames utility.\n",
        " */\n",
        "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n",
        "\n",
        " This file is free software.\n",
        ' It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".\n',
        " You can redistribute it and/or modify it under either\n",
        " - the terms of the GNU Lesser General Public License as published\n",
        " by the Free Software Foundation, either version 3, or (at your\n",
        " option) any later version, or\n",
        " - the terms of the GNU General Public License as published by the\n",
        " Free Software Foundation; either version 2, or (at your option)\n",
        " any later version, or\n",
        ' - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".\n',
        "\n",
        " This file is distributed in the hope that it will be useful,\n",
        " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
        " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
        " Lesser General Public License and the GNU General Public License\n",
        " for more details.\n",
        "\n",
        " You should have received a copy of the GNU Lesser General Public\n",
        " License and of the GNU General Public License along with this\n",
        " program. If not, see <https://www.gnu.org/licenses/>. */\n",
        "\n",
    ])


def main(inputfile, aliasfile, outputfile):
    """Generate the C name tables in *outputfile*.

    inputfile  -- text file with one ``<hex code>;<name>`` entry per line;
                  everything after the first ``;`` is taken as the name.
    aliasfile  -- text file in the same format holding alias names; blank
                  lines and lines starting with ``#`` are skipped.  A false
                  value (``None`` or ``""``) disables alias processing.
    outputfile -- path of the C header to write (ASCII encoded).

    Raises ValueError for a non-empty line without ``;`` or with a
    non-hexadecimal code, and OSError if a file cannot be opened.
    """
    all_chars = []
    all_chars_hashed = {}
    all_aliases = []
    all_ranges = []
    name_index = 0
    current_range = None

    # Read all characters and names from the input file.
    with open(inputfile, "r", encoding="utf-8") as istream:
        for line in istream:
            line = line.rstrip("\n")
            if not line:
                continue

            code_string, name_string = line.split(";", 1)
            code = int(code_string, 16)

            # Ignore characters whose name starts with "<".
            if name_string.startswith("<"):
                continue

            # Also ignore Hangul syllables; they are treated specially.
            if 0xAC00 <= code <= 0xD7A3:
                continue

            # Also ignore CJK compatibility ideographs; they are treated
            # specially as well.
            if (0xF900 <= code <= 0xFA2D) or (0xFA30 <= code <= 0xFA6A) or \
               (0xFA70 <= code <= 0xFAD9) or (0x2F800 <= code <= 0x2FA1D):
                continue

            # Also ignore variation selectors; they are treated
            # specially as well.
            if (0xFE00 <= code <= 0xFE0F) or (0xE0100 <= code <= 0xE01EF):
                continue

            uc = UnicodeChar(name_index, name_string)
            all_chars.append(uc)
            all_chars_hashed[code] = uc

            # Update the contiguous range, or start a new range.
            if current_range is not None and current_range.end_code + 1 == code:
                current_range.end_code = code
            else:
                if current_range is not None:
                    all_ranges.append(current_range)
                current_range = Range(name_index, code, code)
            name_index += 1

    if current_range is not None:
        all_ranges.append(current_range)

    if aliasfile:
        # Read all characters and names from the alias file.
        with open(aliasfile, "r", encoding="utf-8") as istream:
            for line in istream:
                line = line.rstrip("\n")
                if not line or line.startswith("#"):
                    continue

                code_string, name_string = line.split(";", 1)
                code = int(code_string, 16)

                # Aliases of characters that were skipped above are dropped.
                if code in all_chars_hashed:
                    all_aliases.append(
                        UnicodeChar(all_chars_hashed[code].index, name_string)
                    )

    all_chars_and_aliases = all_chars + all_aliases

    # Split into words, bucketed by word length.  The three seed names make
    # sure the words needed for the UNICODE_CHARNAME_WORD_* defines exist even
    # though the corresponding characters were skipped while reading.
    words_by_length = []
    seed_names = ["HANGUL SYLLABLE", "CJK COMPATIBILITY", "VARIATION"]
    for name in seed_names + [c.name for c in all_chars_and_aliases]:
        for word in _split_words(name):
            while len(word) >= len(words_by_length):
                words_by_length.append(WordList())

            word_list = words_by_length[len(word)]
            if word not in word_list.hashed:
                word_list.hashed[word] = True
                word_list.sorted.append(word)

    # Sort the word lists.
    for word_list in words_by_length:
        word_list.sorted.sort()
        word_list.size = sum(len(w) for w in word_list.sorted)
        word_list.length = len(word_list.sorted)

    # Output the tables.
    with open(outputfile, "w", encoding="ascii") as ostream:
        _write_file_header(ostream, outputfile)

        total_size = sum(wl.size for wl in words_by_length)
        ostream.write(f"static const char unicode_name_words[{total_size}] = {{\n")
        for wl in words_by_length:
            for word in wl.sorted:
                # One line per word: each character in single quotes,
                # followed by a comma.
                chars = "".join(f"'{c}'," for c in word)
                ostream.write(f" {chars}\n")
        ostream.write("};\n")

        total_num_words = sum(wl.length for wl in words_by_length)
        ostream.write(f"#define UNICODE_CHARNAME_NUM_WORDS {total_num_words}\n")

        # unicode_name_by_length: running totals before each length bucket,
        # plus one final entry holding the grand totals.
        ostream.write(
            "static const struct { uint32_t extra_offset; uint16_t ind_offset; } "
            f"unicode_name_by_length[{len(words_by_length) + 1}] = {{\n"
        )
        extra_offset = 0
        ind_offset = 0
        for wl in words_by_length:
            ostream.write(f" {{ {extra_offset}, {ind_offset} }},\n")
            extra_offset += wl.size
            ind_offset += wl.length
        ostream.write(f" {{ {extra_offset}, {ind_offset} }}\n")
        ostream.write("};\n")

        # Assign each word its global index (position in bucket order).
        current_idx = 0
        for wl in words_by_length:
            for word in wl.sorted:
                wl.hashed[word] = current_idx
                current_idx += 1

        # Defines for the words of the specially treated names.
        for word in ["HANGUL", "SYLLABLE", "CJK", "COMPATIBILITY", "VARIATION"]:
            idx = words_by_length[len(word)].hashed[word]
            ostream.write(f"#define UNICODE_CHARNAME_WORD_{word} {idx}\n")

        # Compute the word indices for every character and alias.
        for uc in all_chars_and_aliases:
            indices = [
                words_by_length[len(word)].hashed[word]
                for word in _split_words(uc.name)
            ]
            # NOTE(review): the indices are stored in REVERSE name order here.
            # Confirm against the consumer of unicode_names (gnulib
            # uniname.c) and the original gen-uninames.lisp that this is the
            # intended order; left unchanged to keep the output identical.
            uc.word_indices = list(reversed(indices))

        # Sort the list of characters and aliases by word indices.
        all_chars_and_aliases.sort(key=lambda x: x.word_indices)

        # Output the word indices.
        total_indices = sum(len(uc.word_indices) for uc in all_chars_and_aliases)
        ostream.write(f"static const uint16_t unicode_names[{total_indices}] = {{\n")

        offset = 0
        for uc in all_chars_and_aliases:
            last = len(uc.word_indices) - 1
            # Each entry is 2 * word index, with the low bit set on the last
            # stored word of the name.
            # NOTE(review): verify the flag polarity (last vs. "more words
            # follow") against the C code reading this table.
            packed_indices = [
                val * 2 + (k == last) for k, val in enumerate(uc.word_indices)
            ]
            indices_str = " " + " ".join(f"{val}," for val in packed_indices)
            ostream.write(indices_str)

            if add_comments:
                # Pad so that the comment starts at column 40.
                ostream.write(f"{' ' * (40 - len(indices_str))}/* {uc.name} */")
            ostream.write("\n")

            uc.word_indices_index = offset
            offset += len(uc.word_indices)
        ostream.write("};\n")

        ostream.write(
            "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
            f"unicode_name_to_index[{len(all_chars_and_aliases)}] = {{\n"
        )
        for uc in all_chars_and_aliases:
            content = f" {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
            ostream.write(content)
            if add_comments:
                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
            ostream.write("\n")
        ostream.write("};\n")

        ostream.write(
            "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED "
            f"unicode_index_to_name[{len(all_chars)}] = {{\n"
        )
        for uc in sorted(all_chars, key=lambda c: c.index):
            content = f" {{ 0x{uc.index:04X}, {uc.word_indices_index} }},"
            ostream.write(content)
            if add_comments:
                ostream.write(f"{' ' * (21 - len(content))}/* {uc.name} */")
            ostream.write("\n")
        ostream.write("};\n")

        # Max counts (0 when the input contained no usable names).
        max_len = max((len(uc.name) for uc in all_chars_and_aliases), default=0)
        ostream.write(f"#define UNICODE_CHARNAME_MAX_LENGTH {max_len}\n")

        max_words = max(
            (len(uc.word_indices) for uc in all_chars_and_aliases), default=0
        )
        ostream.write(f"#define UNICODE_CHARNAME_MAX_WORDS {max_words}\n")

        # Ranges: first name index, distance between code point and name
        # index, and number of code points in the run.
        ostream.write(
            "static const struct { uint16_t index; uint32_t gap; uint16_t length; } "
            f"unicode_ranges[{len(all_ranges)}] = {{\n"
        )
        for r in all_ranges:
            ostream.write(
                f" {{ {r.index}, {r.start_code - r.index}, {1 + r.end_code - r.start_code} }},\n"
            )
        ostream.write("};\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Three positional arguments are required; any extra ones are ignored.
    if len(sys.argv) < 4:
        print("Usage: script.py <inputfile> <aliasfile> <outputfile>", file=sys.stderr)
        # Exit non-zero so a mis-invoked build step does not look successful
        # while producing no output file.
        sys.exit(2)
    main(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||
Loading…
Add table
Add a link
Reference in a new issue