Prepare perl 5.18.4 for 5.22

This commit is contained in:
Samuel Tyler 2025-08-31 11:52:18 +10:00
parent 863406ba04
commit dcce75dd79
8 changed files with 681 additions and 1 deletions

View file

@ -34,6 +34,10 @@ src_prepare() {
perl regen/unicode_constants.pl
perl regen/regcharclass.pl
# Rename the inversion list directly; patching the generator script is not easy.
# The new name is required by 0005-Move-an-inversion-list-generation-to-mktables.patch
sed -i "s/_Perl_Multi_Char_Folds_invlist/_Perl_Folds_To_Multi_Char_invlist/" charclass_invlists.h
# regenerate configure
ln -s ../metaconfig*/.package .
ln -s ../metaconfig*/U .

View file

@ -1,4 +1,4 @@
SPDX-FileCopyrightText: 2012 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
SPDX-License-Identifier: Artistic-1.0

View file

@ -0,0 +1,219 @@
SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
SPDX-License-Identifier: Artistic-1.0
This function is required for 5.22.
From 6bf3612f9c9b0788de8adf06539b41c64695c014 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 27 Jan 2015 15:08:08 -0700
Subject: [PATCH] Unicode::UCD: Add prop_values() function
This new function returns the input property's possible values.
---
lib/Unicode/UCD.pm | 125 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 132 insertions(+), 23 deletions(-)
diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
index 9c3dd7c710..7033128ae5 100644
--- perl-5.18.4/lib/Unicode/UCD.pm
+++ perl-5.18.4/lib/Unicode/UCD.pm
@@ -22,6 +20,7 @@ our @EXPORT_OK = qw(charinfo
num
prop_aliases
prop_value_aliases
+ prop_values
prop_invlist
prop_invmap
MAX_CP
@@ -73,6 +72,9 @@ Unicode::UCD - Unicode character database
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+ use Unicode::UCD 'prop_values';
+ my @all_EA_short_names = prop_values("East_Asian_Width");
+
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
@@ -730,6 +732,9 @@ names>).
L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
different type of data structure.
+L<prop_values("Block")|/prop_values()> can be used to get all
+the known new-style block names as a list, without the code point ranges.
+
See also L</Blocks versus Scripts>.
=cut
@@ -752,6 +757,9 @@ the values.
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
different type of data structure.
+L<C<prop_values("Script")>|/prop_values()> can be used to get all
+the known script names as a list, without the code point ranges.
+
See also L</Blocks versus Scripts>.
=cut
@@ -835,8 +843,9 @@ from the long names to the short names. The general category is the
one returned from
L</charinfo()> under the C<category> key.
-The L</prop_value_aliases()> function can be used to get all the synonyms of
-the category name.
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
+alternative to this function; the first returning a simple list of the short
+category names; and the second gets all the synonyms of a given category name.
=cut
@@ -880,8 +889,10 @@ the Unicode TR9 is recommended reading:
L<http://www.unicode.org/reports/tr9/>
(as of Unicode 5.0.0)
-The L</prop_value_aliases()> function can be used to get all the synonyms of
-the bidi type name.
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
+alternative to this function; the first returning a simple list of the short
+bidi type names; and the second gets all the synonyms of a given bidi type
+name.
=cut
@@ -1864,6 +1875,79 @@ sub prop_aliases ($) {
=pod
+=head2 B<prop_values()>
+
+ use Unicode::UCD 'prop_values';
+
+ print "AHex values are: ", join(", ", prop_values("AHex")),
+ "\n";
+ prints:
+ AHex values are: N, Y
+
+Some Unicode properties have a restricted set of legal values. For example,
+all binary properties are restricted to just C<true> or C<false>; and there
+are only a few dozen possible General Categories. Use C<prop_values>
+to find out if a given property is one such, and if so, to get a list of the
+values:
+
+ print join ", ", prop_values("NFC_Quick_Check");
+ prints:
+ M, N, Y
+
+If the property doesn't have such a restricted set, C<undef> is returned.
+
+There are usually several synonyms for each possible value. Use
+L</prop_value_aliases()> to access those.
+
+Case, white space, hyphens, and underscores are ignored in the input property
+name (except for the trailing underscore in the old-form grandfathered-in
+general category property value C<"L_">, which is better written as C<"LC">).
+
+If the property name is unknown, C<undef> is returned. Note that Perl typically
+recognizes property names in regular expressions with an optional C<"Is_">
+(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
+This function does not recognize those in the property parameter, returning
+C<undef>.
+
+For the block property, new-style block names are returned (see
+L</Old-style versus new-style block names>).
+
+C<prop_values> does not know about any user-defined properties, and
+will return C<undef> if called with one of those.
+
+=cut
+
+# These are created by mktables for this module and stored in unicore/UCD.pl
+# where their structures are described.
+our %loose_to_standard_value;
+our %prop_value_aliases;
+
+sub prop_values ($) {
+ my $prop = shift;
+ return undef unless defined $prop;
+
+ require "unicore/UCD.pl";
+ require "utf8_heavy.pl";
+
+ # Find the property name synonym that's used as the key in other hashes,
+ # which is element 0 in the returned list.
+ ($prop) = prop_aliases($prop);
+ return undef if ! $prop;
+ $prop = utf8::_loose_name(lc $prop);
+
+ # Here is a legal property.
+ return undef unless exists $prop_value_aliases{$prop};
+ my @return;
+ foreach my $value_key (sort { lc $a cmp lc $b }
+ keys %{$prop_value_aliases{$prop}})
+ {
+ push @return, $prop_value_aliases{$prop}{$value_key}[0];
+ }
+ return @return;
+}
+
+=pod
+
=head2 B<prop_value_aliases()>
use Unicode::UCD 'prop_value_aliases';
@@ -1877,7 +1961,7 @@ sub prop_aliases ($) {
print "The short name is $short_name\n";
print "The other aliases are: ", join(", ", @other_names), "\n";
- prints:
+ prints:
The full name is Punctuation
The short name is P
The other aliases are: Punct
@@ -1886,18 +1970,20 @@ Some Unicode properties have a restricted set of legal values. For example,
all binary properties are restricted to just C<true> or C<false>; and there
are only a few dozen possible General Categories.
-For such properties, there are usually several synonyms for each possible
-value. For example, in binary properties, I<truth> can be represented by any of
-the strings "Y", "Yes", "T", or "True"; and the General Category
-"Punctuation" by that string, or "Punct", or simply "P".
+You can use L</prop_values()> to find out if a given property is one which has
+a restricted set of values, and if so, what those values are. But usually
+each value actually has several synonyms. For example, in binary properties,
+I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True";
+and the General Category "Punctuation" by that string, or "Punct", or simply
+"P".
Like property names, there is typically at least a short name for each such
-property-value, and a long name. If you know any name of the property-value,
-you can use C<prop_value_aliases>() to get the long name (when called in
-scalar context), or a list of all the names, with the short name in the 0th
-element, the long name in the next element, and any other synonyms in the
-remaining elements, in no particular order, except that any all-numeric
-synonyms will be last.
+property-value, and a long name. If you know any name of the property-value
+(which you can get by L</prop_values()>), you can use C<prop_value_aliases>()
+to get the long name (when called in scalar context), or a list of all the
+names, with the short name in the 0th element, the long name in the next
+element, and any other synonyms in the remaining elements, in no particular
+order, except that any all-numeric synonyms will be last.
The long name is returned in a form nicely capitalized, suitable for printing.
@@ -1926,11 +2012,6 @@ will return C<undef> if called with one of those.
=cut
-# These are created by mktables for this routine and stored in unicore/UCD.pl
-# where their structures are described.
-our %loose_to_standard_value;
-our %prop_value_aliases;
-
sub prop_value_aliases ($$) {
my ($prop, $value) = @_;
return unless defined $prop && defined $value;
--
2.49.1

View file

@ -0,0 +1,68 @@
SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
SPDX-License-Identifier: Artistic-1.0
Again, we need this for 5.22.
From b86e5545be9b2e891e3ee8e2fba38b799b1836fc Mon Sep 17 00:00:00 2001
From: Nicholas Clark <nick@ccl4.org>
Date: Sun, 7 Jul 2013 15:12:42 +0200
Subject: [PATCH] Refactor ExtUtils::Embed to work with miniperl.
Remove the use of FileHandle, which relies on IO, an XS module.
Only load Getopt::Std if it is needed (in xsinit()), to avoid needing to add
Getopt::Std to lib/buildcustomize.pl
Require File::Spec instead of using it, as it exports nothing, so there is no
benefit to using it (but it costs a BEGIN block).
---
lib/ExtUtils/Embed.pm | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
index 9710630e51..758e24139c 100644
--- perl-5.18.4/lib/ExtUtils/Embed.pm
+++ perl-5.18.4/lib/ExtUtils/Embed.pm
@@ -2,10 +2,8 @@ require 5.002;
package ExtUtils::Embed;
require Exporter;
-require FileHandle;
use Config;
-use Getopt::Std;
-use File::Spec;
+require File::Spec;
#Only when we need them
#require ExtUtils::MakeMaker;
@@ -18,7 +16,7 @@ use vars qw(@ISA @EXPORT $VERSION
use strict;
# This is not a dual-life module, so no need for development version numbers
-$VERSION = '1.30';
+$VERSION = '1.31';
@ISA = qw(Exporter);
@EXPORT = qw(&xsinit &ldopts
@@ -54,7 +52,8 @@ sub xsinit {
@mods = @$mods if $mods;
}
else {
- getopts('o:s:');
+ require Getopt::Std;
+ Getopt::Std::getopts('o:s:');
$file = $opt_o if defined $opt_o;
$std = $opt_s if defined $opt_s;
@mods = @ARGV;
@@ -65,7 +64,8 @@ sub xsinit {
$fh = \*STDOUT;
}
else {
- $fh = new FileHandle "> $file";
+ open $fh, '>', $file
+ or die "Can't open '$file': $!";
}
push(@mods, static_ext()) if defined $std;
--
2.49.1

View file

@ -0,0 +1,79 @@
SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
SPDX-License-Identifier: Artistic-1.0
Again, we need this for 5.22.
From 87c2fdd468989b9450aa393c4c715594cbba5711 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <nick@ccl4.org>
Date: Mon, 8 Jul 2013 11:08:12 +0200
Subject: [PATCH] Tweak ExtUtils::Embed's generated C code to be closer to
ExtUtils::Miniperl.
Use #include "..." instead of #include <...> in xsi_header(), and don't add
a trailing newline (and add a newline in xsinit() to compensate).
Use four spaces instead of a tab for indenting.
If there are no extensions and hence no calls to newXS() don't declare file[]
and don't add a trailing newline.
---
lib/ExtUtils/Embed.pm | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
index 9710630e51..52221cba53 100644
--- perl-5.18.4/lib/ExtUtils/Embed.pm
+++ perl-5.18.4/lib/ExtUtils/Embed.pm
@@ -72,7 +72,7 @@ sub xsinit {
@mods = grep(!$seen{$_}++, @mods);
print $fh &xsi_header();
- print $fh "EXTERN_C void xs_init ($xsinit_proto);\n\n";
+ print $fh "\nEXTERN_C void xs_init ($xsinit_proto);\n\n";
print $fh &xsi_protos(@mods);
print $fh "\nEXTERN_C void\nxs_init($xsinit_proto)\n{\n";
@@ -83,9 +83,9 @@ sub xsinit {
sub xsi_header {
return <<EOF;
-#include <EXTERN.h>
-#include <perl.h>
-
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
EOF
}
@@ -109,9 +109,12 @@ sub xsi_body {
my(@exts) = @_;
my($pname,@retval,%seen);
my($dl) = canon('/','DynaLoader');
- push(@retval, "\tchar *file = __FILE__;\n");
- push(@retval, "\tdXSUB_SYS;\n") if $] > 5.002;
- push(@retval, "\n");
+ push(@retval, " static const char file[] = __FILE__;\n")
+ if @exts;
+ push(@retval, " dXSUB_SYS;\n");
+ push(@retval, " PERL_UNUSED_CONTEXT;\n");
+ push(@retval, "\n")
+ if @exts;
foreach $_ (@exts){
my($pname) = canon('/', $_);
@@ -121,10 +124,10 @@ sub xsi_body {
if ($pname eq $dl){
# Must NOT install 'DynaLoader::boot_DynaLoader' as 'bootstrap'!
# boot_DynaLoader is called directly in DynaLoader.pm
- $ccode = "\t/* DynaLoader is a special case */\n\tnewXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
+ $ccode = " /* DynaLoader is a special case */\n newXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
push(@retval, $ccode) unless $seen{$ccode}++;
} else {
- $ccode = "\tnewXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
+ $ccode = " newXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
push(@retval, $ccode) unless $seen{$ccode}++;
}
}
--
2.49.1

View file

@ -0,0 +1,72 @@
SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
SPDX-License-Identifier: Artistic-1.0
This renaming is expected by 5.22. The resulting modifications to
charclass_invlists.h are done in the script.
From c75f24e6fc1fa9676693e4e42f199e542b2b2549 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Thu, 23 Jan 2014 20:34:15 -0700
Subject: [PATCH 1/2] Move an inversion list generation to mktables
Prior to this patch, this was in regen/mk_invlists.pl, but future
commits will want it to also be used by the header generated by
regen/regcharclass.pl, so use a common source so the logic doesn't have
to be duplicated.
---
lib/unicore/mktables | 22 ++++++++++++++++++++++
regcomp.c | 3 ++-
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
index 808760d002..a5c0d8930e 100644
--- perl-5.18.4/lib/unicore/mktables
+++ perl-5.18.4/lib/unicore/mktables
@@ -13321,6 +13321,28 @@ sub compile_perl() {
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
+ my $folds_to_multi_char = $perl->add_match_table(
+ "_Perl_Folds_To_Multi_Char",
+ Description =>
+ "Code points whose fold is a string of more than one character",
+ );
+
+ foreach my $range (property_ref('Case_Folding')->ranges) {
+ my $start = $range->start;
+ my $end = $range->end;
+ $any_folds->add_range($start, $end);
+
+ my @hex_code_points = split " ", $range->value;
+ if (@hex_code_points > 1) {
+ $folds_to_multi_char->add_range($start, $end);
+ }
+
+ foreach my $i (0 .. @hex_code_points - 1) {
+ my $code_point = hex $hex_code_points[$i];
+ $any_folds->add_range($code_point, $code_point);
+ }
+ }
+
my $dt = property_ref('Decomposition_Type');
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
diff --git perl-5.18.4/regcomp.c perl-5.18.4/regcomp.c
index 0841f172e5..e839e3e765 100644
--- perl-5.18.4/regcomp.c
+++ perl-5.18.4/regcomp.c
@@ -5566,7 +5566,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
- PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
+ PL_HasMultiCharFold =
+ _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
}
#endif
--
2.49.1

View file

@ -0,0 +1,199 @@
SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
SPDX-License-Identifier: Artistic-1.0
The _Perl_Any_Folds and _Perl_Problematic_Locale_Folds tables are expected
by 5.22.
Their addition is part of a giga commit (~1500 lines changed), so we take
just that small part of the commit.
From 10d28d6a24e2a3c41270c43203be5502a586b2f9 Mon Sep 17 00:00:00 2001
From: Samuel Tyler <fosslinux@aussies.space>
Date: Thu, 28 Aug 2025 21:54:12 +1000
Subject: [PATCH 2/2] Partial backport: Work properly under UTF-8 LC_CTYPE
locales
[... original commit message follows, mostly irrelevant]
This large (sorry, I couldn't figure out how to meaningfully split it
up) commit causes Perl to fully support LC_CTYPE operations (case
changing, character classification) in UTF-8 locales.
As a side effect it resolves [perl #56820].
The basics are easy, but there were a lot of details, and one
troublesome edge case discussed below.
What essentially happens is that when the locale is changed to a UTF-8
one, a global variable is set TRUE (FALSE when changed to a non-UTF-8
locale). Within the scope of 'use locale', this variable is checked,
and if TRUE, the code that Perl uses for non-locale behavior is used
instead of the code for locale behavior. Since Perl's internal
representation is UTF-8, we get UTF-8 behavior for a UTF-8 locale.
More work had to be done for regular expressions. There are three
cases.
1) The character classes \w, [[:punct:]] needed no extra work, as
the changes fall out from the base work.
2) Strings that are to be matched case-insensitively. These form
EXACTFL regops (nodes). Notice that if such a string contains only
characters above-Latin1 that match only themselves, that the node can be
downgraded to an EXACT-only node, which presents better optimization
possibilities, as we now have a fixed string known at compile time to be
required to be in the target string to match. Similarly if all
characters in the string match only other above-Latin1 characters
case-insensitively, the node can be downgraded to a regular EXACTFU node
(match, folding, using Unicode, not locale, rules). The code changes
for this could be done without accepting UTF-8 locales fully, but there
were edge cases which needed to be handled differently if I stopped
there, so I continued on.
In an EXACTFL node, all such characters are now folded at compile time
(just as before this commit), while the other characters whose folds are
locale-dependent are left unfolded. This means that they have to be
folded at execution time based on the locale in effect at the moment.
Again, this isn't a change from before. The difference is that now some
of the folds that need to be done at execution time (in regexec) are
potentially multi-char. Some of the code in regexec was trivial to
extend to account for this because of existing infrastructure, but the
part dealing with regex quantifiers, had to have more work.
Also the code that joins EXACTish nodes together had to be expanded to
account for the possibility of multi-character folds within locale
handling. This was fairly easy, because it already has infrastructure
to handle these under somewhat different circumstances.
3) In bracketed character classes, represented by ANYOF nodes, a new
inversion list was created giving the characters that should be matched
by this node when the runtime locale is UTF-8. The list is ignored
except under that circumstance. To do this, I created a new ANYOF type
which has an extra SV for the inversion list.
The edge case that caused the most difficulty is folding involving the
MICRO SIGN, U+00B5. It folds to the GREEK SMALL LETTER MU, as does the
GREEK CAPITAL LETTER MU. The MICRO SIGN is the only 0-255 range
character that folds to outside that range. The issue is that it
doesn't naturally fall out that it will match the CAP MU. If we let the
CAP MU fold to the small mu at compile time (which it can because both
are above-Latin1 and so the fold is the same no matter what locale is in
effect), it could appear that the regnode can be downgraded away from
EXACTFL to EXACTFU, but doing so would cause the MICRO SIGN to not case
insensitively match the CAP MU. This could be special cased in regcomp
and regexec, but I wanted to avoid that. Instead the mktables tables
are set up to include the CAP MU as a character whose presence forbids
the downgrading, so the special casing is in mktables, and not in the C
code.
---
lib/unicore/mktables | 79 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 73 insertions(+), 6 deletions(-)
diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
index a5c0d8930e..4b34b3c338 100644
--- perl-5.18.4/lib/unicore/mktables
+++ perl-5.18.4/lib/unicore/mktables
@@ -13321,25 +13321,92 @@ sub compile_perl() {
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
+ my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
+ Description => "Code points that particpate in some fold",
+ );
+ my $loc_problem_folds = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Folds",
+ Description =>
+ "Code points that are in some way problematic under locale",
+ );
+
+ # This allows regexec.c to skip some work when appropriate. Some of the
+ # entries in _Perl_Problematic_Locale_Folds are multi-character folds.
+ my $loc_problem_folds_start = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Foldeds_Start",
+ Description =>
+ "The first character of every sequence in _Perl_Problematic_Locale_Folds",
+ );
+
+ my $cf = property_ref('Case_Folding');
+
+ # Every character 0-255 is problematic because what each folds to depends
+ # on the current locale
+ $loc_problem_folds->add_range(0, 255);
+ $loc_problem_folds_start += $loc_problem_folds;
+
+ # Also problematic are anything these fold to outside the range. Likely
+ # forever the only thing folded to by these outside the 0-255 range is the
+ # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
+ # completely general, which should catch any unexpected changes or errors.
+ # We look at each code point 0-255, and add its fold (including each part
+ # of a multi-char fold) to the list. See the commit message for these
+ # changes for a more complete description of the MU issue.
+ foreach my $range ($loc_problem_folds->ranges) {
+ foreach my $code_point($range->start .. $range->end) {
+ my $fold_range = $cf->containing_range($code_point);
+ next unless defined $fold_range;
+
+ my @hex_folds = split " ", $fold_range->value;
+ my $start_cp = hex $hex_folds[0];
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ next unless $cp > 255; # Already have the < 256 ones
+
+ $loc_problem_folds->add_range($cp, $cp);
+ $loc_problem_folds_start->add_range($start_cp, $start_cp);
+ }
+ }
+ }
+
my $folds_to_multi_char = $perl->add_match_table(
"_Perl_Folds_To_Multi_Char",
Description =>
"Code points whose fold is a string of more than one character",
);
- foreach my $range (property_ref('Case_Folding')->ranges) {
+ # Look through all the known folds to populate these tables.
+ foreach my $range ($cf->ranges) {
my $start = $range->start;
my $end = $range->end;
$any_folds->add_range($start, $end);
- my @hex_code_points = split " ", $range->value;
- if (@hex_code_points > 1) {
+ my @hex_folds = split " ", $range->value;
+ if (@hex_folds > 1) { # Is multi-char fold
$folds_to_multi_char->add_range($start, $end);
}
- foreach my $i (0 .. @hex_code_points - 1) {
- my $code_point = hex $hex_code_points[$i];
- $any_folds->add_range($code_point, $code_point);
+ my $found_locale_problematic = 0;
+
+ # Look at each of the folded-to characters...
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ $any_folds->add_range($cp, $cp);
+
+ # The fold is problematic if any of the folded-to characters is
+ # already considered problematic.
+ if ($loc_problem_folds->contains($cp)) {
+ $loc_problem_folds->add_range($start, $end);
+ $found_locale_problematic = 1;
+ }
+ }
+
+ # If this is a problematic fold, add to the start chars the
+ # folding-from characters and first folded-to character.
+ if ($found_locale_problematic) {
+ $loc_problem_folds_start->add_range($start, $end);
+ my $cp = hex $hex_folds[0];
+ $loc_problem_folds_start->add_range($cp, $cp);
}
}
--
2.49.1

View file

@ -0,0 +1,39 @@
SPDX-FileCopyrightText: 2013 Karl Williamson <public@khwilliamson.com>
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
SPDX-License-Identifier: Artistic-1.0
This renaming is required for 5.22.
It is a manual port of 1fdd5e539a9.
diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
index 9c3dd7c710..2349300626 100644
--- perl-5.18.4/lib/Unicode/UCD.pm
+++ perl-5.18.4/lib/Unicode/UCD.pm
@@ -25,6 +25,7 @@ our @EXPORT_OK = qw(charinfo
prop_invlist
prop_invmap
MAX_CP
+ search_invlist
);
use Carp;
@@ -2261,7 +2261,7 @@ sub prop_invlist ($;$) {
return @invlist;
}
-sub _search_invlist {
+sub search_invlist {
# Find the range in the inversion list which contains a code point; that
# is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no
# such i.
@@ -3411,7 +3411,7 @@ RETRY:
}
# Find the range that the override applies to.
- my $i = _search_invlist(\@invlist, $cp);
+ my $i = search_invlist(\@invlist, $cp);
if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
}