From dcce75dd7996d46cedca57be75cbd5394945d83a Mon Sep 17 00:00:00 2001 From: Samuel Tyler Date: Sun, 31 Aug 2025 11:52:18 +1000 Subject: [PATCH] Prepare perl 5.18.4 for 5.22 --- steps/perl-5.18.4/pass1.sh | 4 + ...-Allow-internal-properties-in-invmap.patch | 2 +- ...Unicode-UCD-Add-prop_values-function.patch | 219 ++++++++++++++++++ ...ExtUtils-Embed-to-work-with-miniperl.patch | 68 ++++++ ...mbed-s-generated-C-code-to-be-closer.patch | 79 +++++++ ...nversion-list-generation-to-mktables.patch | 72 ++++++ ...-Work-properly-under-UTF-8-LC_CTYPE-.patch | 199 ++++++++++++++++ .../patches/Unicode-UCD-search_invlist.patch | 39 ++++ 8 files changed, 681 insertions(+), 1 deletion(-) create mode 100644 steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch create mode 100644 steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch create mode 100644 steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch create mode 100644 steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch create mode 100644 steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch create mode 100644 steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch diff --git a/steps/perl-5.18.4/pass1.sh b/steps/perl-5.18.4/pass1.sh index d4c4d7af..dae2a5c0 100755 --- a/steps/perl-5.18.4/pass1.sh +++ b/steps/perl-5.18.4/pass1.sh @@ -34,6 +34,10 @@ src_prepare() { perl regen/unicode_constants.pl perl regen/regcharclass.pl + # Change the name, patching the generator script is not easy + # Because of 0005-Move-an-inversion-list-generation-to-mktables.patch + sed -i "s/_Perl_Multi_Char_Folds_invlist/_Perl_Folds_To_Multi_Char_invlist/" charclass_invlists.h + # regenerate configure ln -s ../metaconfig*/.package . ln -s ../metaconfig*/U . diff --git a/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch b/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch index 61f0b0b9..db46658f 100644 --- a/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch +++ b/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch @@ -1,4 +1,4 @@ -SPDX-FileCopyrightText: 2012 Karl Williamson +SPDX-FileCopyrightText: 2014 Karl Williamson SPDX-FileCopyrightText: 2025 fosslinux SPDX-License-Identifier: Artistic-1.0 diff --git a/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch b/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch new file mode 100644 index 00000000..bc144b3a --- /dev/null +++ b/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch @@ -0,0 +1,219 @@ +SPDX-FileCopyrightText: 2015 Karl Williamson +SPDX-FileCopyrightText: 2025 fosslinux + +SPDX-License-Identifier: Artistic-1.0 + +This function is required for 5.22. + +From 6bf3612f9c9b0788de8adf06539b41c64695c014 Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Tue, 27 Jan 2015 15:08:08 -0700 +Subject: [PATCH] Unicode::UCD: Add prop_values() function + +This new function returns the input property's possible values. +--- + lib/Unicode/UCD.pm | 125 +++++++++++++++++++++++++++++++++++++-------- + 2 files changed, 132 insertions(+), 23 deletions(-) + +diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm +index 9c3dd7c710..7033128ae5 100644 +--- perl-5.18.4/lib/Unicode/UCD.pm ++++ perl-5.18.4/lib/Unicode/UCD.pm +@@ -22,6 +20,7 @@ our @EXPORT_OK = qw(charinfo + num + prop_aliases + prop_value_aliases ++ prop_values + prop_invlist + prop_invmap + MAX_CP +@@ -73,6 +72,9 @@ Unicode::UCD - Unicode character database + use Unicode::UCD 'prop_value_aliases'; + my @gc_punct_names = prop_value_aliases("Gc", "Punct"); + ++ use Unicode::UCD 'prop_values'; ++ my @all_EA_short_names = prop_values("East_Asian_Width"); ++ + use Unicode::UCD 'prop_invlist'; + my @puncts = prop_invlist("gc=punctuation"); + +@@ -730,6 +732,9 @@ names>). + L can be used to get this same data in a + different type of data structure. + ++L can be used to get all ++the known new-style block names as a list, without the code point ranges. ++ + See also L. + + =cut +@@ -752,6 +757,9 @@ the values. + L can be used to get this same data in a + different type of data structure. + ++L|/prop_values()> can be used to get all ++the known script names as a list, without the code point ranges. ++ + See also L. + + =cut +@@ -835,8 +843,9 @@ from the long names to the short names. The general category is the + one returned from + L under the C key. + +-The L function can be used to get all the synonyms of +-the category name. ++The L and L functions can be used as an ++alternative to this function; the first returning a simple list of the short ++category names; and the second gets all the synonyms of a given category name. + + =cut + +@@ -880,8 +889,10 @@ the Unicode TR9 is recommended reading: + L + (as of Unicode 5.0.0) + +-The L function can be used to get all the synonyms of +-the bidi type name. ++The L and L functions can be used as an ++alternative to this function; the first returning a simple list of the short ++bidi type names; and the second gets all the synonyms of a given bidi type ++name. + + =cut + +@@ -1864,6 +1875,79 @@ sub prop_aliases ($) { + + =pod + ++=head2 B ++ ++ use Unicode::UCD 'prop_values'; ++ ++ print "AHex values are: ", join(", ", prop_values("AHex")), ++ "\n"; ++ prints: ++ AHex values are: N, Y ++ ++Some Unicode properties have a restricted set of legal values. For example, ++all binary properties are restricted to just C or C; and there ++are only a few dozen possible General Categories. Use C ++to find out if a given property is one such, and if so, to get a list of the ++values: ++ ++ print join ", ", prop_values("NFC_Quick_Check"); ++ prints: ++ M, N, Y ++ ++If the property doesn't have such a restricted set, C is returned. ++ ++There are usually several synonyms for each possible value. Use ++L to access those. ++ ++Case, white space, hyphens, and underscores are ignored in the input property ++name (except for the trailing underscore in the old-form grandfathered-in ++general category property value C<"L_">, which is better written as C<"LC">). ++ ++If the property name is unknown, C is returned. Note that Perl typically ++recognizes property names in regular expressions with an optional C<"Is_>" ++(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. ++This function does not recognize those in the property parameter, returning ++C. ++ ++For the block property, new-style block names are returned (see ++L). ++ ++C does not know about any user-defined properties, and ++will return C if called with one of those. ++ ++=cut ++ ++# These are created by mktables for this module and stored in unicore/UCD.pl ++# where their structures are described. ++our %loose_to_standard_value; ++our %prop_value_aliases; ++ ++sub prop_values ($) { ++ my $prop = shift; ++ return undef unless defined $prop; ++ ++ require "unicore/UCD.pl"; ++ require "utf8_heavy.pl"; ++ ++ # Find the property name synonym that's used as the key in other hashes, ++ # which is element 0 in the returned list. ++ ($prop) = prop_aliases($prop); ++ return undef if ! $prop; ++ $prop = utf8::_loose_name(lc $prop); ++ ++ # Here is a legal property. ++ return undef unless exists $prop_value_aliases{$prop}; ++ my @return; ++ foreach my $value_key (sort { lc $a cmp lc $b } ++ keys %{$prop_value_aliases{$prop}}) ++ { ++ push @return, $prop_value_aliases{$prop}{$value_key}[0]; ++ } ++ return @return; ++} ++ ++=pod ++ + =head2 B + + use Unicode::UCD 'prop_value_aliases'; +@@ -1877,7 +1961,7 @@ sub prop_aliases ($) { + print "The short name is $short_name\n"; + print "The other aliases are: ", join(", ", @other_names), "\n"; + +- prints: ++ prints: + The full name is Punctuation + The short name is P + The other aliases are: Punct +@@ -1886,18 +1970,20 @@ Some Unicode properties have a restricted set of legal values. For example, + all binary properties are restricted to just C or C; and there + are only a few dozen possible General Categories. + +-For such properties, there are usually several synonyms for each possible +-value. For example, in binary properties, I can be represented by any of +-the strings "Y", "Yes", "T", or "True"; and the General Category +-"Punctuation" by that string, or "Punct", or simply "P". ++You can use L to find out if a given property is one which has ++a restricted set of values, and if so, what those values are. But usually ++each value actually has several synonyms. For example, in binary properties, ++I can be represented by any of the strings "Y", "Yes", "T", or "True"; ++and the General Category "Punctuation" by that string, or "Punct", or simply ++"P". + + Like property names, there is typically at least a short name for each such +-property-value, and a long name. If you know any name of the property-value, +-you can use C() to get the long name (when called in +-scalar context), or a list of all the names, with the short name in the 0th +-element, the long name in the next element, and any other synonyms in the +-remaining elements, in no particular order, except that any all-numeric +-synonyms will be last. ++property-value, and a long name. If you know any name of the property-value ++(which you can get by L, you can use C() ++to get the long name (when called in scalar context), or a list of all the ++names, with the short name in the 0th element, the long name in the next ++element, and any other synonyms in the remaining elements, in no particular ++order, except that any all-numeric synonyms will be last. + + The long name is returned in a form nicely capitalized, suitable for printing. + +@@ -1926,11 +2012,6 @@ will return C if called with one of those. + + =cut + +-# These are created by mktables for this routine and stored in unicore/UCD.pl +-# where their structures are described. +-our %loose_to_standard_value; +-our %prop_value_aliases; +- + sub prop_value_aliases ($$) { + my ($prop, $value) = @_; + return unless defined $prop && defined $value; +-- +2.49.1 + diff --git a/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch b/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch new file mode 100644 index 00000000..f947537e --- /dev/null +++ b/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch @@ -0,0 +1,68 @@ +SPDX-FileCopyrightText: 2013 Nicholas Clark + +SPDX-License-Identifier: Artistic-1.0 + +Again, we need this for 5.22. + +From b86e5545be9b2e891e3ee8e2fba38b799b1836fc Mon Sep 17 00:00:00 2001 +From: Nicholas Clark +Date: Sun, 7 Jul 2013 15:12:42 +0200 +Subject: [PATCH] Refactor ExtUtils::Embed to work with miniperl. + +Remove the use of FileHandle, which relies on IO, and XS module. +Only load Getopt::Std if it is needed (in xsinit()), to avoid needing to add +Getopt::Std to lib/buildcustomize.pl +Require File::Spec instead of using it, as it exports nothing, so there is no +benefit to using it (but it costs a BEGIN block). +--- + lib/ExtUtils/Embed.pm | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm +index 9710630e51..758e24139c 100644 +--- perl-5.18.4/lib/ExtUtils/Embed.pm ++++ perl-5.18.4/lib/ExtUtils/Embed.pm +@@ -2,10 +2,8 @@ require 5.002; + + package ExtUtils::Embed; + require Exporter; +-require FileHandle; + use Config; +-use Getopt::Std; +-use File::Spec; ++require File::Spec; + + #Only when we need them + #require ExtUtils::MakeMaker; +@@ -18,7 +16,7 @@ use vars qw(@ISA @EXPORT $VERSION + use strict; + + # This is not a dual-life module, so no need for development version numbers +-$VERSION = '1.30'; ++$VERSION = '1.31'; + + @ISA = qw(Exporter); + @EXPORT = qw(&xsinit &ldopts +@@ -54,7 +52,8 @@ sub xsinit { + @mods = @$mods if $mods; + } + else { +- getopts('o:s:'); ++ require Getopt::Std; ++ Getopt::Std::getopts('o:s:'); + $file = $opt_o if defined $opt_o; + $std = $opt_s if defined $opt_s; + @mods = @ARGV; +@@ -65,7 +64,8 @@ sub xsinit { + $fh = \*STDOUT; + } + else { +- $fh = new FileHandle "> $file"; ++ open $fh, '>', $file ++ or die "Can't open '$file': $!"; + } + + push(@mods, static_ext()) if defined $std; +-- +2.49.1 + diff --git a/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch b/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch new file mode 100644 index 00000000..0cf4c4ed --- /dev/null +++ b/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch @@ -0,0 +1,79 @@ +SPDX-FileCopyrightText: 2013 Nicholas Clark + +SPDX-License-Identifier: Artistic-1.0 + +Again, we need this for 5.22. + +From 87c2fdd468989b9450aa393c4c715594cbba5711 Mon Sep 17 00:00:00 2001 +From: Nicholas Clark +Date: Mon, 8 Jul 2013 11:08:12 +0200 +Subject: [PATCH] Tweak ExtUtils::Embed's generated C code to be closer to + ExtUtils::Miniperl. + +Use #include "..." instead of #include <...> in xsi_header(), and don't add +a trailing newline (and add a newline in xsinit() to compensate). +Use four spaces instead of a tab for indenting. +If there are no extensions and hence no calls to newXS() don't declare file[] +and don't add a trailing newline. +--- + lib/ExtUtils/Embed.pm | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm +index 9710630e51..52221cba53 100644 +--- perl-5.18.4/lib/ExtUtils/Embed.pm ++++ perl-5.18.4/lib/ExtUtils/Embed.pm +@@ -72,7 +72,7 @@ sub xsinit { + @mods = grep(!$seen{$_}++, @mods); + + print $fh &xsi_header(); +- print $fh "EXTERN_C void xs_init ($xsinit_proto);\n\n"; ++ print $fh "\nEXTERN_C void xs_init ($xsinit_proto);\n\n"; + print $fh &xsi_protos(@mods); + + print $fh "\nEXTERN_C void\nxs_init($xsinit_proto)\n{\n"; +@@ -83,9 +83,9 @@ sub xsinit { + + sub xsi_header { + return < +-#include +- ++#include "EXTERN.h" ++#include "perl.h" ++#include "XSUB.h" + EOF + } + +@@ -109,9 +109,12 @@ sub xsi_body { + my(@exts) = @_; + my($pname,@retval,%seen); + my($dl) = canon('/','DynaLoader'); +- push(@retval, "\tchar *file = __FILE__;\n"); +- push(@retval, "\tdXSUB_SYS;\n") if $] > 5.002; +- push(@retval, "\n"); ++ push(@retval, " static const char file[] = __FILE__;\n") ++ if @exts; ++ push(@retval, " dXSUB_SYS;\n"); ++ push(@retval, " PERL_UNUSED_CONTEXT;\n"); ++ push(@retval, "\n") ++ if @exts; + + foreach $_ (@exts){ + my($pname) = canon('/', $_); +@@ -121,10 +124,10 @@ sub xsi_body { + if ($pname eq $dl){ + # Must NOT install 'DynaLoader::boot_DynaLoader' as 'bootstrap'! + # boot_DynaLoader is called directly in DynaLoader.pm +- $ccode = "\t/* DynaLoader is a special case */\n\tnewXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n"; ++ $ccode = " /* DynaLoader is a special case */\n newXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n"; + push(@retval, $ccode) unless $seen{$ccode}++; + } else { +- $ccode = "\tnewXS(\"${mname}::bootstrap\", boot_${cname}, file);\n"; ++ $ccode = " newXS(\"${mname}::bootstrap\", boot_${cname}, file);\n"; + push(@retval, $ccode) unless $seen{$ccode}++; + } + } +-- +2.49.1 + diff --git a/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch b/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch new file mode 100644 index 00000000..0a459539 --- /dev/null +++ b/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch @@ -0,0 +1,72 @@ +SPDX-FileCopyrightText: 2015 Karl Williamson +SPDX-FileCopyrightText: 2025 fosslinux + +SPDX-License-Identifier: Artistic-1.0 + +This renaming is expected by 5.22. The resulting modifications to +charclass_invlists.h are done in the script. + +From c75f24e6fc1fa9676693e4e42f199e542b2b2549 Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Thu, 23 Jan 2014 20:34:15 -0700 +Subject: [PATCH 1/2] Move an inversion list generation to mktables + +Prior to this patch, this was in regen/mk_invlists.pl, but future +commits will want it to also be used by the header generated by +regen/regcharclass.pl, so use a common source so the logic doesn't have +to be duplicated. +--- + lib/unicore/mktables | 22 ++++++++++++++++++++++ + regcomp.c | 3 ++- + 2 files changed, 24 insertions(+), 1 deletion(-) + +diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables +index 808760d002..a5c0d8930e 100644 +--- perl-5.18.4/lib/unicore/mktables ++++ perl-5.18.4/lib/unicore/mktables +@@ -13321,6 +13321,28 @@ sub compile_perl() { + } + $PosixXDigit->add_description('[0-9A-Fa-f]'); + ++ my $folds_to_multi_char = $perl->add_match_table( ++ "_Perl_Folds_To_Multi_Char", ++ Description => ++ "Code points whose fold is a string of more than one character", ++ ); ++ ++ foreach my $range (property_ref('Case_Folding')->ranges) { ++ my $start = $range->start; ++ my $end = $range->end; ++ $any_folds->add_range($start, $end); ++ ++ my @hex_code_points = split " ", $range->value; ++ if (@hex_code_points > 1) { ++ $folds_to_multi_char->add_range($start, $end); ++ } ++ ++ foreach my $i (0 .. @hex_code_points - 1) { ++ my $code_point = hex $hex_code_points[$i]; ++ $any_folds->add_range($code_point, $code_point); ++ } ++ } ++ + my $dt = property_ref('Decomposition_Type'); + $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', + Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), +diff --git perl-5.18.4/regcomp.c perl-5.18.4/regcomp.c +index 0841f172e5..e839e3e765 100644 +--- perl-5.18.4/regcomp.c ++++ perl-5.18.4/regcomp.c +@@ -5566,7 +5566,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); + +- PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); ++ PL_HasMultiCharFold = ++ _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + } + #endif + +-- +2.49.1 + diff --git a/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch b/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch new file mode 100644 index 00000000..6fbf6aaa --- /dev/null +++ b/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch @@ -0,0 +1,199 @@ +SPDX-FileCopyrightText: 2014 Karl Williamson +SPDX-FileCopyrightText: 2025 fosslinux + +SPDX-License-Identifier: Artistic-1.0 + +The _Perl_Any_Folds and _Perl_Problematic_Locale_Folds tables are expected +by 5.22. + +Their addition is part of a giga commit (~1500 lines changed), we take +just that small part of the commit. + +From 10d28d6a24e2a3c41270c43203be5502a586b2f9 Mon Sep 17 00:00:00 2001 +From: Samuel Tyler +Date: Thu, 28 Aug 2025 21:54:12 +1000 +Subject: [PATCH 2/2] Partial backport: Work properly under UTF-8 LC_CTYPE + locales + +[... original commit message follows, mostly irrelevant] + +This large (sorry, I couldn't figure out how to meaningfully split it +up) commit causes Perl to fully support LC_CTYPE operations (case +changing, character classification) in UTF-8 locales. + +As a side effect it resolves [perl #56820]. + +The basics are easy, but there were a lot of details, and one +troublesome edge case discussed below. + +What essentially happens is that when the locale is changed to a UTF-8 +one, a global variable is set TRUE (FALSE when changed to a non-UTF-8 +locale). Within the scope of 'use locale', this variable is checked, +and if TRUE, the code that Perl uses for non-locale behavior is used +instead of the code for locale behavior. Since Perl's internal +representation is UTF-8, we get UTF-8 behavior for a UTF-8 locale. + +More work had to be done for regular expressions. There are three +cases. + +1) The character classes \w, [[:punct:]] needed no extra work, as +the changes fall out from the base work. + +2) Strings that are to be matched case-insensitively. These form +EXACTFL regops (nodes). Notice that if such a string contains only +characters above-Latin1 that match only themselves, that the node can be +downgraded to an EXACT-only node, which presents better optimization +possibilities, as we now have a fixed string known at compile time to be +required to be in the target string to match. Similarly if all +characters in the string match only other above-Latin1 characters +case-insensitively, the node can be downgraded to a regular EXACTFU node +(match, folding, using Unicode, not locale, rules). The code changes +for this could be done without accepting UTF-8 locales fully, but there +were edge cases which needed to be handled differently if I stopped +there, so I continued on. + +In an EXACTFL node, all such characters are now folded at compile time +(just as before this commit), while the other characters whose folds are +locale-dependent are left unfolded. This means that they have to be +folded at execution time based on the locale in effect at the moment. +Again, this isn't a change from before. The difference is that now some +of the folds that need to be done at execution time (in regexec) are +potentially multi-char. Some of the code in regexec was trivial to +extend to account for this because of existing infrastructure, but the +part dealing with regex quantifiers, had to have more work. + +Also the code that joins EXACTish nodes together had to be expanded to +account for the possibility of multi-character folds within locale +handling. This was fairly easy, because it already has infrastructure +to handle these under somewhat different circumstances. + +3) In bracketed character classes, represented by ANYOF nodes, a new +inversion list was created giving the characters that should be matched +by this node when the runtime locale is UTF-8. The list is ignored +except under that circumstance. To do this, I created a new ANYOF type +which has an extra SV for the inversion list. + +The edge case that caused the most difficulty is folding involving the +MICRO SIGN, U+00B5. It folds to the GREEK SMALL LETTER MU, as does the +GREEK CAPITAL LETTER MU. The MICRO SIGN is the only 0-255 range +character that folds to outside that range. The issue is that it +doesn't naturally fall out that it will match the CAP MU. If we let the +CAP MU fold to the samll mu at compile time (which it can because both +are above-Latin1 and so the fold is the same no matter what locale is in +effect), it could appear that the regnode can be downgraded away from +EXACTFL to EXACTFU, but doing so would cause the MICRO SIGN to not case +insensitvely match the CAP MU. This could be special cased in regcomp +and regexec, but I wanted to avoid that. Instead the mktables tables +are set up to include the CAP MU as a character whose presence forbids +the downgrading, so the special casing is in mktables, and not in the C +code. +--- + lib/unicore/mktables | 79 ++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 73 insertions(+), 6 deletions(-) + +diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables +index a5c0d8930e..4b34b3c338 100644 +--- perl-5.18.4/lib/unicore/mktables ++++ perl-5.18.4/lib/unicore/mktables +@@ -13321,25 +13321,92 @@ sub compile_perl() { + } + $PosixXDigit->add_description('[0-9A-Fa-f]'); + ++ my $any_folds = $perl->add_match_table("_Perl_Any_Folds", ++ Description => "Code points that particpate in some fold", ++ ); ++ my $loc_problem_folds = $perl->add_match_table( ++ "_Perl_Problematic_Locale_Folds", ++ Description => ++ "Code points that are in some way problematic under locale", ++ ); ++ ++ # This allows regexec.c to skip some work when appropriate. Some of the ++ # entries in _Perl_Problematic_Locale_Folds are multi-character folds, ++ my $loc_problem_folds_start = $perl->add_match_table( ++ "_Perl_Problematic_Locale_Foldeds_Start", ++ Description => ++ "The first character of every sequence in _Perl_Problematic_Locale_Folds", ++ ); ++ ++ my $cf = property_ref('Case_Folding'); ++ ++ # Every character 0-255 is problematic because what each folds to depends ++ # on the current locale ++ $loc_problem_folds->add_range(0, 255); ++ $loc_problem_folds_start += $loc_problem_folds; ++ ++ # Also problematic are anything these fold to outside the range. Likely ++ # forever the only thing folded to by these outside the 0-255 range is the ++ # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code ++ # completely general, which should catch any unexpected changes or errors. ++ # We look at each code point 0-255, and add its fold (including each part ++ # of a multi-char fold) to the list. See the commit message for these ++ # changes for a more complete description of the MU issue. ++ foreach my $range ($loc_problem_folds->ranges) { ++ foreach my $code_point($range->start .. $range->end) { ++ my $fold_range = $cf->containing_range($code_point); ++ next unless defined $fold_range; ++ ++ my @hex_folds = split " ", $fold_range->value; ++ my $start_cp = hex $hex_folds[0]; ++ foreach my $i (0 .. @hex_folds - 1) { ++ my $cp = hex $hex_folds[$i]; ++ next unless $cp > 255; # Already have the < 256 ones ++ ++ $loc_problem_folds->add_range($cp, $cp); ++ $loc_problem_folds_start->add_range($start_cp, $start_cp); ++ } ++ } ++ } ++ + my $folds_to_multi_char = $perl->add_match_table( + "_Perl_Folds_To_Multi_Char", + Description => + "Code points whose fold is a string of more than one character", + ); + +- foreach my $range (property_ref('Case_Folding')->ranges) { ++ # Look through all the known folds to populate these tables. ++ foreach my $range ($cf->ranges) { + my $start = $range->start; + my $end = $range->end; + $any_folds->add_range($start, $end); + +- my @hex_code_points = split " ", $range->value; +- if (@hex_code_points > 1) { ++ my @hex_folds = split " ", $range->value; ++ if (@hex_folds > 1) { # Is multi-char fold + $folds_to_multi_char->add_range($start, $end); + } + +- foreach my $i (0 .. @hex_code_points - 1) { +- my $code_point = hex $hex_code_points[$i]; +- $any_folds->add_range($code_point, $code_point); ++ my $found_locale_problematic = 0; ++ ++ # Look at each of the folded-to characters... ++ foreach my $i (0 .. @hex_folds - 1) { ++ my $cp = hex $hex_folds[$i]; ++ $any_folds->add_range($cp, $cp); ++ ++ # The fold is problematic if any of the folded-to characters is ++ # already considered problematic. ++ if ($loc_problem_folds->contains($cp)) { ++ $loc_problem_folds->add_range($start, $end); ++ $found_locale_problematic = 1; ++ } ++ } ++ ++ # If this is a problematic fold, add to the start chars the ++ # folding-from characters and first folded-to character. ++ if ($found_locale_problematic) { ++ $loc_problem_folds_start->add_range($start, $end); ++ my $cp = hex $hex_folds[0]; ++ $loc_problem_folds_start->add_range($cp, $cp); + } + } + +-- +2.49.1 + diff --git a/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch b/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch new file mode 100644 index 00000000..8d69ace8 --- /dev/null +++ b/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch @@ -0,0 +1,39 @@ +SPDX-FileCopyrightText: 2013 Karl Williamson +SPDX-FileCopyrightText: 2025 fosslinux + +SPDX-License-Identifier: Artistic-1.0 + +This renaming is required for 5.22. + +It is a manual port of 1fdd5e539a9. + +diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm +index 9c3dd7c710..2349300626 100644 +--- perl-5.18.4/lib/Unicode/UCD.pm ++++ perl-5.18.4/lib/Unicode/UCD.pm +@@ -25,6 +25,7 @@ our @EXPORT_OK = qw(charinfo + prop_invlist + prop_invmap + MAX_CP ++ search_invlist + ); + + use Carp; +@@ -2261,7 +2261,7 @@ sub prop_invlist ($;$) { + return @invlist; + } + +-sub _search_invlist { ++sub search_invlist { + # Find the range in the inversion list which contains a code point; that + # is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no + # such i. +@@ -3411,7 +3411,7 @@ RETRY: + } + + # Find the range that the override applies to. +- my $i = _search_invlist(\@invlist, $cp); ++ my $i = search_invlist(\@invlist, $cp); + if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) { + croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]" + }