From dcce75dd7996d46cedca57be75cbd5394945d83a Mon Sep 17 00:00:00 2001
From: Samuel Tyler <fosslinux@aussies.space>
Date: Sun, 31 Aug 2025 11:52:18 +1000
Subject: [PATCH] Prepare perl 5.18.4 for 5.22

---
 steps/perl-5.18.4/pass1.sh                    |   4 +
 ...-Allow-internal-properties-in-invmap.patch |   2 +-
 ...Unicode-UCD-Add-prop_values-function.patch | 219 ++++++++++++++++++
 ...ExtUtils-Embed-to-work-with-miniperl.patch |  68 ++++++
 ...mbed-s-generated-C-code-to-be-closer.patch |  79 +++++++
 ...nversion-list-generation-to-mktables.patch |  72 ++++++
 ...-Work-properly-under-UTF-8-LC_CTYPE-.patch | 199 ++++++++++++++++
 .../patches/Unicode-UCD-search_invlist.patch  |  39 ++++
 8 files changed, 681 insertions(+), 1 deletion(-)
 create mode 100644 steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch
 create mode 100644 steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch
 create mode 100644 steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch
 create mode 100644 steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch
 create mode 100644 steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch
 create mode 100644 steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch

diff --git a/steps/perl-5.18.4/pass1.sh b/steps/perl-5.18.4/pass1.sh
index d4c4d7af..dae2a5c0 100755
--- a/steps/perl-5.18.4/pass1.sh
+++ b/steps/perl-5.18.4/pass1.sh
@@ -34,6 +34,10 @@ src_prepare() {
     perl regen/unicode_constants.pl
     perl regen/regcharclass.pl
 
+    # Rename the invlist in the generated header (patching the generator is not
+    # easy), as needed by 0005-Move-an-inversion-list-generation-to-mktables.patch
+    sed -i "s/_Perl_Multi_Char_Folds_invlist/_Perl_Folds_To_Multi_Char_invlist/" charclass_invlists.h
+
     # regenerate configure
     ln -s ../metaconfig*/.package .
     ln -s ../metaconfig*/U .
diff --git a/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch b/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch
index 61f0b0b9..db46658f 100644
--- a/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch
+++ b/steps/perl-5.18.4/patches/0001-Unicode-UCD-Allow-internal-properties-in-invmap.patch
@@ -1,4 +1,4 @@
-SPDX-FileCopyrightText: 2012 Karl Williamson <public@khwilliamson.com>
+SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
 SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
 
 SPDX-License-Identifier: Artistic-1.0
diff --git a/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch b/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch
new file mode 100644
index 00000000..bc144b3a
--- /dev/null
+++ b/steps/perl-5.18.4/patches/0002-Unicode-UCD-Add-prop_values-function.patch
@@ -0,0 +1,219 @@
+SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
+SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
+
+SPDX-License-Identifier: Artistic-1.0
+
+This function is required for 5.22.
+
+From 6bf3612f9c9b0788de8adf06539b41c64695c014 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Tue, 27 Jan 2015 15:08:08 -0700
+Subject: [PATCH] Unicode::UCD: Add prop_values() function
+
+This new function returns the input property's possible values.
+---
+ lib/Unicode/UCD.pm | 125 +++++++++++++++++++++++++++++++++++++--------
+ 2 files changed, 132 insertions(+), 23 deletions(-)
+
+diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
+index 9c3dd7c710..7033128ae5 100644
+--- perl-5.18.4/lib/Unicode/UCD.pm
++++ perl-5.18.4/lib/Unicode/UCD.pm
+@@ -22,6 +20,7 @@ our @EXPORT_OK = qw(charinfo
+                     num
+                     prop_aliases
+                     prop_value_aliases
++                    prop_values
+                     prop_invlist
+                     prop_invmap
+                     MAX_CP
+@@ -73,6 +72,9 @@ Unicode::UCD - Unicode character database
+     use Unicode::UCD 'prop_value_aliases';
+     my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+ 
++    use Unicode::UCD 'prop_values';
++    my @all_EA_short_names = prop_values("East_Asian_Width");
++
+     use Unicode::UCD 'prop_invlist';
+     my @puncts = prop_invlist("gc=punctuation");
+ 
+@@ -730,6 +732,9 @@ names>).
+ L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
+ different type of data structure.
+ 
++L<prop_values("Block")|/prop_values()> can be used to get all
++the known new-style block names as a list, without the code point ranges.
++
+ See also L</Blocks versus Scripts>.
+ 
+ =cut
+@@ -752,6 +757,9 @@ the values.
+ L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
+ different type of data structure.
+ 
++L<C<prop_values("Script")>|/prop_values()> can be used to get all
++the known script names as a list, without the code point ranges.
++
+ See also L</Blocks versus Scripts>.
+ 
+ =cut
+@@ -835,8 +843,9 @@ from the long names to the short names.  The general category is the
+ one returned from
+ L</charinfo()> under the C<category> key.
+ 
+-The L</prop_value_aliases()> function can be used to get all the synonyms of
+-the category name.
++The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
++alternative to this function; the first returning a simple list of the short
++category names; and the second gets all the synonyms of a given category name.
+ 
+ =cut
+ 
+@@ -880,8 +889,10 @@ the Unicode TR9 is recommended reading:
+ L<http://www.unicode.org/reports/tr9/>
+ (as of Unicode 5.0.0)
+ 
+-The L</prop_value_aliases()> function can be used to get all the synonyms of
+-the bidi type name.
++The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
++alternative to this function; the first returning a simple list of the short
++bidi type names; and the second gets all the synonyms of a given bidi type
++name.
+ 
+ =cut
+ 
+@@ -1864,6 +1875,79 @@ sub prop_aliases ($) {
+ 
+ =pod
+ 
++=head2 B<prop_values()>
++
++    use Unicode::UCD 'prop_values';
++
++    print "AHex values are: ", join(", ", prop_values("AHex")),
++                               "\n";
++  prints:
++    AHex values are: N, Y
++
++Some Unicode properties have a restricted set of legal values.  For example,
++all binary properties are restricted to just C<true> or C<false>; and there
++are only a few dozen possible General Categories.  Use C<prop_values>
++to find out if a given property is one such, and if so, to get a list of the
++values:
++
++    print join ", ", prop_values("NFC_Quick_Check");
++  prints:
++    M, N, Y
++
++If the property doesn't have such a restricted set, C<undef> is returned.
++
++There are usually several synonyms for each possible value.  Use
++L</prop_value_aliases()> to access those.
++
++Case, white space, hyphens, and underscores are ignored in the input property
++name (except for the trailing underscore in the old-form grandfathered-in
++general category property value C<"L_">, which is better written as C<"LC">).
++
++If the property name is unknown, C<undef> is returned.  Note that Perl typically
++recognizes property names in regular expressions with an optional C<"Is_">
++(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
++This function does not recognize those in the property parameter, returning
++C<undef>.
++
++For the block property, new-style block names are returned (see
++L</Old-style versus new-style block names>).
++
++C<prop_values> does not know about any user-defined properties, and
++will return C<undef> if called with one of those.
++
++=cut
++
++# These are created by mktables for this module and stored in unicore/UCD.pl
++# where their structures are described.
++our %loose_to_standard_value;
++our %prop_value_aliases;
++
++sub prop_values ($) {
++    my $prop = shift;
++    return undef unless defined $prop;
++
++    require "unicore/UCD.pl";
++    require "utf8_heavy.pl";
++
++    # Find the property name synonym that's used as the key in other hashes,
++    # which is element 0 in the returned list.
++    ($prop) = prop_aliases($prop);
++    return undef if ! $prop;
++    $prop = utf8::_loose_name(lc $prop);
++
++    # Here is a legal property.
++    return undef unless exists $prop_value_aliases{$prop};
++    my @return;
++    foreach my $value_key (sort { lc $a cmp lc $b }
++                            keys %{$prop_value_aliases{$prop}})
++    {
++        push @return, $prop_value_aliases{$prop}{$value_key}[0];
++    }
++    return @return;
++}
++
++=pod
++
+ =head2 B<prop_value_aliases()>
+ 
+     use Unicode::UCD 'prop_value_aliases';
+@@ -1877,7 +1961,7 @@ sub prop_aliases ($) {
+     print "The short name is $short_name\n";
+     print "The other aliases are: ", join(", ", @other_names), "\n";
+ 
+-    prints:
++  prints:
+     The full name is Punctuation
+     The short name is P
+     The other aliases are: Punct
+@@ -1886,18 +1970,20 @@ Some Unicode properties have a restricted set of legal values.  For example,
+ all binary properties are restricted to just C<true> or C<false>; and there
+ are only a few dozen possible General Categories.
+ 
+-For such properties, there are usually several synonyms for each possible
+-value.  For example, in binary properties, I<truth> can be represented by any of
+-the strings "Y", "Yes", "T", or "True"; and the General Category
+-"Punctuation" by that string, or "Punct", or simply "P".
++You can use L</prop_values()> to find out if a given property is one which has
++a restricted set of values, and if so, what those values are.  But usually
++each value actually has several synonyms.  For example, in binary properties,
++I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True";
++and the General Category "Punctuation" by that string, or "Punct", or simply
++"P".
+ 
+ Like property names, there is typically at least a short name for each such
+-property-value, and a long name.  If you know any name of the property-value,
+-you can use C<prop_value_aliases>() to get the long name (when called in
+-scalar context), or a list of all the names, with the short name in the 0th
+-element, the long name in the next element, and any other synonyms in the
+-remaining elements, in no particular order, except that any all-numeric
+-synonyms will be last.
++property-value, and a long name.  If you know any name of the property-value
++(which you can get by L</prop_values()>), you can use C<prop_value_aliases>()
++to get the long name (when called in scalar context), or a list of all the
++names, with the short name in the 0th element, the long name in the next
++element, and any other synonyms in the remaining elements, in no particular
++order, except that any all-numeric synonyms will be last.
+ 
+ The long name is returned in a form nicely capitalized, suitable for printing.
+ 
+@@ -1926,11 +2012,6 @@ will return C<undef> if called with one of those.
+ 
+ =cut
+ 
+-# These are created by mktables for this routine and stored in unicore/UCD.pl
+-# where their structures are described.
+-our %loose_to_standard_value;
+-our %prop_value_aliases;
+-
+ sub prop_value_aliases ($$) {
+     my ($prop, $value) = @_;
+     return unless defined $prop && defined $value;
+-- 
+2.49.1
+
diff --git a/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch b/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch
new file mode 100644
index 00000000..f947537e
--- /dev/null
+++ b/steps/perl-5.18.4/patches/0003-Refactor-ExtUtils-Embed-to-work-with-miniperl.patch
@@ -0,0 +1,68 @@
+SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
+
+SPDX-License-Identifier: Artistic-1.0
+
+Again, we need this for 5.22.
+
+From b86e5545be9b2e891e3ee8e2fba38b799b1836fc Mon Sep 17 00:00:00 2001
+From: Nicholas Clark <nick@ccl4.org>
+Date: Sun, 7 Jul 2013 15:12:42 +0200
+Subject: [PATCH] Refactor ExtUtils::Embed to work with miniperl.
+
+Remove the use of FileHandle, which relies on IO, an XS module.
+Only load Getopt::Std if it is needed (in xsinit()), to avoid needing to add
+Getopt::Std to lib/buildcustomize.pl
+Require File::Spec instead of using it, as it exports nothing, so there is no
+benefit to using it (but it costs a BEGIN block).
+---
+ lib/ExtUtils/Embed.pm | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
+index 9710630e51..758e24139c 100644
+--- perl-5.18.4/lib/ExtUtils/Embed.pm
++++ perl-5.18.4/lib/ExtUtils/Embed.pm
+@@ -2,10 +2,8 @@ require 5.002;
+ 
+ package ExtUtils::Embed;
+ require Exporter;
+-require FileHandle;
+ use Config;
+-use Getopt::Std;
+-use File::Spec;
++require File::Spec;
+ 
+ #Only when we need them
+ #require ExtUtils::MakeMaker;
+@@ -18,7 +16,7 @@ use vars qw(@ISA @EXPORT $VERSION
+ use strict;
+ 
+ # This is not a dual-life module, so no need for development version numbers
+-$VERSION = '1.30';
++$VERSION = '1.31';
+ 
+ @ISA = qw(Exporter);
+ @EXPORT = qw(&xsinit &ldopts 
+@@ -54,7 +52,8 @@ sub xsinit {
+        @mods = @$mods if $mods;
+     }
+     else {
+-       getopts('o:s:');
++       require Getopt::Std;
++       Getopt::Std::getopts('o:s:');
+        $file = $opt_o if defined $opt_o;
+        $std  = $opt_s  if defined $opt_s;
+        @mods = @ARGV;
+@@ -65,7 +64,8 @@ sub xsinit {
+ 	$fh = \*STDOUT;
+     }
+     else {
+-	$fh = new FileHandle "> $file";
++        open $fh, '>', $file
++            or die "Can't open '$file': $!";
+     }
+ 
+     push(@mods, static_ext()) if defined $std;
+-- 
+2.49.1
+
diff --git a/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch b/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch
new file mode 100644
index 00000000..0cf4c4ed
--- /dev/null
+++ b/steps/perl-5.18.4/patches/0004-Tweak-ExtUtils-Embed-s-generated-C-code-to-be-closer.patch
@@ -0,0 +1,79 @@
+SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
+
+SPDX-License-Identifier: Artistic-1.0
+
+Again, we need this for 5.22.
+
+From 87c2fdd468989b9450aa393c4c715594cbba5711 Mon Sep 17 00:00:00 2001
+From: Nicholas Clark <nick@ccl4.org>
+Date: Mon, 8 Jul 2013 11:08:12 +0200
+Subject: [PATCH] Tweak ExtUtils::Embed's generated C code to be closer to
+ ExtUtils::Miniperl.
+
+Use #include "..." instead of #include <...> in xsi_header(), and don't add
+a trailing newline (and add a newline in xsinit() to compensate).
+Use four spaces instead of a tab for indenting.
+If there are no extensions and hence no calls to newXS() don't declare file[]
+and don't add a trailing newline.
+---
+ lib/ExtUtils/Embed.pm | 21 ++++++++++++---------
+ 1 file changed, 12 insertions(+), 9 deletions(-)
+
+diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
+index 9710630e51..52221cba53 100644
+--- perl-5.18.4/lib/ExtUtils/Embed.pm
++++ perl-5.18.4/lib/ExtUtils/Embed.pm
+@@ -72,7 +72,7 @@ sub xsinit {
+     @mods = grep(!$seen{$_}++, @mods);
+ 
+     print $fh &xsi_header();
+-    print $fh "EXTERN_C void xs_init ($xsinit_proto);\n\n";     
++    print $fh "\nEXTERN_C void xs_init ($xsinit_proto);\n\n";
+     print $fh &xsi_protos(@mods);
+ 
+     print $fh "\nEXTERN_C void\nxs_init($xsinit_proto)\n{\n";
+@@ -83,9 +83,9 @@ sub xsinit {
+ 
+ sub xsi_header {
+     return <<EOF;
+-#include <EXTERN.h>
+-#include <perl.h>
+-
++#include "EXTERN.h"
++#include "perl.h"
++#include "XSUB.h"
+ EOF
+ }    
+ 
+@@ -109,9 +109,12 @@ sub xsi_body {
+     my(@exts) = @_;
+     my($pname,@retval,%seen);
+     my($dl) = canon('/','DynaLoader');
+-    push(@retval, "\tchar *file = __FILE__;\n");
+-    push(@retval, "\tdXSUB_SYS;\n") if $] > 5.002;
+-    push(@retval, "\n");
++    push(@retval, "    static const char file[] = __FILE__;\n")
++        if @exts;
++    push(@retval, "    dXSUB_SYS;\n");
++    push(@retval, "    PERL_UNUSED_CONTEXT;\n");
++    push(@retval, "\n")
++        if @exts;
+ 
+     foreach $_ (@exts){
+         my($pname) = canon('/', $_);
+@@ -121,10 +124,10 @@ sub xsi_body {
+         if ($pname eq $dl){
+             # Must NOT install 'DynaLoader::boot_DynaLoader' as 'bootstrap'!
+             # boot_DynaLoader is called directly in DynaLoader.pm
+-            $ccode = "\t/* DynaLoader is a special case */\n\tnewXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
++            $ccode = "    /* DynaLoader is a special case */\n    newXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
+             push(@retval, $ccode) unless $seen{$ccode}++;
+         } else {
+-            $ccode = "\tnewXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
++            $ccode = "    newXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
+             push(@retval, $ccode) unless $seen{$ccode}++;
+         }
+     }
+-- 
+2.49.1
+
diff --git a/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch b/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch
new file mode 100644
index 00000000..0a459539
--- /dev/null
+++ b/steps/perl-5.18.4/patches/0005-Move-an-inversion-list-generation-to-mktables.patch
@@ -0,0 +1,72 @@
+SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
+SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
+
+SPDX-License-Identifier: Artistic-1.0
+
+This renaming is expected by 5.22. The resulting modifications to
+charclass_invlists.h are done in the script.
+
+From c75f24e6fc1fa9676693e4e42f199e542b2b2549 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <public@khwilliamson.com>
+Date: Thu, 23 Jan 2014 20:34:15 -0700
+Subject: [PATCH 1/2] Move an inversion list generation to mktables
+
+Prior to this patch, this was in regen/mk_invlists.pl, but future
+commits will want it to also be used by the header generated by
+regen/regcharclass.pl, so use a common source so the logic doesn't have
+to be duplicated.
+---
+ lib/unicore/mktables | 22 ++++++++++++++++++++++
+ regcomp.c            |  3 ++-
+ 2 files changed, 24 insertions(+), 1 deletion(-)
+
+diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
+index 808760d002..a5c0d8930e 100644
+--- perl-5.18.4/lib/unicore/mktables
++++ perl-5.18.4/lib/unicore/mktables
+@@ -13321,6 +13321,28 @@ sub compile_perl() {
+     }
+     $PosixXDigit->add_description('[0-9A-Fa-f]');
+ 
++    my $folds_to_multi_char = $perl->add_match_table(
++         "_Perl_Folds_To_Multi_Char",
++         Description =>
++              "Code points whose fold is a string of more than one character",
++    );
++
++    foreach my $range (property_ref('Case_Folding')->ranges) {
++        my $start = $range->start;
++        my $end = $range->end;
++        $any_folds->add_range($start, $end);
++
++        my @hex_code_points = split " ", $range->value;
++        if (@hex_code_points > 1) {
++            $folds_to_multi_char->add_range($start, $end);
++        }
++
++        foreach my $i (0 .. @hex_code_points - 1) {
++            my $code_point = hex $hex_code_points[$i];
++            $any_folds->add_range($code_point, $code_point);
++        }
++    }
++
+     my $dt = property_ref('Decomposition_Type');
+     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
+         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
+diff --git perl-5.18.4/regcomp.c perl-5.18.4/regcomp.c
+index 0841f172e5..e839e3e765 100644
+--- perl-5.18.4/regcomp.c
++++ perl-5.18.4/regcomp.c
+@@ -5566,7 +5566,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
+ 	PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
+ 	PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
+ 
+-        PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
++        PL_HasMultiCharFold =
++                       _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
+     }
+ #endif
+ 
+-- 
+2.49.1
+
diff --git a/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch b/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch
new file mode 100644
index 00000000..6fbf6aaa
--- /dev/null
+++ b/steps/perl-5.18.4/patches/0006-Partial-backport-Work-properly-under-UTF-8-LC_CTYPE-.patch
@@ -0,0 +1,199 @@
+SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
+SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
+
+SPDX-License-Identifier: Artistic-1.0
+
+The _Perl_Any_Folds and _Perl_Problematic_Locale_Folds tables are expected
+by 5.22.
+
+Their addition is part of a giga commit (~1500 lines changed); we take
+just that small part of the commit.
+
+From 10d28d6a24e2a3c41270c43203be5502a586b2f9 Mon Sep 17 00:00:00 2001
+From: Samuel Tyler <fosslinux@aussies.space>
+Date: Thu, 28 Aug 2025 21:54:12 +1000
+Subject: [PATCH 2/2] Partial backport: Work properly under UTF-8 LC_CTYPE
+ locales
+
+[... original commit message follows, mostly irrelevant]
+
+This large (sorry, I couldn't figure out how to meaningfully split it
+up) commit causes Perl to fully support LC_CTYPE operations (case
+changing, character classification) in UTF-8 locales.
+
+As a side effect it resolves [perl #56820].
+
+The basics are easy, but there were a lot of details, and one
+troublesome edge case discussed below.
+
+What essentially happens is that when the locale is changed to a UTF-8
+one, a global variable is set TRUE (FALSE when changed to a non-UTF-8
+locale).  Within the scope of 'use locale', this variable is checked,
+and if TRUE, the code that Perl uses for non-locale behavior is used
+instead of the code for locale behavior.  Since Perl's internal
+representation is UTF-8, we get UTF-8 behavior for a UTF-8 locale.
+
+More work had to be done for regular expressions.  There are three
+cases.
+
+1) The character classes \w, [[:punct:]] needed no extra work, as
+the changes fall out from the base work.
+
+2) Strings that are to be matched case-insensitively.  These form
+EXACTFL regops (nodes).  Notice that if such a string contains only
+characters above-Latin1 that match only themselves, that the node can be
+downgraded to an EXACT-only node, which presents better optimization
+possibilities, as we now have a fixed string known at compile time to be
+required to be in the target string to match.  Similarly if all
+characters in the string match only other above-Latin1 characters
+case-insensitively, the node can be downgraded to a regular EXACTFU node
+(match, folding, using Unicode, not locale, rules).  The code changes
+for this could be done without accepting UTF-8 locales fully, but there
+were edge cases which needed to be handled differently if I stopped
+there, so I continued on.
+
+In an EXACTFL node, all such characters are now folded at compile time
+(just as before this commit), while the other characters whose folds are
+locale-dependent are left unfolded.  This means that they have to be
+folded at execution time based on the locale in effect at the moment.
+Again, this isn't a change from before.  The difference is that now some
+of the folds that need to be done at execution time (in regexec) are
+potentially multi-char.  Some of the code in regexec was trivial to
+extend to account for this because of existing infrastructure, but the
+part dealing with regex quantifiers, had to have more work.
+
+Also the code that joins EXACTish nodes together had to be expanded to
+account for the possibility of multi-character folds within locale
+handling.  This was fairly easy, because it already has infrastructure
+to handle these under somewhat different circumstances.
+
+3) In bracketed character classes, represented by ANYOF nodes, a new
+inversion list was created giving the characters that should be matched
+by this node when the runtime locale is UTF-8.  The list is ignored
+except under that circumstance.  To do this, I created a new ANYOF type
+which has an extra SV for the inversion list.
+
+The edge case that caused the most difficulty is folding involving the
+MICRO SIGN, U+00B5.  It folds to the GREEK SMALL LETTER MU, as does the
+GREEK CAPITAL LETTER MU.  The MICRO SIGN is the only 0-255 range
+character that folds to outside that range.  The issue is that it
+doesn't naturally fall out that it will match the CAP MU.  If we let the
+CAP MU fold to the small mu at compile time (which it can because both
+are above-Latin1 and so the fold is the same no matter what locale is in
+effect), it could appear that the regnode can be downgraded away from
+EXACTFL to EXACTFU, but doing so would cause the MICRO SIGN to not case
+insensitively match the CAP MU.  This could be special cased in regcomp
+and regexec, but I wanted to avoid that.  Instead the mktables tables
+are set up to include the CAP MU as a character whose presence forbids
+the downgrading, so the special casing is in mktables, and not in the C
+code.
+---
+ lib/unicore/mktables | 79 ++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 73 insertions(+), 6 deletions(-)
+
+diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
+index a5c0d8930e..4b34b3c338 100644
+--- perl-5.18.4/lib/unicore/mktables
++++ perl-5.18.4/lib/unicore/mktables
+@@ -13321,25 +13321,92 @@ sub compile_perl() {
+     }
+     $PosixXDigit->add_description('[0-9A-Fa-f]');
+ 
++    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
++                    Description => "Code points that particpate in some fold",
++                    );
++    my $loc_problem_folds = $perl->add_match_table(
++               "_Perl_Problematic_Locale_Folds",
++               Description =>
++                   "Code points that are in some way problematic under locale",
++    );
++
++    # This allows regexec.c to skip some work when appropriate.  Some of the
++    # entries in _Perl_Problematic_Locale_Folds are multi-character folds.
++    my $loc_problem_folds_start = $perl->add_match_table(
++               "_Perl_Problematic_Locale_Foldeds_Start",
++               Description =>
++                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
++    );
++
++    my $cf = property_ref('Case_Folding');
++
++    # Every character 0-255 is problematic because what each folds to depends
++    # on the current locale
++    $loc_problem_folds->add_range(0, 255);
++    $loc_problem_folds_start += $loc_problem_folds;
++
++    # Also problematic are anything these fold to outside the range.  Likely
++    # forever the only thing folded to by these outside the 0-255 range is the
++    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
++    # completely general, which should catch any unexpected changes or errors.
++    # We look at each code point 0-255, and add its fold (including each part
++    # of a multi-char fold) to the list.  See the commit message for these
++    # changes for a more complete description of the MU issue.
++    foreach my $range ($loc_problem_folds->ranges) {
++        foreach my $code_point($range->start .. $range->end) {
++            my $fold_range = $cf->containing_range($code_point);
++            next unless defined $fold_range;
++
++            my @hex_folds = split " ", $fold_range->value;
++            my $start_cp = hex $hex_folds[0];
++            foreach my $i (0 .. @hex_folds - 1) {
++                my $cp = hex $hex_folds[$i];
++                next unless $cp > 255;    # Already have the < 256 ones
++
++                $loc_problem_folds->add_range($cp, $cp);
++                $loc_problem_folds_start->add_range($start_cp, $start_cp);
++            }
++        }
++    }
++
+     my $folds_to_multi_char = $perl->add_match_table(
+          "_Perl_Folds_To_Multi_Char",
+          Description =>
+               "Code points whose fold is a string of more than one character",
+     );
+ 
+-    foreach my $range (property_ref('Case_Folding')->ranges) {
++    # Look through all the known folds to populate these tables.
++    foreach my $range ($cf->ranges) {
+         my $start = $range->start;
+         my $end = $range->end;
+         $any_folds->add_range($start, $end);
+ 
+-        my @hex_code_points = split " ", $range->value;
+-        if (@hex_code_points > 1) {
++        my @hex_folds = split " ", $range->value;
++        if (@hex_folds > 1) {   # Is multi-char fold
+             $folds_to_multi_char->add_range($start, $end);
+         }
+ 
+-        foreach my $i (0 .. @hex_code_points - 1) {
+-            my $code_point = hex $hex_code_points[$i];
+-            $any_folds->add_range($code_point, $code_point);
++        my $found_locale_problematic = 0;
++
++        # Look at each of the folded-to characters...
++        foreach my $i (0 .. @hex_folds - 1) {
++            my $cp = hex $hex_folds[$i];
++            $any_folds->add_range($cp, $cp);
++
++            # The fold is problematic if any of the folded-to characters is
++            # already considered problematic.
++            if ($loc_problem_folds->contains($cp)) {
++                $loc_problem_folds->add_range($start, $end);
++                $found_locale_problematic = 1;
++            }
++        }
++
++        # If this is a problematic fold, add to the start chars the
++        # folding-from characters and first folded-to character.
++        if ($found_locale_problematic) {
++            $loc_problem_folds_start->add_range($start, $end);
++            my $cp = hex $hex_folds[0];
++            $loc_problem_folds_start->add_range($cp, $cp);
+         }
+     }
+ 
+-- 
+2.49.1
+
diff --git a/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch b/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch
new file mode 100644
index 00000000..8d69ace8
--- /dev/null
+++ b/steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch
@@ -0,0 +1,39 @@
+SPDX-FileCopyrightText: 2013 Karl Williamson <public@khwilliamson.com>
+SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
+
+SPDX-License-Identifier: Artistic-1.0
+
+This renaming is required for 5.22.
+
+It is a manual port of 1fdd5e539a9.
+
+diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
+index 9c3dd7c710..2349300626 100644
+--- perl-5.18.4/lib/Unicode/UCD.pm
++++ perl-5.18.4/lib/Unicode/UCD.pm
+@@ -25,6 +25,7 @@ our @EXPORT_OK = qw(charinfo
+                     prop_invlist
+                     prop_invmap
+                     MAX_CP
++                    search_invlist
+                 );
+ 
+ use Carp;
+@@ -2261,7 +2261,7 @@ sub prop_invlist ($;$) {
+     return @invlist;
+ }
+ 
+-sub _search_invlist {
++sub search_invlist {
+     # Find the range in the inversion list which contains a code point; that
+     # is, find i such that l[i] <= code_point < l[i+1].  Returns undef if no
+     # such i.
+@@ -3411,7 +3411,7 @@ RETRY:
+                 }
+ 
+                 # Find the range that the override applies to.
+-                my $i = _search_invlist(\@invlist, $cp);
++                my $i = search_invlist(\@invlist, $cp);
+                 if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
+                     croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+                 }