mirror of
https://github.com/fosslinux/live-bootstrap.git
synced 2026-03-02 01:18:08 +01:00
Prepare perl 5.18.4 for 5.22
This commit is contained in:
parent
863406ba04
commit
dcce75dd79
8 changed files with 681 additions and 1 deletions
|
|
@ -34,6 +34,10 @@ src_prepare() {
|
|||
perl regen/unicode_constants.pl
|
||||
perl regen/regcharclass.pl
|
||||
|
||||
# Change the name, patching the generator script is not easy
|
||||
# Because of 0005-Move-an-inversion-list-generation-to-mktables.patch
|
||||
sed -i "s/_Perl_Multi_Char_Folds_invlist/_Perl_Folds_To_Multi_Char_invlist/" charclass_invlists.h
|
||||
|
||||
# regenerate configure
|
||||
ln -s ../metaconfig*/.package .
|
||||
ln -s ../metaconfig*/U .
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
SPDX-FileCopyrightText: 2012 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
|
|
|||
|
|
@ -0,0 +1,219 @@
|
|||
SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
This function is required for 5.22.
|
||||
|
||||
From 6bf3612f9c9b0788de8adf06539b41c64695c014 Mon Sep 17 00:00:00 2001
|
||||
From: Karl Williamson <khw@cpan.org>
|
||||
Date: Tue, 27 Jan 2015 15:08:08 -0700
|
||||
Subject: [PATCH] Unicode::UCD: Add prop_values() function
|
||||
|
||||
This new function returns the input property's possible values.
|
||||
---
|
||||
lib/Unicode/UCD.pm | 125 +++++++++++++++++++++++++++++++++++++--------
|
||||
2 files changed, 132 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
|
||||
index 9c3dd7c710..7033128ae5 100644
|
||||
--- perl-5.18.4/lib/Unicode/UCD.pm
|
||||
+++ perl-5.18.4/lib/Unicode/UCD.pm
|
||||
@@ -22,6 +20,7 @@ our @EXPORT_OK = qw(charinfo
|
||||
num
|
||||
prop_aliases
|
||||
prop_value_aliases
|
||||
+ prop_values
|
||||
prop_invlist
|
||||
prop_invmap
|
||||
MAX_CP
|
||||
@@ -73,6 +72,9 @@ Unicode::UCD - Unicode character database
|
||||
use Unicode::UCD 'prop_value_aliases';
|
||||
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
|
||||
|
||||
+ use Unicode::UCD 'prop_values';
|
||||
+ my @all_EA_short_names = prop_values("East_Asian_Width");
|
||||
+
|
||||
use Unicode::UCD 'prop_invlist';
|
||||
my @puncts = prop_invlist("gc=punctuation");
|
||||
|
||||
@@ -730,6 +732,9 @@ names>).
|
||||
L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
|
||||
different type of data structure.
|
||||
|
||||
+L<prop_values("Block")|/prop_values()> can be used to get all
|
||||
+the known new-style block names as a list, without the code point ranges.
|
||||
+
|
||||
See also L</Blocks versus Scripts>.
|
||||
|
||||
=cut
|
||||
@@ -752,6 +757,9 @@ the values.
|
||||
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
|
||||
different type of data structure.
|
||||
|
||||
+L<C<prop_values("Script")>|/prop_values()> can be used to get all
|
||||
+the known script names as a list, without the code point ranges.
|
||||
+
|
||||
See also L</Blocks versus Scripts>.
|
||||
|
||||
=cut
|
||||
@@ -835,8 +843,9 @@ from the long names to the short names. The general category is the
|
||||
one returned from
|
||||
L</charinfo()> under the C<category> key.
|
||||
|
||||
-The L</prop_value_aliases()> function can be used to get all the synonyms of
|
||||
-the category name.
|
||||
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
|
||||
+alternative to this function; the first returning a simple list of the short
|
||||
+category names; and the second gets all the synonyms of a given category name.
|
||||
|
||||
=cut
|
||||
|
||||
@@ -880,8 +889,10 @@ the Unicode TR9 is recommended reading:
|
||||
L<http://www.unicode.org/reports/tr9/>
|
||||
(as of Unicode 5.0.0)
|
||||
|
||||
-The L</prop_value_aliases()> function can be used to get all the synonyms of
|
||||
-the bidi type name.
|
||||
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
|
||||
+alternative to this function; the first returning a simple list of the short
|
||||
+bidi type names; and the second gets all the synonyms of a given bidi type
|
||||
+name.
|
||||
|
||||
=cut
|
||||
|
||||
@@ -1864,6 +1875,79 @@ sub prop_aliases ($) {
|
||||
|
||||
=pod
|
||||
|
||||
+=head2 B<prop_values()>
|
||||
+
|
||||
+ use Unicode::UCD 'prop_values';
|
||||
+
|
||||
+ print "AHex values are: ", join(", ", prop_values("AHex")),
|
||||
+ "\n";
|
||||
+ prints:
|
||||
+ AHex values are: N, Y
|
||||
+
|
||||
+Some Unicode properties have a restricted set of legal values. For example,
|
||||
+all binary properties are restricted to just C<true> or C<false>; and there
|
||||
+are only a few dozen possible General Categories. Use C<prop_values>
|
||||
+to find out if a given property is one such, and if so, to get a list of the
|
||||
+values:
|
||||
+
|
||||
+ print join ", ", prop_values("NFC_Quick_Check");
|
||||
+ prints:
|
||||
+ M, N, Y
|
||||
+
|
||||
+If the property doesn't have such a restricted set, C<undef> is returned.
|
||||
+
|
||||
+There are usually several synonyms for each possible value. Use
|
||||
+L</prop_value_aliases()> to access those.
|
||||
+
|
||||
+Case, white space, hyphens, and underscores are ignored in the input property
|
||||
+name (except for the trailing underscore in the old-form grandfathered-in
|
||||
+general category property value C<"L_">, which is better written as C<"LC">).
|
||||
+
|
||||
+If the property name is unknown, C<undef> is returned. Note that Perl typically
|
||||
+recognizes property names in regular expressions with an optional C<"Is_">
|
||||
+(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
|
||||
+This function does not recognize those in the property parameter, returning
|
||||
+C<undef>.
|
||||
+
|
||||
+For the block property, new-style block names are returned (see
|
||||
+L</Old-style versus new-style block names>).
|
||||
+
|
||||
+C<prop_values> does not know about any user-defined properties, and
|
||||
+will return C<undef> if called with one of those.
|
||||
+
|
||||
+=cut
|
||||
+
|
||||
+# These are created by mktables for this module and stored in unicore/UCD.pl
|
||||
+# where their structures are described.
|
||||
+our %loose_to_standard_value;
|
||||
+our %prop_value_aliases;
|
||||
+
|
||||
+sub prop_values ($) {
|
||||
+ my $prop = shift;
|
||||
+ return undef unless defined $prop;
|
||||
+
|
||||
+ require "unicore/UCD.pl";
|
||||
+ require "utf8_heavy.pl";
|
||||
+
|
||||
+ # Find the property name synonym that's used as the key in other hashes,
|
||||
+ # which is element 0 in the returned list.
|
||||
+ ($prop) = prop_aliases($prop);
|
||||
+ return undef if ! $prop;
|
||||
+ $prop = utf8::_loose_name(lc $prop);
|
||||
+
|
||||
+ # Here is a legal property.
|
||||
+ return undef unless exists $prop_value_aliases{$prop};
|
||||
+ my @return;
|
||||
+ foreach my $value_key (sort { lc $a cmp lc $b }
|
||||
+ keys %{$prop_value_aliases{$prop}})
|
||||
+ {
|
||||
+ push @return, $prop_value_aliases{$prop}{$value_key}[0];
|
||||
+ }
|
||||
+ return @return;
|
||||
+}
|
||||
+
|
||||
+=pod
|
||||
+
|
||||
=head2 B<prop_value_aliases()>
|
||||
|
||||
use Unicode::UCD 'prop_value_aliases';
|
||||
@@ -1877,7 +1961,7 @@ sub prop_aliases ($) {
|
||||
print "The short name is $short_name\n";
|
||||
print "The other aliases are: ", join(", ", @other_names), "\n";
|
||||
|
||||
- prints:
|
||||
+ prints:
|
||||
The full name is Punctuation
|
||||
The short name is P
|
||||
The other aliases are: Punct
|
||||
@@ -1886,18 +1970,20 @@ Some Unicode properties have a restricted set of legal values. For example,
|
||||
all binary properties are restricted to just C<true> or C<false>; and there
|
||||
are only a few dozen possible General Categories.
|
||||
|
||||
-For such properties, there are usually several synonyms for each possible
|
||||
-value. For example, in binary properties, I<truth> can be represented by any of
|
||||
-the strings "Y", "Yes", "T", or "True"; and the General Category
|
||||
-"Punctuation" by that string, or "Punct", or simply "P".
|
||||
+You can use L</prop_values()> to find out if a given property is one which has
|
||||
+a restricted set of values, and if so, what those values are. But usually
|
||||
+each value actually has several synonyms. For example, in binary properties,
|
||||
+I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True";
|
||||
+and the General Category "Punctuation" by that string, or "Punct", or simply
|
||||
+"P".
|
||||
|
||||
Like property names, there is typically at least a short name for each such
|
||||
-property-value, and a long name. If you know any name of the property-value,
|
||||
-you can use C<prop_value_aliases>() to get the long name (when called in
|
||||
-scalar context), or a list of all the names, with the short name in the 0th
|
||||
-element, the long name in the next element, and any other synonyms in the
|
||||
-remaining elements, in no particular order, except that any all-numeric
|
||||
-synonyms will be last.
|
||||
+property-value, and a long name. If you know any name of the property-value
|
||||
+(which you can get by L</prop_values()>), you can use C<prop_value_aliases>()
|
||||
+to get the long name (when called in scalar context), or a list of all the
|
||||
+names, with the short name in the 0th element, the long name in the next
|
||||
+element, and any other synonyms in the remaining elements, in no particular
|
||||
+order, except that any all-numeric synonyms will be last.
|
||||
|
||||
The long name is returned in a form nicely capitalized, suitable for printing.
|
||||
|
||||
@@ -1926,11 +2012,6 @@ will return C<undef> if called with one of those.
|
||||
|
||||
=cut
|
||||
|
||||
-# These are created by mktables for this routine and stored in unicore/UCD.pl
|
||||
-# where their structures are described.
|
||||
-our %loose_to_standard_value;
|
||||
-our %prop_value_aliases;
|
||||
-
|
||||
sub prop_value_aliases ($$) {
|
||||
my ($prop, $value) = @_;
|
||||
return unless defined $prop && defined $value;
|
||||
--
|
||||
2.49.1
|
||||
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
Again, we need this for 5.22.
|
||||
|
||||
From b86e5545be9b2e891e3ee8e2fba38b799b1836fc Mon Sep 17 00:00:00 2001
|
||||
From: Nicholas Clark <nick@ccl4.org>
|
||||
Date: Sun, 7 Jul 2013 15:12:42 +0200
|
||||
Subject: [PATCH] Refactor ExtUtils::Embed to work with miniperl.
|
||||
|
||||
Remove the use of FileHandle, which relies on IO, and XS module.
|
||||
Only load Getopt::Std if it is needed (in xsinit()), to avoid needing to add
|
||||
Getopt::Std to lib/buildcustomize.pl
|
||||
Require File::Spec instead of using it, as it exports nothing, so there is no
|
||||
benefit to using it (but it costs a BEGIN block).
|
||||
---
|
||||
lib/ExtUtils/Embed.pm | 12 ++++++------
|
||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
index 9710630e51..758e24139c 100644
|
||||
--- perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
+++ perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
@@ -2,10 +2,8 @@ require 5.002;
|
||||
|
||||
package ExtUtils::Embed;
|
||||
require Exporter;
|
||||
-require FileHandle;
|
||||
use Config;
|
||||
-use Getopt::Std;
|
||||
-use File::Spec;
|
||||
+require File::Spec;
|
||||
|
||||
#Only when we need them
|
||||
#require ExtUtils::MakeMaker;
|
||||
@@ -18,7 +16,7 @@ use vars qw(@ISA @EXPORT $VERSION
|
||||
use strict;
|
||||
|
||||
# This is not a dual-life module, so no need for development version numbers
|
||||
-$VERSION = '1.30';
|
||||
+$VERSION = '1.31';
|
||||
|
||||
@ISA = qw(Exporter);
|
||||
@EXPORT = qw(&xsinit &ldopts
|
||||
@@ -54,7 +52,8 @@ sub xsinit {
|
||||
@mods = @$mods if $mods;
|
||||
}
|
||||
else {
|
||||
- getopts('o:s:');
|
||||
+ require Getopt::Std;
|
||||
+ Getopt::Std::getopts('o:s:');
|
||||
$file = $opt_o if defined $opt_o;
|
||||
$std = $opt_s if defined $opt_s;
|
||||
@mods = @ARGV;
|
||||
@@ -65,7 +64,8 @@ sub xsinit {
|
||||
$fh = \*STDOUT;
|
||||
}
|
||||
else {
|
||||
- $fh = new FileHandle "> $file";
|
||||
+ open $fh, '>', $file
|
||||
+ or die "Can't open '$file': $!";
|
||||
}
|
||||
|
||||
push(@mods, static_ext()) if defined $std;
|
||||
--
|
||||
2.49.1
|
||||
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
SPDX-FileCopyrightText: 2013 Nicholas Clark <nick@ccl4.org>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
Again, we need this for 5.22.
|
||||
|
||||
From 87c2fdd468989b9450aa393c4c715594cbba5711 Mon Sep 17 00:00:00 2001
|
||||
From: Nicholas Clark <nick@ccl4.org>
|
||||
Date: Mon, 8 Jul 2013 11:08:12 +0200
|
||||
Subject: [PATCH] Tweak ExtUtils::Embed's generated C code to be closer to
|
||||
ExtUtils::Miniperl.
|
||||
|
||||
Use #include "..." instead of #include <...> in xsi_header(), and don't add
|
||||
a trailing newline (and add a newline in xsinit() to compensate).
|
||||
Use four spaces instead of a tab for indenting.
|
||||
If there are no extensions and hence no calls to newXS() don't declare file[]
|
||||
and don't add a trailing newline.
|
||||
---
|
||||
lib/ExtUtils/Embed.pm | 21 ++++++++++++---------
|
||||
1 file changed, 12 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git perl-5.18.4/lib/ExtUtils/Embed.pm perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
index 9710630e51..52221cba53 100644
|
||||
--- perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
+++ perl-5.18.4/lib/ExtUtils/Embed.pm
|
||||
@@ -72,7 +72,7 @@ sub xsinit {
|
||||
@mods = grep(!$seen{$_}++, @mods);
|
||||
|
||||
print $fh &xsi_header();
|
||||
- print $fh "EXTERN_C void xs_init ($xsinit_proto);\n\n";
|
||||
+ print $fh "\nEXTERN_C void xs_init ($xsinit_proto);\n\n";
|
||||
print $fh &xsi_protos(@mods);
|
||||
|
||||
print $fh "\nEXTERN_C void\nxs_init($xsinit_proto)\n{\n";
|
||||
@@ -83,9 +83,9 @@ sub xsinit {
|
||||
|
||||
sub xsi_header {
|
||||
return <<EOF;
|
||||
-#include <EXTERN.h>
|
||||
-#include <perl.h>
|
||||
-
|
||||
+#include "EXTERN.h"
|
||||
+#include "perl.h"
|
||||
+#include "XSUB.h"
|
||||
EOF
|
||||
}
|
||||
|
||||
@@ -109,9 +109,12 @@ sub xsi_body {
|
||||
my(@exts) = @_;
|
||||
my($pname,@retval,%seen);
|
||||
my($dl) = canon('/','DynaLoader');
|
||||
- push(@retval, "\tchar *file = __FILE__;\n");
|
||||
- push(@retval, "\tdXSUB_SYS;\n") if $] > 5.002;
|
||||
- push(@retval, "\n");
|
||||
+ push(@retval, " static const char file[] = __FILE__;\n")
|
||||
+ if @exts;
|
||||
+ push(@retval, " dXSUB_SYS;\n");
|
||||
+ push(@retval, " PERL_UNUSED_CONTEXT;\n");
|
||||
+ push(@retval, "\n")
|
||||
+ if @exts;
|
||||
|
||||
foreach $_ (@exts){
|
||||
my($pname) = canon('/', $_);
|
||||
@@ -121,10 +124,10 @@ sub xsi_body {
|
||||
if ($pname eq $dl){
|
||||
# Must NOT install 'DynaLoader::boot_DynaLoader' as 'bootstrap'!
|
||||
# boot_DynaLoader is called directly in DynaLoader.pm
|
||||
- $ccode = "\t/* DynaLoader is a special case */\n\tnewXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
|
||||
+ $ccode = " /* DynaLoader is a special case */\n newXS(\"${mname}::boot_${cname}\", boot_${cname}, file);\n";
|
||||
push(@retval, $ccode) unless $seen{$ccode}++;
|
||||
} else {
|
||||
- $ccode = "\tnewXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
|
||||
+ $ccode = " newXS(\"${mname}::bootstrap\", boot_${cname}, file);\n";
|
||||
push(@retval, $ccode) unless $seen{$ccode}++;
|
||||
}
|
||||
}
|
||||
--
|
||||
2.49.1
|
||||
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
SPDX-FileCopyrightText: 2015 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
This renaming is expected by 5.22. The resulting modifications to
|
||||
charclass_invlists.h are done in the script.
|
||||
|
||||
From c75f24e6fc1fa9676693e4e42f199e542b2b2549 Mon Sep 17 00:00:00 2001
|
||||
From: Karl Williamson <public@khwilliamson.com>
|
||||
Date: Thu, 23 Jan 2014 20:34:15 -0700
|
||||
Subject: [PATCH 1/2] Move an inversion list generation to mktables
|
||||
|
||||
Prior to this patch, this was in regen/mk_invlists.pl, but future
|
||||
commits will want it to also be used by the header generated by
|
||||
regen/regcharclass.pl, so use a common source so the logic doesn't have
|
||||
to be duplicated.
|
||||
---
|
||||
lib/unicore/mktables | 22 ++++++++++++++++++++++
|
||||
regcomp.c | 3 ++-
|
||||
2 files changed, 24 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
|
||||
index 808760d002..a5c0d8930e 100644
|
||||
--- perl-5.18.4/lib/unicore/mktables
|
||||
+++ perl-5.18.4/lib/unicore/mktables
|
||||
@@ -13321,6 +13321,28 @@ sub compile_perl() {
|
||||
}
|
||||
$PosixXDigit->add_description('[0-9A-Fa-f]');
|
||||
|
||||
+ my $folds_to_multi_char = $perl->add_match_table(
|
||||
+ "_Perl_Folds_To_Multi_Char",
|
||||
+ Description =>
|
||||
+ "Code points whose fold is a string of more than one character",
|
||||
+ );
|
||||
+
|
||||
+ foreach my $range (property_ref('Case_Folding')->ranges) {
|
||||
+ my $start = $range->start;
|
||||
+ my $end = $range->end;
|
||||
+ $any_folds->add_range($start, $end);
|
||||
+
|
||||
+ my @hex_code_points = split " ", $range->value;
|
||||
+ if (@hex_code_points > 1) {
|
||||
+ $folds_to_multi_char->add_range($start, $end);
|
||||
+ }
|
||||
+
|
||||
+ foreach my $i (0 .. @hex_code_points - 1) {
|
||||
+ my $code_point = hex $hex_code_points[$i];
|
||||
+ $any_folds->add_range($code_point, $code_point);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
my $dt = property_ref('Decomposition_Type');
|
||||
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
|
||||
Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
|
||||
diff --git perl-5.18.4/regcomp.c perl-5.18.4/regcomp.c
|
||||
index 0841f172e5..e839e3e765 100644
|
||||
--- perl-5.18.4/regcomp.c
|
||||
+++ perl-5.18.4/regcomp.c
|
||||
@@ -5566,7 +5566,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
|
||||
PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
|
||||
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
|
||||
|
||||
- PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
|
||||
+ PL_HasMultiCharFold =
|
||||
+ _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
|
||||
}
|
||||
#endif
|
||||
|
||||
--
|
||||
2.49.1
|
||||
|
||||
|
|
@ -0,0 +1,199 @@
|
|||
SPDX-FileCopyrightText: 2014 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
The _Perl_Any_Folds and _Perl_Problematic_Locale_Folds tables are expected
|
||||
by 5.22.
|
||||
|
||||
Their addition is part of a giga commit (~1500 lines changed); we take
|
||||
just that small part of the commit.
|
||||
|
||||
From 10d28d6a24e2a3c41270c43203be5502a586b2f9 Mon Sep 17 00:00:00 2001
|
||||
From: Samuel Tyler <fosslinux@aussies.space>
|
||||
Date: Thu, 28 Aug 2025 21:54:12 +1000
|
||||
Subject: [PATCH 2/2] Partial backport: Work properly under UTF-8 LC_CTYPE
|
||||
locales
|
||||
|
||||
[... original commit message follows, mostly irrelevant]
|
||||
|
||||
This large (sorry, I couldn't figure out how to meaningfully split it
|
||||
up) commit causes Perl to fully support LC_CTYPE operations (case
|
||||
changing, character classification) in UTF-8 locales.
|
||||
|
||||
As a side effect it resolves [perl #56820].
|
||||
|
||||
The basics are easy, but there were a lot of details, and one
|
||||
troublesome edge case discussed below.
|
||||
|
||||
What essentially happens is that when the locale is changed to a UTF-8
|
||||
one, a global variable is set TRUE (FALSE when changed to a non-UTF-8
|
||||
locale). Within the scope of 'use locale', this variable is checked,
|
||||
and if TRUE, the code that Perl uses for non-locale behavior is used
|
||||
instead of the code for locale behavior. Since Perl's internal
|
||||
representation is UTF-8, we get UTF-8 behavior for a UTF-8 locale.
|
||||
|
||||
More work had to be done for regular expressions. There are three
|
||||
cases.
|
||||
|
||||
1) The character classes \w, [[:punct:]] needed no extra work, as
|
||||
the changes fall out from the base work.
|
||||
|
||||
2) Strings that are to be matched case-insensitively. These form
|
||||
EXACTFL regops (nodes). Notice that if such a string contains only
|
||||
characters above-Latin1 that match only themselves, that the node can be
|
||||
downgraded to an EXACT-only node, which presents better optimization
|
||||
possibilities, as we now have a fixed string known at compile time to be
|
||||
required to be in the target string to match. Similarly if all
|
||||
characters in the string match only other above-Latin1 characters
|
||||
case-insensitively, the node can be downgraded to a regular EXACTFU node
|
||||
(match, folding, using Unicode, not locale, rules). The code changes
|
||||
for this could be done without accepting UTF-8 locales fully, but there
|
||||
were edge cases which needed to be handled differently if I stopped
|
||||
there, so I continued on.
|
||||
|
||||
In an EXACTFL node, all such characters are now folded at compile time
|
||||
(just as before this commit), while the other characters whose folds are
|
||||
locale-dependent are left unfolded. This means that they have to be
|
||||
folded at execution time based on the locale in effect at the moment.
|
||||
Again, this isn't a change from before. The difference is that now some
|
||||
of the folds that need to be done at execution time (in regexec) are
|
||||
potentially multi-char. Some of the code in regexec was trivial to
|
||||
extend to account for this because of existing infrastructure, but the
|
||||
part dealing with regex quantifiers, had to have more work.
|
||||
|
||||
Also the code that joins EXACTish nodes together had to be expanded to
|
||||
account for the possibility of multi-character folds within locale
|
||||
handling. This was fairly easy, because it already has infrastructure
|
||||
to handle these under somewhat different circumstances.
|
||||
|
||||
3) In bracketed character classes, represented by ANYOF nodes, a new
|
||||
inversion list was created giving the characters that should be matched
|
||||
by this node when the runtime locale is UTF-8. The list is ignored
|
||||
except under that circumstance. To do this, I created a new ANYOF type
|
||||
which has an extra SV for the inversion list.
|
||||
|
||||
The edge case that caused the most difficulty is folding involving the
|
||||
MICRO SIGN, U+00B5. It folds to the GREEK SMALL LETTER MU, as does the
|
||||
GREEK CAPITAL LETTER MU. The MICRO SIGN is the only 0-255 range
|
||||
character that folds to outside that range. The issue is that it
|
||||
doesn't naturally fall out that it will match the CAP MU. If we let the
|
||||
CAP MU fold to the small mu at compile time (which it can because both
|
||||
are above-Latin1 and so the fold is the same no matter what locale is in
|
||||
effect), it could appear that the regnode can be downgraded away from
|
||||
EXACTFL to EXACTFU, but doing so would cause the MICRO SIGN to not case
|
||||
insensitively match the CAP MU. This could be special cased in regcomp
|
||||
and regexec, but I wanted to avoid that. Instead the mktables tables
|
||||
are set up to include the CAP MU as a character whose presence forbids
|
||||
the downgrading, so the special casing is in mktables, and not in the C
|
||||
code.
|
||||
---
|
||||
lib/unicore/mktables | 79 ++++++++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 73 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git perl-5.18.4/lib/unicore/mktables perl-5.18.4/lib/unicore/mktables
|
||||
index a5c0d8930e..4b34b3c338 100644
|
||||
--- perl-5.18.4/lib/unicore/mktables
|
||||
+++ perl-5.18.4/lib/unicore/mktables
|
||||
@@ -13321,25 +13321,92 @@ sub compile_perl() {
|
||||
}
|
||||
$PosixXDigit->add_description('[0-9A-Fa-f]');
|
||||
|
||||
+ my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
|
||||
+ Description => "Code points that particpate in some fold",
|
||||
+ );
|
||||
+ my $loc_problem_folds = $perl->add_match_table(
|
||||
+ "_Perl_Problematic_Locale_Folds",
|
||||
+ Description =>
|
||||
+ "Code points that are in some way problematic under locale",
|
||||
+ );
|
||||
+
|
||||
+ # This allows regexec.c to skip some work when appropriate. Some of the
|
||||
+ # entries in _Perl_Problematic_Locale_Folds are multi-character folds.
|
||||
+ my $loc_problem_folds_start = $perl->add_match_table(
|
||||
+ "_Perl_Problematic_Locale_Foldeds_Start",
|
||||
+ Description =>
|
||||
+ "The first character of every sequence in _Perl_Problematic_Locale_Folds",
|
||||
+ );
|
||||
+
|
||||
+ my $cf = property_ref('Case_Folding');
|
||||
+
|
||||
+ # Every character 0-255 is problematic because what each folds to depends
|
||||
+ # on the current locale
|
||||
+ $loc_problem_folds->add_range(0, 255);
|
||||
+ $loc_problem_folds_start += $loc_problem_folds;
|
||||
+
|
||||
+ # Also problematic are anything these fold to outside the range. Likely
|
||||
+ # forever the only thing folded to by these outside the 0-255 range is the
|
||||
+ # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
|
||||
+ # completely general, which should catch any unexpected changes or errors.
|
||||
+ # We look at each code point 0-255, and add its fold (including each part
|
||||
+ # of a multi-char fold) to the list. See the commit message for these
|
||||
+ # changes for a more complete description of the MU issue.
|
||||
+ foreach my $range ($loc_problem_folds->ranges) {
|
||||
+ foreach my $code_point($range->start .. $range->end) {
|
||||
+ my $fold_range = $cf->containing_range($code_point);
|
||||
+ next unless defined $fold_range;
|
||||
+
|
||||
+ my @hex_folds = split " ", $fold_range->value;
|
||||
+ my $start_cp = hex $hex_folds[0];
|
||||
+ foreach my $i (0 .. @hex_folds - 1) {
|
||||
+ my $cp = hex $hex_folds[$i];
|
||||
+ next unless $cp > 255; # Already have the < 256 ones
|
||||
+
|
||||
+ $loc_problem_folds->add_range($cp, $cp);
|
||||
+ $loc_problem_folds_start->add_range($start_cp, $start_cp);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
my $folds_to_multi_char = $perl->add_match_table(
|
||||
"_Perl_Folds_To_Multi_Char",
|
||||
Description =>
|
||||
"Code points whose fold is a string of more than one character",
|
||||
);
|
||||
|
||||
- foreach my $range (property_ref('Case_Folding')->ranges) {
|
||||
+ # Look through all the known folds to populate these tables.
|
||||
+ foreach my $range ($cf->ranges) {
|
||||
my $start = $range->start;
|
||||
my $end = $range->end;
|
||||
$any_folds->add_range($start, $end);
|
||||
|
||||
- my @hex_code_points = split " ", $range->value;
|
||||
- if (@hex_code_points > 1) {
|
||||
+ my @hex_folds = split " ", $range->value;
|
||||
+ if (@hex_folds > 1) { # Is multi-char fold
|
||||
$folds_to_multi_char->add_range($start, $end);
|
||||
}
|
||||
|
||||
- foreach my $i (0 .. @hex_code_points - 1) {
|
||||
- my $code_point = hex $hex_code_points[$i];
|
||||
- $any_folds->add_range($code_point, $code_point);
|
||||
+ my $found_locale_problematic = 0;
|
||||
+
|
||||
+ # Look at each of the folded-to characters...
|
||||
+ foreach my $i (0 .. @hex_folds - 1) {
|
||||
+ my $cp = hex $hex_folds[$i];
|
||||
+ $any_folds->add_range($cp, $cp);
|
||||
+
|
||||
+ # The fold is problematic if any of the folded-to characters is
|
||||
+ # already considered problematic.
|
||||
+ if ($loc_problem_folds->contains($cp)) {
|
||||
+ $loc_problem_folds->add_range($start, $end);
|
||||
+ $found_locale_problematic = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ # If this is a problematic fold, add to the start chars the
|
||||
+ # folding-from characters and first folded-to character.
|
||||
+ if ($found_locale_problematic) {
|
||||
+ $loc_problem_folds_start->add_range($start, $end);
|
||||
+ my $cp = hex $hex_folds[0];
|
||||
+ $loc_problem_folds_start->add_range($cp, $cp);
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.49.1
|
||||
|
||||
39
steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch
Normal file
39
steps/perl-5.18.4/patches/Unicode-UCD-search_invlist.patch
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
SPDX-FileCopyrightText: 2013 Karl Williamson <public@khwilliamson.com>
|
||||
SPDX-FileCopyrightText: 2025 fosslinux <fosslinux@aussies.space>
|
||||
|
||||
SPDX-License-Identifier: Artistic-1.0
|
||||
|
||||
This renaming is required for 5.22.
|
||||
|
||||
It is a manual port of 1fdd5e539a9.
|
||||
|
||||
diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm
|
||||
index 9c3dd7c710..2349300626 100644
|
||||
--- perl-5.18.4/lib/Unicode/UCD.pm
|
||||
+++ perl-5.18.4/lib/Unicode/UCD.pm
|
||||
@@ -25,6 +25,7 @@ our @EXPORT_OK = qw(charinfo
|
||||
prop_invlist
|
||||
prop_invmap
|
||||
MAX_CP
|
||||
+ search_invlist
|
||||
);
|
||||
|
||||
use Carp;
|
||||
@@ -2261,7 +2261,7 @@ sub prop_invlist ($;$) {
|
||||
return @invlist;
|
||||
}
|
||||
|
||||
-sub _search_invlist {
|
||||
+sub search_invlist {
|
||||
# Find the range in the inversion list which contains a code point; that
|
||||
# is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no
|
||||
# such i.
|
||||
@@ -3411,7 +3411,7 @@ RETRY:
|
||||
}
|
||||
|
||||
# Find the range that the override applies to.
|
||||
- my $i = _search_invlist(\@invlist, $cp);
|
||||
+ my $i = search_invlist(\@invlist, $cp);
|
||||
if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
|
||||
croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue