From 1bfcdd572f51f5db59c6c3b91f28c5b3141215bb Mon Sep 17 00:00:00 2001
From: Samuel Tyler <fosslinux@aussies.space>
Date: Fri, 22 Aug 2025 17:24:24 +1000
Subject: [PATCH] Revert "regexec.c: Use new macros instead of swashes"

This reverts commit 45fdf108c5f4d5c719b8d2f7389c81e54795bad7.
---
 embed.fnc                  |   9 ++
 embedvar.h                 |   7 ++
 intrpvar.h                 |   7 ++
 regen/unicode_constants.pl |   3 +
 regexec.c                  |  78 ++++++++++------
 sv.c                       |   8 ++
 utf8.c                     | 180 +++++++++++++++++++++++++++++++++++++
 7 files changed, 265 insertions(+), 27 deletions(-)

diff --git perl-5.17.4/embed.fnc perl-5.17.4/embed.fnc
index 0db9300b3b..e45e6889c9 100644
--- perl-5.17.4/embed.fnc
+++ perl-5.17.4/embed.fnc
@@ -660,7 +660,16 @@ ApR	|bool	|is_utf8_punct	|NN const U8 *p
 ApR	|bool	|is_utf8_xdigit	|NN const U8 *p
 ApR	|bool	|is_utf8_mark	|NN const U8 *p
 EXpR	|bool	|is_utf8_X_extend	|NN const U8 *p
+EXpR	|bool	|is_utf8_X_prepend	|NN const U8 *p
 EXpR	|bool	|is_utf8_X_regular_begin|NN const U8 *p
+EXpR	|bool	|is_utf8_X_special_begin|NN const U8 *p
+EXpR	|bool	|is_utf8_X_L		|NN const U8 *p
+EXpR	|bool	|is_utf8_X_RI		|NN const U8 *p
+:not currently used EXpR	|bool	|is_utf8_X_LV		|NN const U8 *p
+EXpR	|bool	|is_utf8_X_LVT		|NN const U8 *p
+EXpR	|bool	|is_utf8_X_LV_LVT_V	|NN const U8 *p
+EXpR	|bool	|is_utf8_X_T		|NN const U8 *p
+EXpR	|bool	|is_utf8_X_V		|NN const U8 *p
 : Used in perly.y
 p	|OP*	|jmaybe		|NN OP *o
 : Used in pp.c 
diff --git perl-5.17.4/embedvar.h perl-5.17.4/embedvar.h
index b9fabab437..f306f2f2f7 100644
--- perl-5.17.4/embedvar.h
+++ perl-5.17.4/embedvar.h
@@ -353,9 +353,16 @@
 #define PL_unitcheckav_save	(vTHX->Iunitcheckav_save)
 #define PL_unlockhook		(vTHX->Iunlockhook)
 #define PL_unsafe		(vTHX->Iunsafe)
+#define PL_utf8_X_L		(vTHX->Iutf8_X_L)
 #define PL_utf8_X_LVT		(vTHX->Iutf8_X_LVT)
+#define PL_utf8_X_LV_LVT_V	(vTHX->Iutf8_X_LV_LVT_V)
+#define PL_utf8_X_RI		(vTHX->Iutf8_X_RI)
+#define PL_utf8_X_T		(vTHX->Iutf8_X_T)
+#define PL_utf8_X_V		(vTHX->Iutf8_X_V)
 #define PL_utf8_X_extend	(vTHX->Iutf8_X_extend)
+#define PL_utf8_X_prepend	(vTHX->Iutf8_X_prepend)
 #define PL_utf8_X_regular_begin	(vTHX->Iutf8_X_regular_begin)
+#define PL_utf8_X_special_begin	(vTHX->Iutf8_X_special_begin)
 #define PL_utf8_alnum		(vTHX->Iutf8_alnum)
 #define PL_utf8_alpha		(vTHX->Iutf8_alpha)
 #define PL_utf8_blank		(vTHX->Iutf8_blank)
diff --git perl-5.17.4/intrpvar.h perl-5.17.4/intrpvar.h
index 40a6aa1e9d..e306ebbb31 100644
--- perl-5.17.4/intrpvar.h
+++ perl-5.17.4/intrpvar.h
@@ -628,7 +628,14 @@ PERLVAR(I, utf8_xdigit,	SV *)
 PERLVAR(I, utf8_mark,	SV *)
 PERLVAR(I, utf8_X_regular_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
+PERLVAR(I, utf8_X_prepend, SV *)
+PERLVAR(I, utf8_X_special_begin, SV *)
+PERLVAR(I, utf8_X_L,	SV *)
 PERLVAR(I, utf8_X_LVT,	SV *)
+PERLVAR(I, utf8_X_RI,	SV *)
+PERLVAR(I, utf8_X_T,	SV *)
+PERLVAR(I, utf8_X_V,	SV *)
+PERLVAR(I, utf8_X_LV_LVT_V, SV *)
 PERLVAR(I, utf8_toupper, SV *)
 PERLVAR(I, utf8_totitle, SV *)
 PERLVAR(I, utf8_tolower, SV *)
diff --git perl-5.17.4/regen/unicode_constants.pl perl-5.17.4/regen/unicode_constants.pl
index e3d588a599..ab1ab8f326 100644
--- perl-5.17.4/regen/unicode_constants.pl
+++ perl-5.17.4/regen/unicode_constants.pl
@@ -134,6 +134,9 @@ __DATA__
 03C5 first
 03C5 tail
 
+1100
+1160
+11A8
 2010 string
 D800 first FIRST_SURROGATE
 
diff --git perl-5.17.4/regexec.c perl-5.17.4/regexec.c
index f207cdaf88..2676ebb92e 100644
--- perl-5.17.4/regexec.c
+++ perl-5.17.4/regexec.c
@@ -145,7 +145,14 @@
         /* No asserts are done for some of these, in case called on a   */  \
         /* Unicode version in which they map to nothing */                  \
 	LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
 	LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
+	LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);	            \
+	LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
+	LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
+	LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
 
 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
 
@@ -4084,8 +4091,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    locinput += 2;
 		}
 		else {
-                    STRLEN len;
-
 		    /* In case have to backtrack to beginning, then match '.' */
 		    char *starting = locinput;
 
@@ -4094,12 +4099,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 		    LOAD_UTF8_CHARCLASS_GCB();
 
-                    /* Match (prepend)*   */
-                    while (locinput < PL_regeol
-                           && (len = is_GCB_Prepend_utf8(locinput)))
-                    {
-                        previous_prepend = locinput;
-                        locinput += len;
+                    /* Match (prepend)*, but don't bother trying if empty (as
+                     * being set to _undef indicates) */
+                    if (PL_utf8_X_prepend != &PL_sv_undef) {
+                        while (locinput < PL_regeol
+                               && swash_fetch(PL_utf8_X_prepend,
+                                              (U8*)locinput, utf8_target))
+                        {
+                            previous_prepend = locinput;
+                            locinput += UTF8SKIP(locinput);
+                        }
                     }
 
 		    /* As noted above, if we matched a prepend character, but
@@ -4109,7 +4118,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 			&& (locinput >=  PL_regeol
 			    || (! swash_fetch(PL_utf8_X_regular_begin,
 					     (U8*)locinput, utf8_target)
-			         && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
+			         && ! swash_fetch(PL_utf8_X_special_begin,
+					     (U8*)locinput, utf8_target)))
                         )
 		    {
 			locinput = previous_prepend;
@@ -4124,7 +4134,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                                     (U8*)locinput, utf8_target)) {
                         locinput += UTF8SKIP(locinput);
                     }
-                    else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
+                    else if (! swash_fetch(PL_utf8_X_special_begin,
+					(U8*)locinput, utf8_target))
+			{
 
 			/* Here did not match the required 'Begin' in the
 			 * second term.  So just match the very first
@@ -4136,20 +4148,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         /* Here is a special begin.  It can be composed of
                          * several individual characters.  One possibility is
                          * RI+ */
-                        if ((len = is_GCB_RI_utf8(locinput))) {
-                            locinput += len;
+                        if (swash_fetch(PL_utf8_X_RI,
+                                        (U8*)locinput, utf8_target))
+                        {
+                            locinput += UTF8SKIP(locinput);
                             while (locinput < PL_regeol
-                                   && (len = is_GCB_RI_utf8(locinput)))
+                                    && swash_fetch(PL_utf8_X_RI,
+                                                    (U8*)locinput, utf8_target))
                             {
-                                locinput += len;
+                                locinput += UTF8SKIP(locinput);
                             }
-                        } else if ((len = is_GCB_T_utf8(locinput))) {
-                            /* Another possibility is T+ */
-                            locinput += len;
+                        } else /* Another possibility is T+ */
+                               if (swash_fetch(PL_utf8_X_T,
+                                               (U8*)locinput, utf8_target))
+                        {
+                            locinput += UTF8SKIP(locinput);
                             while (locinput < PL_regeol
-                                && (len = is_GCB_T_utf8(locinput)))
+                                    && swash_fetch(PL_utf8_X_T,
+                                                    (U8*)locinput, utf8_target))
                             {
-                                locinput += len;
+                                locinput += UTF8SKIP(locinput);
                             }
                         } else {
 
@@ -4160,9 +4178,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
                             /* Match L*           */
                             while (locinput < PL_regeol
-                                   && (len = is_GCB_L_utf8(locinput)))
+                                    && swash_fetch(PL_utf8_X_L,
+                                                    (U8*)locinput, utf8_target))
                             {
-                                locinput += len;
+                                locinput += UTF8SKIP(locinput);
                             }
 
                             /* Here, have exhausted L*.  If the next character
@@ -4172,7 +4191,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                              * Are done. */
 
                             if (locinput < PL_regeol
-                                && is_GCB_LV_LVT_V_utf8(locinput))
+                                && swash_fetch(PL_utf8_X_LV_LVT_V,
+                                                (U8*)locinput, utf8_target))
                             {
 
                                 /* Otherwise keep going.  Must be LV, LVT or V.
@@ -4185,18 +4205,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                                      * V*     */
                                     locinput += UTF8SKIP(locinput);
                                     while (locinput < PL_regeol
-                                           && (len = is_GCB_V_utf8(locinput)))
+                                            && swash_fetch(PL_utf8_X_V,
+                                                           (U8*)locinput,
+                                                           utf8_target))
                                     {
-                                        locinput += len;
+                                        locinput += UTF8SKIP(locinput);
                                     }
                                 }
 
                                 /* And any of LV, LVT, or V can be followed
-                                 * by T*            */
+                                    * by T*            */
                                 while (locinput < PL_regeol
-                                       && (len = is_GCB_T_utf8(locinput)))
+                                        && swash_fetch(PL_utf8_X_T,
+                                                        (U8*)locinput,
+                                                        utf8_target))
                                 {
-                                    locinput += len;
+                                    locinput += UTF8SKIP(locinput);
                                 }
                             }
                         }
diff --git perl-5.17.4/sv.c perl-5.17.4/sv.c
index 89699be4b2..b22e728275 100644
--- perl-5.17.4/sv.c
+++ perl-5.17.4/sv.c
@@ -13379,7 +13379,15 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_mark	= sv_dup_inc(proto_perl->Iutf8_mark, param);
     PL_utf8_X_regular_begin	= sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
     PL_utf8_X_extend	= sv_dup_inc(proto_perl->Iutf8_X_extend, param);
+    PL_utf8_X_prepend	= sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
+    PL_utf8_X_special_begin	= sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
+    PL_utf8_X_L	= sv_dup_inc(proto_perl->Iutf8_X_L, param);
+    /*not currently used: PL_utf8_X_LV	= sv_dup_inc(proto_perl->Iutf8_X_LV, param);*/
     PL_utf8_X_LVT	= sv_dup_inc(proto_perl->Iutf8_X_LVT, param);
+    PL_utf8_X_RI	= sv_dup_inc(proto_perl->Iutf8_X_RI, param);
+    PL_utf8_X_T	= sv_dup_inc(proto_perl->Iutf8_X_T, param);
+    PL_utf8_X_V	= sv_dup_inc(proto_perl->Iutf8_X_V, param);
+    PL_utf8_X_LV_LVT_V	= sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param);
     PL_utf8_toupper	= sv_dup_inc(proto_perl->Iutf8_toupper, param);
     PL_utf8_totitle	= sv_dup_inc(proto_perl->Iutf8_totitle, param);
     PL_utf8_tolower	= sv_dup_inc(proto_perl->Iutf8_tolower, param);
diff --git perl-5.17.4/utf8.c perl-5.17.4/utf8.c
index 660002388f..2172d311b4 100644
--- perl-5.17.4/utf8.c
+++ perl-5.17.4/utf8.c
@@ -2229,6 +2229,186 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p)
     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
 }
 
+bool
+Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+{
+    /* If no code points in the Unicode version being worked on match
+     * GCB=Prepend, this will set PL_utf8_X_prepend to &PL_sv_undef during its
+     * first call.  Otherwise, it will set it to a swash created for it.
+     * swash_fetch() hence can't be used without checking first if it is valid
+     * to do so. */
+
+    dVAR;
+    bool initialized = cBOOL(PL_utf8_X_prepend);
+    bool ret;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
+
+    if (PL_utf8_X_prepend == &PL_sv_undef) {
+        return FALSE;
+    }
+
+    if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"))
+        || initialized)
+    {
+        return ret;
+    }
+
+    /* Here the code point being checked was not a prepend, and we hadn't
+     * initialized PL_utf8_X_prepend, so we don't know if it is just this
+     * particular input code point that didn't match, or if the table is
+     * completely empty. The is_utf8_common() call did the initialization, so
+     * we can inspect the swash's inversion list to find out.  If there are no
+     * elements in its inversion list, it's empty, and nothing will ever match,
+     * so set things up so we can skip the check in future calls. */
+    if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) {
+        SvREFCNT_dec(PL_utf8_X_prepend);
+        PL_utf8_X_prepend = &PL_sv_undef;
+    }
+
+    return FALSE;
+}
+
+bool
+Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN;
+
+    return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin");
+}
+
+bool
+Perl_is_utf8_X_L(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_L;
+
+    return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
+}
+
+bool
+Perl_is_utf8_X_RI(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_RI;
+
+    return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI");
+}
+
+/* These constants are for finding GCB=LV and GCB=LVT.  These are for the
+ * pre-composed Hangul syllables, which are all in a contiguous block and
+ * arranged there in such a way so as to facilitate alorithmic determination of
+ * their characteristics.  As such, they don't need a swash, but can be
+ * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
+ * is a GCB=LV */
+#define SBASE 0xAC00    /* Start of block */
+#define SCount 11172    /* Length of block */
+#define TCount 28
+
+#if 0   /* This routine is not currently used */
+bool
+Perl_is_utf8_X_LV(pTHX_ const U8 *p)
+{
+    /* Unlike most other similarly named routines here, this does not create a
+     * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
+
+    dVAR;
+
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV;
+
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
+        PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LV);
+            PL_utf8_X_LV = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LV != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
+}
+#endif
+
+bool
+Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
+{
+    /* Unlike most other similarly named routines here, this does not create a
+     * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
+
+    dVAR;
+
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
+
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
+        PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LVT);
+            PL_utf8_X_LVT = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LVT != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
+}
+
+bool
+Perl_is_utf8_X_T(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_T;
+
+    return is_utf8_common(p, &PL_utf8_X_T, "_X_GCB_T");
+}
+
+bool
+Perl_is_utf8_X_V(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_V;
+
+    return is_utf8_common(p, &PL_utf8_X_V, "_X_GCB_V");
+}
+
+bool
+Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
+
+    return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
+}
+
+bool
+Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+{
+    /* For exclusive use of pp_quotemeta() */
+
+    dVAR;
+
+    PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
+
+    return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
+}
+
 /*
 =for apidoc to_utf8_case
 
-- 
2.49.1