SPDX-FileCopyrightText: 2015 Karl Williamson SPDX-FileCopyrightText: 2025 Samuel Tyler SPDX-License-Identifier: Artistic-1.0 This function is required for 5.22. From 6bf3612f9c9b0788de8adf06539b41c64695c014 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 27 Jan 2015 15:08:08 -0700 Subject: [PATCH] Unicode::UCD: Add prop_values() function This new function returns the input property's possible values. --- lib/Unicode/UCD.pm | 125 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 132 insertions(+), 23 deletions(-) diff --git perl-5.18.4/lib/Unicode/UCD.pm perl-5.18.4/lib/Unicode/UCD.pm index 9c3dd7c710..7033128ae5 100644 --- perl-5.18.4/lib/Unicode/UCD.pm +++ perl-5.18.4/lib/Unicode/UCD.pm @@ -22,6 +20,7 @@ our @EXPORT_OK = qw(charinfo num prop_aliases prop_value_aliases + prop_values prop_invlist prop_invmap MAX_CP @@ -73,6 +72,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'prop_value_aliases'; my @gc_punct_names = prop_value_aliases("Gc", "Punct"); + use Unicode::UCD 'prop_values'; + my @all_EA_short_names = prop_values("East_Asian_Width"); + use Unicode::UCD 'prop_invlist'; my @puncts = prop_invlist("gc=punctuation"); @@ -730,6 +732,9 @@ names>). L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<prop_values("Block")|/prop_values()> can be used to get all +the known new-style block names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -752,6 +757,9 @@ the values. L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<C<prop_values("Script")>|/prop_values()> can be used to get all +the known script names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -835,8 +843,9 @@ from the long names to the short names. The general category is the one returned from L</charinfo()> under the C<category> key. -The L</prop_value_aliases()> function can be used to get all the synonyms of -the category name.
+The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +category names; and the second gets all the synonyms of a given category name. =cut @@ -880,8 +889,10 @@ the Unicode TR9 is recommended reading: L<http://www.unicode.org/reports/tr9/> (as of Unicode 5.0.0) -The L</prop_value_aliases()> function can be used to get all the synonyms of -the bidi type name. +The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +bidi type names; and the second gets all the synonyms of a given bidi type +name. =cut @@ -1864,6 +1875,79 @@ sub prop_aliases ($) { =pod +=head2 B<prop_values()> + + use Unicode::UCD 'prop_values'; + + print "AHex values are: ", join(", ", prop_values("AHex")), + "\n"; + prints: + AHex values are: N, Y + +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just C<true> or C<false>; and there +are only a few dozen possible General Categories. Use C<prop_values> +to find out if a given property is one such, and if so, to get a list of the +values: + + print join ", ", prop_values("NFC_Quick_Check"); + prints: + M, N, Y + +If the property doesn't have such a restricted set, C<undef> is returned. + +There are usually several synonyms for each possible value. Use +L</prop_value_aliases()> to access those. + +Case, white space, hyphens, and underscores are ignored in the input property +name (except for the trailing underscore in the old-form grandfathered-in +general category property value C<"L_">, which is better written as C<"LC">). + +If the property name is unknown, C<undef> is returned. Note that Perl typically +recognizes property names in regular expressions with an optional C<"Is_"> +(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. +This function does not recognize those in the property parameter, returning +C<undef>. + +For the block property, new-style block names are returned (see +L</Old-style versus new-style block names>).
+ +C<prop_values> does not know about any user-defined properties, and +will return C<undef> if called with one of those. + +=cut + +# These hashes are created by mktables for this module and stored in +# unicore/UCD.pl, where their structures are described. +our %loose_to_standard_value; +our %prop_value_aliases; + +sub prop_values ($) { + my $prop = shift; + return undef unless defined $prop; + + require "unicore/UCD.pl"; + require "utf8_heavy.pl"; + + # Find the property name synonym that's used as the key in other hashes, + # which is element 0 in the list returned by prop_aliases(); a false result means there is nothing to look up. + ($prop) = prop_aliases($prop); + return undef if ! $prop; + $prop = utf8::_loose_name(lc $prop); + + # Here is a legal property, but only one with an entry in %prop_value_aliases has values to list; they are returned as element 0 of each entry, ordered case-insensitively by key. + return undef unless exists $prop_value_aliases{$prop}; + my @return; + foreach my $value_key (sort { lc $a cmp lc $b } + keys %{$prop_value_aliases{$prop}}) + { + push @return, $prop_value_aliases{$prop}{$value_key}[0]; + } + return @return; +} + +=pod + =head2 B<prop_value_aliases()> use Unicode::UCD 'prop_value_aliases'; @@ -1877,7 +1961,7 @@ sub prop_aliases ($) { print "The short name is $short_name\n"; print "The other aliases are: ", join(", ", @other_names), "\n"; - prints: + prints: The full name is Punctuation The short name is P The other aliases are: Punct @@ -1886,18 +1970,20 @@ Some Unicode properties have a restricted set of legal values. For example, all binary properties are restricted to just C<true> or C<false>; and there are only a few dozen possible General Categories. -For such properties, there are usually several synonyms for each possible -value. For example, in binary properties, I<True> can be represented by any of -the strings "Y", "Yes", "T", or "True"; and the General Category -"Punctuation" by that string, or "Punct", or simply "P". +You can use L</prop_values()> to find out if a given property is one which has +a restricted set of values, and if so, what those values are. But usually +each value actually has several synonyms.
For example, in binary properties, +I<True> can be represented by any of the strings "Y", "Yes", "T", or "True"; +and the General Category "Punctuation" by that string, or "Punct", or simply +"P". Like property names, there is typically at least a short name for each such -property-value, and a long name. If you know any name of the property-value, -you can use C<prop_value_aliases>() to get the long name (when called in -scalar context), or a list of all the names, with the short name in the 0th -element, the long name in the next element, and any other synonyms in the -remaining elements, in no particular order, except that any all-numeric -synonyms will be last. +property-value, and a long name. If you know any name of the property-value +(which you can get by L</prop_values()>), you can use C<prop_value_aliases>() +to get the long name (when called in scalar context), or a list of all the +names, with the short name in the 0th element, the long name in the next +element, and any other synonyms in the remaining elements, in no particular +order, except that any all-numeric synonyms will be last. The long name is returned in a form nicely capitalized, suitable for printing. @@ -1926,11 +2012,6 @@ will return C<undef> if called with one of those. =cut -# These are created by mktables for this routine and stored in unicore/UCD.pl -# where their structures are described. -our %loose_to_standard_value; -our %prop_value_aliases; - sub prop_value_aliases ($$) { my ($prop, $value) = @_; return unless defined $prop && defined $value; -- 2.49.1