From 955a0bbad8086963309bc81229e31b50c9e83cc3 Mon Sep 17 00:00:00 2001 From: Eddie Kohler Date: Sun, 22 Oct 2023 21:11:48 -0400 Subject: [PATCH] Document Fmt and update accordingly. * Ftext conversion to HTML must quote single-quotes, which requires an argument in older PHPs. * Add `lang` requirement. --- devel/manual/fmt.md | 308 ++++++++++++++++++++++++++++++++++++++++++++ lib/fmt.php | 92 +++++++------ lib/ftext.php | 4 +- src/conference.php | 11 +- test/t_fmt.php | 15 ++- 5 files changed, 382 insertions(+), 48 deletions(-) create mode 100644 devel/manual/fmt.md diff --git a/devel/manual/fmt.md b/devel/manual/fmt.md new file mode 100644 index 000000000..56ed32fe5 --- /dev/null +++ b/devel/manual/fmt.md @@ -0,0 +1,308 @@ +# Message formatting in HotCRP + +## Markup types + +HotCRP understands several types of markup. These are defined so far. (Note +that open-source HotCRP ships without Markdown support.) + +| Markup type | Description | +|-------------|:----------------------------------| +| 0 | Plain text | +| 1 | Markdown (no HTML allowed) | +| 3 | Markdown (HTML allowed) | +| 5 | HTML | + +The `Ftext` class can convert between some formats. + + +## Ftext + +An **ftext**, short for “formatted text,” is a string that includes its markup +type as a prefix. Ftexts are used for many HotCRP messages, and some HotCRP +subsystems, such as error messages, require ftexts. + +An ftext is written `<MARKUPTYPE>STRING`, where `MARKUPTYPE` is a non-negative +integer. The most common `MARKUPTYPE`s are `0` (plain text) and `5` (HTML). + + +## Translation overview + +HotCRP messages are rendered using a JSON **translation database**. +Translations can change message text based on context, database settings, and +arguments, and could be used for internationalization. + +A translation request comprises a **string**, an optional **context** (a +slash-separated string), and optional **arguments**, which can be named or +positional.
The arguments can help determine the chosen translation string, +and can be interpolated into the translation result as **replacement fields**. + +Here is an example translation request: + +```php +$conf->_("Hello, {names:list}!", new FmtArg("names", ["Alice", "Joan"])); +``` + +In this request, the string is `Hello, {names:list}!`; the context is empty +(the `_` translation function assumes an empty context); and the single +argument is the list `["Alice", "Joan"]`. + +In the absence of a translation database, this request will resolve to: + +``` +Hello, Alice and Joan! +``` + +with the `names` argument interpolated as a list. + +Related requests would resolve as follows: + +```php +$conf->_("Hello, {names:list}!", new FmtArg("names", ["Gesine"])) + === "Hello, Gesine!"; +$conf->_("Hello, {names:list}!", new FmtArg("names", [])) + === "Hello, !"; +$conf->_("Hello, {names:list}!", new FmtArg("names", range(1, 15))) + === "Hello, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, and 15!"; +``` + +A translation database can set up substitute texts for particular arguments or +database settings.
For example, this database defines special-case messages +when there are no names, or more than five: + +```json +[ + {"in": "Hello, {names:list}!", "out": "Hello!", "require": ["!#{names}"]}, + {"in": "Hello, {names:list}!", "out": "Hello, all!", "require": ["#{names}>5"]} +] +``` + +With that database, our requests would resolve as: + + +```php +$conf->_("Hello, {names:list}!", new FmtArg("names", ["Alice", "Joan"])) + === "Hello, Alice and Joan!"; // using default translation +$conf->_("Hello, {names:list}!", new FmtArg("names", ["Gesine"])) + === "Hello, Gesine!"; // using default translation +$conf->_("Hello, {names:list}!", new FmtArg("names", [])) + === "Hello!"; // using first database record +$conf->_("Hello, {names:list}!", new FmtArg("names", range(1, 15))) + === "Hello, all!"; // using second database record +``` + +## Translation records + +A translation record is an object with these properties: + +* `in`: The input string. +* `out`: The output string (i.e., the translation). If not specified, it + defaults to the value of `in`. +* `context`: (Optional) The record’s context. +* `require`: (Optional) A list of requirements that must hold for the + translation to match. A requirement is specified as a string with format + defined below. +* `priority`: (Optional) The priority of this translation. Priority defaults + to 0, except that messages in the default `etc/msgs.json` database have + default priority -1. +* `template`: (Optional) If `true`, indicates that this translation should be + made accessible as a replacement field from other translations. +* `expand`: (Optional) Determines how replacement fields are interpolated into + this translation. Defaults to `"full"`, which expands arguments and + templates. Other possibilities are `"template"`, which only expands + templates, and `"none"`, which uses the translation verbatim. + +A translation database is simply a JSON array of translation objects. 
+ +Shorthands are available to define translations more parsimoniously: + +* A set of related translations may be defined using a single **parent** object + whose `m` property defines an array of **child** translation records. HotCRP + uses the child records, but each child inherits properties from its parent + by default. + +* A translation without `template` or `expand` properties can be defined using + array shorthand. Specifically: + + | Object definition | Array shorthand | + |:----------------------------------------------------|:-------------------------| + | `{"in": "IN", "out": "OUT"}` | `["IN", "OUT"]` | + | `{"context": "CTX", "in": "IN", "out": "OUT"}` | `["CTX", "IN", "OUT"]` | + | `{"in": "IN", "out": "OUT", "priority": 2}` | `["IN", "OUT", 2]` | + | `{"in": "IN", "out": "OUT", "require": ["REQ"]}` | `["IN", "OUT", ["REQ"]]` | + | `{"in": "STR", "out": "STR"}` | `["STR"]` | + + (The `priority` and `require` properties can occur anywhere in the array, + not just at the end.) + +* An identity translation with no requirements and default priority can be + written as just the input string. + +For example, this set of related translations: + +```json +[ + {"in": "Hello, {names:list}!", "out": "Hello, {names:list}!"}, + {"in": "Hello, {names:list}!", "out": "Hello!", "require": ["!#{names}"]}, + {"in": "Hello, {names:list}!", "out": "Hello, all!", "require": ["#{names}>5"]}, + {"in": "Hello, {names:list}!", "out": "Bonjour mes enfants !", "require": ["lang=fr"], "priority": 1} +] +``` + +can be expressed more concisely as + +```json +[ + {"in": "Hello, {names:list}!", "m": [ + "Hello, {names:list}!", + ["Hello!", ["!#{names}"]], + ["Hello, all!", ["#{names}>5"]], + ["Bonjour mes enfants !", 1, ["lang=fr"]] + ]} +] +``` + +(Note that in nested array shorthand, the `in` string need not be specified, +since it is inherited from the parent.) + +There are some restrictions on nested translations.
A nested translation’s +`context` must be more specific than its parent’s, and when a parent +translation defines an input string, its child translations must have the +same input string. + + +## Translation search + +To find the best translation for a request: + +1. HotCRP first scans the database for records with matching string, context, + and requirements. + + A translation record’s *input string* matches if it is + character-for-character identical with the requested string. + + *Contexts* can distinguish strings that might be translated differently + based on where in the UI they appear. A context is a slash-separated + string. A record’s context matches if it is a prefix of the requested + context. For example, the context `"paper"` would match requested contexts + `"paper"` and `"paper/edit"`, but not `"paperedit"` (because components + between slashes must match exactly). A translation with empty context + matches all requested contexts. + + A record’s *requirements* match if each of them evaluates to true. + +2. Of the matching translation records, HotCRP selects the ones with the + maximum *priority* (a number that allows translations to override one + another regardless of context or requirements). + +3. Of those, HotCRP selects the records with the maximum *context length*. + +4. Of those, HotCRP selects the records with the maximum *number of + requirements* (so a translation with more requirements will beat a + translation with fewer). + +5. And of those, HotCRP selects the record that was defined last. + +The search yields the resulting record’s output string, if any records +matched, or a copy of the input string, if none matched. + + +## Requirement minilanguage + +Requirements can check arguments or certain configuration properties and can +perform simple comparisons. A requirement should have one of these formats: + +* `V`: Check whether value `V` is truthy (not null, empty array, or empty + string). +* `!V`: Check whether `V` is falsy.
+* `V=CV`: Check whether two scalar values are equal, considered as strings. +* `V!=CV`: Check whether two scalar values are not equal, considered as + strings. +* `V<CV`, `V>CV`, `V<=CV`, `V>=CV`: Compare numeric values. +* `V^=CV`: Check whether string `CV` is a prefix of `V`. +* `V!^=CV`: Check whether string `CV` is not a prefix of `V`. + +The first value `V` can be: + +* A parameter definition enclosed in braces, such as `{value}`. +* An array count, such as `#{names}`. This evaluates to the number of elements + in array parameter `{names}`. +* A database setting, such as `setting.sub_blind`. +* A configuration option, such as `opt.sendEmail`. +* `lang`, which expands to a language code. + +The second, comparand value `CV` can be: + +* A parameter definition enclosed in braces. +* An array count. +* A literal string. + + +## Expansion + +By default, HotCRP interpolates replacement fields into translated strings. +Interpolated fields are specified using curly braces `{}`, as in Python `str.format` +or C++ `std::format`. To include a literal curly brace, especially if it would +otherwise be mistaken for a replacement field, double it: `{{` is interpolated +as `{`. + +A replacement field consists of an optional argument ID, optionally followed +by a colon and a **format specification** defining how the replacement should be +parsed. + +An argument ID can be a nonnegative number, which specifies a positional +argument starting from 0, or a name. Fields with missing argument IDs are +assigned the positional arguments in order. A string should not use both +numeric argument IDs and missing argument IDs; don’t say, for example, `The +{0} score is {}`. + +Named arguments are generally provided in PHP code using `FmtArg`; the names +available in a translation depend on the code that requests that translation. +However, a name can also refer to a template message from the database, such +as `{conflictdef}` (the definition of conflict of interest).
Only +specifically-marked translations may be included as templates. + +An argument is usually a string, boolean, or number, but it may also be an +array. Use square brackets to refer to a specific element of an array, as in +`{0[foo]}` or `{names[1]}`. + +Arguments with known formats, such as ftexts, are translated to match the +expected format before being interpolated. For example, given these templates: + +```json +[ + {"in": "company1", "out": "<0>Fortnum & Mason", "template": true}, + {"in": "company2", "out": "<5>Sanford &amp; Sons", "template": true} +] +``` + +HotCRP would translate: + +```php +$conf->_("<0>{company1} and {company2}") + === "<0>Fortnum & Mason and Sanford & Sons"; +$conf->_("<5>{company1} and {company2}") + === "<5>Fortnum &amp; Mason and Sanford &amp; Sons"; +``` + +## Format specifications + +HotCRP understands the following format specifications. + +| Format specification | Result | +|:--------------------------|:-----------------------------------------------------| +| `:url` | The string argument is urlencoded. | +| `:html` | The string argument is HTML-encoded; i.e., `&<>"'` are replaced by HTML entities. | +| `:ftext` | When possible, the string argument is incorporated as an ftext, rather than having its format translated or stripped. | +| `:humanize_url` | If the argument string is a simple url, such as `https://hotcrp.com/privacy`, it is replaced by a shorter version, such as `hotcrp.com/privacy`. | +| `:.2f`, etc. | The numeric argument is rendered using a printf-style specification. | +| `:time` | The integer argument is treated as a number of seconds since the Unix epoch, and printed as a long-format time. | +| `:expandedtime` | The integer argument is treated as a number of seconds since the Unix epoch, and printed as an expanded long-format time (including the time in the browser’s time zone). | +| `:list` | The array argument is incorporated as a comma-separated list.
| +| `:nblist` | The array argument is incorporated as a comma-separated list; when formatting to HTML, the elements of the list will not be broken across lines. | +| `:lcrestlist` | The array argument is incorporated as a comma-separated list; all but the first element of the list are lower-cased. | +| `:numlist` | The argument, which should be a list of numbers, is incorporated as a list of numeric ranges; for example, `[1, 2, 3, 4, 5, 6]` is incorporated as `1-6`. | + +The `expand` property defines how HotCRP interpolates a given message. If +`expand` is `"none"`, then no interpolation is performed. If `expand` is +`"template"`, then *only* templates are interpolated, and furthermore double +braces like `{{` are included verbatim. diff --git a/lib/fmt.php b/lib/fmt.php index 01dae6431..d21e5bc76 100644 --- a/lib/fmt.php +++ b/lib/fmt.php @@ -62,15 +62,15 @@ static function make_template($out, $expand = 1) { } /** @param string $context + * @param Fmt $fmt * @return $this */ - function add_context($context) { - if (($context ?? "") === "") { - /* skip */ - } else if ($this->context !== null) { - $this->context = "{$this->context}/{$context}"; - } else { - $this->context = $context; + function set_context($context, $fmt) { + if ($fmt->_check > 0 + && $this->context !== null + && !self::context_starts_with($context, $this->context)) { + error_log("nested translation has unexpected context `{$context}`"); } + $this->context = ($context !== "" ? $context : null); return $this; } @@ -86,56 +86,59 @@ function add_require($req) { } + /** @param ?string $c1 + * @param ?string $c2 + * @return bool */ + static function context_starts_with($c1, $c2) { + $l1 = strlen($c1 ?? ""); + $l2 = strlen($c2 ?? 
""); + return $l1 >= $l2 + && ($l2 === 0 + || (str_starts_with($c1, $c2) + && ($l1 === $l2 || $c1[$l2] === "/"))); + } + /** @param list $args * @return int|false */ function check_require(Fmt $ms, $args) { if (!$this->require) { return 0; } - $nreq = 0; - $compval = null; - '@phan-var-force ?string $compval'; foreach ($this->require as $req) { + $ok = false; if (preg_match('/\A\s*(!*)\s*(\S+?)\s*(\z|[=!<>]=?|≠|≤|≥|!?\^=)\s*(\S*)\s*\z/', $req, $m) && ($m[1] === "" || ($m[3] === "" && $m[4] === "")) - && ($m[3] === "") === ($m[4] === "")) { - if (!$ms->test_requirement($m[2], $args, $val)) { - return false; - } + && ($m[3] === "") === ($m[4] === "") + && $ms->test_requirement($m[2], $args, $val)) { $compar = $m[3]; - $compval = $m[4]; - if ($m[4] !== "" - && ($m[4][0] === "\$" || $m[4][0] === "{") - && !$ms->test_requirement($m[4], $args, $compval)) { + $cv = $compval = $m[4]; + if ($cv !== "" + && ($cv[0] === "\$" /* XXX */ || $cv[0] === "{" || str_starts_with($cv, "#{")) + && !$ms->test_requirement($cv, $args, $compval)) { return false; } if ($compar === "") { $bval = (bool) $val && $val !== "0"; - $weight = $bval === (strlen($m[1]) % 2 === 0) ? 1 : 0; + $ok = $bval === (strlen($m[1]) % 2 === 0); } else if (!is_scalar($val)) { - $weight = 0; + // skip } else if ($compar === "^=") { - $weight = str_starts_with($val, $compval) ? 0.9 : 0; + $ok = str_starts_with($val, $compval); } else if ($compar === "!^=") { - $weight = !str_starts_with($val, $compval) ? 0.9 : 0; + $ok = !str_starts_with($val, $compval); } else if (is_numeric($compval)) { - $weight = CountMatcher::compare((float) $val, $compar, (float) $compval) ? 1 : 0; + $ok = CountMatcher::compare((float) $val, $compar, (float) $compval); } else if ($compar === "=" || $compar === "==") { - $weight = (string) $val === (string) $compval ? 1 : 0; + $ok = (string) $val === (string) $compval; } else if ($compar === "!=" || $compar === "≠") { - $weight = (string) $val === (string) $compval ? 
0 : 1; - } else { - $weight = 0; + $ok = (string) $val !== (string) $compval; } - if ($weight === 0) { - return false; - } - $nreq += $weight; - } else { + } + if (!$ok) { return false; } } - return $nreq; + return count($this->require); } } @@ -251,7 +254,7 @@ function apply_fmtspec($fspec, $vformat, $value) { if ($fspec === ":url") { return [null, urlencode((string) $value)]; } else if ($fspec === ":html") { // unneeded if FmtArg has correct format - return [null, htmlspecialchars((string) $value)]; + return [null, htmlspecialchars((string) $value, ENT_QUOTES)]; } else if ($fspec === ":humanize_url") { if (preg_match('/\Ahttps?:\/\/([^\[\]:\/?#\s]*)([\/?#]\S*|)\z/i', (string) $value, $mm)) { $value = $mm[1] . ($mm[2] === "/" ? "" : $mm[2]); @@ -283,7 +286,9 @@ class Fmt { /** @var array */ private $ims = []; /** @var list */ - private $require_resolvers = []; + private $_require_resolvers = []; + /** @var int */ + public $_check = 0; /** @var ?string */ private $_default_in; /** @var FmtItem */ @@ -314,7 +319,7 @@ function clear_default_priority() { private function _addj_object($m) { $im = clone $this->_default_item; if (isset($m->context) && is_string($m->context) && $m->context !== "") { - $im->add_context($m->context); + $im->set_context($m->context, $this); } if (isset($m->priority) && (is_float($m->priority) || is_int($m->priority))) { $im->priority = (float) $m->priority; @@ -385,7 +390,7 @@ private function _addj_list($m) { if ($context !== null) { return false; } else if ($in !== null) { - $im->add_context(($context = $in)); + $im->set_context(($context = $in), $this); } $in = $out; } @@ -439,6 +444,11 @@ function define($in, $out) { } else { $im = $out; } + if ($this->_check > 0 + && $im->template + && !preg_match('/\A[A-Za-z_]\w+\z/', $in)) { + error_log("bad template name {$in}"); + } $im->next = $this->ims[$in] ?? 
null; $this->ims[$in] = $im; return $this; @@ -482,7 +492,7 @@ function remove_overrides() { /** @param callable(string):(false|array{true,mixed}) $function */ function add_requirement_resolver($function) { - $this->require_resolvers[] = $function; + $this->_require_resolvers[] = $function; } /** @param ?string $context @@ -550,9 +560,7 @@ private function find($context, $in, $args, $priobound, $source) { // check context match $ctxpfx = strlen($im->context ?? ""); if ($ctxpfx > $ctxlen - || ($ctxpfx > 0 - && (strncmp($context, $im->context, $ctxpfx) !== 0 - || ($ctxpfx < $ctxlen && $context[$ctxpfx] !== "/"))) + || ($ctxpfx > 0 && !FmtItem::context_starts_with($context, $im->context)) || ($match && $im->priority == $match->priority && $ctxpfx < $matchctxlen)) { continue; } @@ -595,7 +603,7 @@ static function find_arg($args, $argdef) { /** @param string $s */ private function resolve_requirement($s) { - foreach ($this->require_resolvers as $fn) { + foreach ($this->_require_resolvers as $fn) { if (($v = call_user_func($fn, $s))) return $v[1]; } diff --git a/lib/ftext.php b/lib/ftext.php index bae3c2d86..06849c904 100644 --- a/lib/ftext.php +++ b/lib/ftext.php @@ -104,7 +104,7 @@ function ($m) { } }, $s), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, "UTF-8"); } else if ($from_format !== 5 && $to_format === 5) { - return htmlspecialchars($s); + return htmlspecialchars($s, ENT_QUOTES); } else { return $s; } @@ -146,7 +146,7 @@ static function concat(...$ftexts) { $ts = []; foreach ($parses as $parse) { if ($parse[0] !== 5 && $format === 5) { - $ts[] = htmlspecialchars($parse[1]); + $ts[] = htmlspecialchars($parse[1], ENT_QUOTES); } else { $ts[] = $parse[1]; } diff --git a/src/conference.php b/src/conference.php index ea0c86ec6..5e3f9a04d 100644 --- a/src/conference.php +++ b/src/conference.php @@ -23,6 +23,8 @@ class Conf { public $opt; /** @var array */ public $opt_override; + /** @var string */ + public $lang = "en"; /** @var ?int */ private $_opt_timestamp; @@ -590,6 
+592,7 @@ function refresh_options() { } $this->short_name = $this->opt["shortName"]; $this->long_name = $this->opt["longName"]; + $this->lang = $this->opt["lang"] ?? "en"; // set submission nouns if (isset($this->opt["submissionNouns"]) @@ -4258,7 +4261,7 @@ function print_head_tag($qreq, $title, $extra = []) { $qreq->set_cookie($k, "", Conf::$now - 86400); } - echo "\n\n\n", + echo "\nlang}\">\n\n", "\n"; // gather stylesheets @@ -5007,8 +5010,8 @@ function fmt() { $this->_fmt = new Fmt($this); $this->_fmt->add_requirement_resolver([$this, "resolve_fmt_requirement"]); $m = ["?etc/msgs.json"]; - if (($lang = $this->opt("lang"))) { - $m[] = "?etc/msgs.{$lang}.json"; + if ($this->lang !== "en") { + $m[] = "?etc/msgs.{$this->lang}.json"; } $this->_fmt->set_default_priority(-1.0); expand_json_includes_callback($m, [$this->_fmt, "addj"]); @@ -5088,6 +5091,8 @@ function resolve_fmt_requirement($s) { return [true, $this->setting(substr($s, 8))]; } else if (str_starts_with($s, "opt.")) { return [true, $this->opt(substr($s, 4))]; + } else if ($s === "lang") { + return [true, $this->lang]; } else { return false; } diff --git a/test/t_fmt.php b/test/t_fmt.php index 47d667be2..d4a72375a 100644 --- a/test/t_fmt.php +++ b/test/t_fmt.php @@ -12,7 +12,7 @@ function test_1() { $ms->addj(["ax", "b"]); $ms->addj(["bx", "a", 2]); $ms->addj(["bx", "b"]); - $ms->addj(["fart", "fart example A", ["{0}=bob"]]); + $ms->addj(["fart", "fart example A", ["{0}=bob", "{0}=bob"]]); $ms->addj(["fart", "fart example B", ["{0}^=bob"]]); $ms->addj(["fart", "fart example C"]); $ms->addj(["in" => "fox-saying", "out" => "What the fox said"]); @@ -169,4 +169,17 @@ function test_example() { xassert_eqq($ms->_("Hello", new FmtArg("name", ["Bob", "Jane"])), "Hello, Bob and Jane"); xassert_eqq($ms->_("Hello", new FmtArg("name", ["Bob", "Jane", "Fred"])), "Hello, all"); } + + function test_context_starts_with() { + xassert(FmtItem::context_starts_with(null, null)); + xassert(FmtItem::context_starts_with("", 
null)); + xassert(FmtItem::context_starts_with(null, "")); + xassert(!FmtItem::context_starts_with(null, "a")); + xassert(FmtItem::context_starts_with("a", null)); + xassert(FmtItem::context_starts_with("a", "a")); + xassert(!FmtItem::context_starts_with("a", "ab")); + xassert(FmtItem::context_starts_with("a/b", "a")); + xassert(!FmtItem::context_starts_with("a/b", "ab")); + xassert(!FmtItem::context_starts_with("a", "a/b")); + } }