From 32fc3719e06899d43e2298ad6d0028efe5ec3024 Mon Sep 17 00:00:00 2001 From: Eduard-Mihai Burtescu Date: Sat, 16 Nov 2019 16:32:50 +0100 Subject: [PATCH] [PATCH] Refactor rust-demangle to be independent of C++ demangling. * demangle.h (rust_demangle_callback): Add. * cplus-dem.c (cplus_demangle): Use rust_demangle directly. (rust_demangle): Remove. * rust-demangle.c (is_prefixed_hash): Rename to is_legacy_prefixed_hash. (parse_lower_hex_nibble): Rename to decode_lower_hex_nibble. (parse_legacy_escape): Rename to decode_legacy_escape. (rust_is_mangled): Remove. (struct rust_demangler): Add. (peek): Add. (next): Add. (struct rust_mangled_ident): Add. (parse_ident): Add. (rust_demangle_sym): Remove. (print_str): Add. (PRINT): Add. (print_ident): Add. (rust_demangle_callback): Add. (struct str_buf): Add. (str_buf_reserve): Add. (str_buf_append): Add. (str_buf_demangle_callback): Add. (rust_demangle): Add. * rust-demangle.h: Remove. From-SVN: r278358 --- include/ChangeLog | 4 + include/demangle.h | 5 + libiberty/ChangeLog | 25 ++ libiberty/cplus-dem.c | 51 +--- libiberty/rust-demangle.c | 572 ++++++++++++++++++++++++++------------ libiberty/rust-demangle.h | 45 --- 6 files changed, 436 insertions(+), 266 deletions(-) delete mode 100644 libiberty/rust-demangle.h diff --git a/include/ChangeLog b/include/ChangeLog index 364fa72d457b0..5e18fa3d5a619 100644 --- a/include/ChangeLog +++ b/include/ChangeLog @@ -1,3 +1,7 @@ +2019-10-22 Eduard-Mihai Burtescu + + * demangle.h (rust_demangle_callback): Add. 
+ 2019-11-13 Andrew Stubbs Kwok Cheung Yeung Julian Brown diff --git a/include/demangle.h b/include/demangle.h index 06c32571d5ce0..ce7235d13f387 100644 --- a/include/demangle.h +++ b/include/demangle.h @@ -159,6 +159,11 @@ ada_demangle (const char *mangled, int options); extern char * dlang_demangle (const char *mangled, int options); +extern int +rust_demangle_callback (const char *mangled, int options, + demangle_callbackref callback, void *opaque); + + extern char * rust_demangle (const char *mangled, int options); diff --git a/libiberty/ChangeLog b/libiberty/ChangeLog index 14e7f5af335e5..db5cf3dd32334 100644 --- a/libiberty/ChangeLog +++ b/libiberty/ChangeLog @@ -1,3 +1,28 @@ +2019-10-22 Eduard-Mihai Burtescu + + * cplus-dem.c (cplus_demangle): Use rust_demangle directly. + (rust_demangle): Remove. + * rust-demangle.c (is_prefixed_hash): Rename to is_legacy_prefixed_hash. + (parse_lower_hex_nibble): Rename to decode_lower_hex_nibble. + (parse_legacy_escape): Rename to decode_legacy_escape. + (rust_is_mangled): Remove. + (struct rust_demangler): Add. + (peek): Add. + (next): Add. + (struct rust_mangled_ident): Add. + (parse_ident): Add. + (rust_demangle_sym): Remove. + (print_str): Add. + (PRINT): Add. + (print_ident): Add. + (rust_demangle_callback): Add. + (struct str_buf): Add. + (str_buf_reserve): Add. + (str_buf_append): Add. + (str_buf_demangle_callback): Add. + (rust_demangle): Add. + * rust-demangle.h: Remove. + 2019-11-15 Miguel Saldivar * testsuite/demangle-expected: Fix test. 
diff --git a/libiberty/cplus-dem.c b/libiberty/cplus-dem.c index a39e2bf2ed46e..735a61d7a824e 100644 --- a/libiberty/cplus-dem.c +++ b/libiberty/cplus-dem.c @@ -52,7 +52,6 @@ void * realloc (); #define CURRENT_DEMANGLING_STYLE options #include "libiberty.h" -#include "rust-demangle.h" enum demangling_styles current_demangling_style = auto_demangling; @@ -160,27 +159,20 @@ cplus_demangle (const char *mangled, int options) if ((options & DMGL_STYLE_MASK) == 0) options |= (int) current_demangling_style & DMGL_STYLE_MASK; + /* The Rust demangling is implemented elsewhere. + Legacy Rust symbols overlap with GNU_V3, so try Rust first. */ + if (RUST_DEMANGLING || AUTO_DEMANGLING) + { + ret = rust_demangle (mangled, options); + if (ret || RUST_DEMANGLING) + return ret; + } + /* The V3 ABI demangling is implemented elsewhere. */ - if (GNU_V3_DEMANGLING || RUST_DEMANGLING || AUTO_DEMANGLING) + if (GNU_V3_DEMANGLING || AUTO_DEMANGLING) { ret = cplus_demangle_v3 (mangled, options); - if (GNU_V3_DEMANGLING) - return ret; - - if (ret) - { - /* Rust symbols are GNU_V3 mangled plus some extra subtitutions. - The subtitutions are always smaller, so do in place changes. */ - if (rust_is_mangled (ret)) - rust_demangle_sym (ret); - else if (RUST_DEMANGLING) - { - free (ret); - ret = NULL; - } - } - - if (ret || RUST_DEMANGLING) + if (ret || GNU_V3_DEMANGLING) return ret; } @@ -204,27 +196,6 @@ cplus_demangle (const char *mangled, int options) return (ret); } -char * -rust_demangle (const char *mangled, int options) -{ - /* Rust symbols are GNU_V3 mangled plus some extra subtitutions. */ - char *ret = cplus_demangle_v3 (mangled, options); - - /* The Rust subtitutions are always smaller, so do in place changes. */ - if (ret != NULL) - { - if (rust_is_mangled (ret)) - rust_demangle_sym (ret); - else - { - free (ret); - ret = NULL; - } - } - - return ret; -} - /* Demangle ada names. The encoding is documented in gcc/ada/exp_dbug.ads. 
*/ char * diff --git a/libiberty/rust-demangle.c b/libiberty/rust-demangle.c index 6b62e6dbd806f..fa9d4724010af 100644 --- a/libiberty/rust-demangle.c +++ b/libiberty/rust-demangle.c @@ -33,9 +33,11 @@ If not, see <http://www.gnu.org/licenses/>. */ #include "safe-ctype.h" +#include <inttypes.h> #include <sys/types.h> #include <string.h> #include <stdio.h> +#include <stdlib.h> #ifdef HAVE_STRING_H #include <string.h> @@ -47,207 +49,110 @@ extern void *memset(void *s, int c, size_t n); #include <demangle.h> #include "libiberty.h" -#include "rust-demangle.h" +struct rust_demangler +{ + const char *sym; + size_t sym_len; -/* Mangled (legacy) Rust symbols look like this: - _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a - - The original symbol is: - <std::sys::fd::FileDesc as core::ops::Drop>::drop - - The last component of the path is a 64-bit hash in lowercase hex, - prefixed with "h". Rust does not have a global namespace between - crates, an illusion which Rust maintains by using the hash to - distinguish things that would otherwise have the same symbol. - - Any path component not starting with a XID_Start character is - prefixed with "_". - - The following escape sequences are used: - - "," => $C$ - "@" => $SP$ - "*" => $BP$ - "&" => $RF$ - "<" => $LT$ - ">" => $GT$ - "(" => $LP$ - ")" => $RP$ - "\u{XY}" => $uXY$ - - A double ".." means "::" and a single "." means "-". - - The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ */ - -static const char *hash_prefix = "::h"; -static const size_t hash_prefix_len = 3; -static const size_t hash_len = 16; - -static int is_prefixed_hash (const char *start); -static int parse_lower_hex_nibble (char nibble); -static char parse_legacy_escape (const char **in); + void *callback_opaque; + demangle_callbackref callback; -/* INPUT: sym: symbol that has been through C++ (gnu v3) demangling + /* Position of the next character to read from the symbol. */ + size_t next; - This function looks for the following indicators: + /* Non-zero if any error occurred. */ + int errored; - 1. 
The hash must consist of "h" followed by 16 lowercase hex digits. + /* Non-zero if printing should be verbose (e.g. include hashes). */ + int verbose; - 2. As a sanity check, the hash must use between 5 and 15 of the 16 - possible hex digits. This is true of 99.9998% of hashes so once - in your life you may see a false negative. The point is to - notice path components that could be Rust hashes but are - probably not, like "haaaaaaaaaaaaaaaa". In this case a false - positive (non-Rust symbol has an important path component - removed because it looks like a Rust hash) is worse than a false - negative (the rare Rust symbol is not demangled) so this sets - the balance in favor of false negatives. + /* Rust mangling version, with legacy mangling being -1. */ + int version; +}; - 3. There must be no characters other than a-zA-Z0-9 and _.:$ */ +/* Parsing functions. */ -int -rust_is_mangled (const char *sym) +static char +peek (const struct rust_demangler *rdm) { - size_t len, len_without_hash; - const char *end; + if (rdm->next < rdm->sym_len) + return rdm->sym[rdm->next]; + return 0; +} - if (!sym) - return 0; +static char +next (struct rust_demangler *rdm) +{ + char c = peek (rdm); + if (!c) + rdm->errored = 1; + else + rdm->next++; + return c; +} - len = strlen (sym); - if (len <= hash_prefix_len + hash_len) - /* Not long enough to contain "::h" + hash + something else */ - return 0; +struct rust_mangled_ident +{ + /* ASCII part of the identifier. */ + const char *ascii; + size_t ascii_len; +}; - len_without_hash = len - (hash_prefix_len + hash_len); - if (!is_prefixed_hash (sym + len_without_hash)) - return 0; +static struct rust_mangled_ident +parse_ident (struct rust_demangler *rdm) +{ + char c; + size_t start, len; + struct rust_mangled_ident ident; - end = sym + len_without_hash; + ident.ascii = NULL; + ident.ascii_len = 0; - while (sym < end) + c = next (rdm); + if (!ISDIGIT (c)) { - if (*sym == '$' || *sym == '.' 
|| *sym == '_' || *sym == ':' - || ISALNUM (*sym)) - sym++; - else - return 0; + rdm->errored = 1; + return ident; } + len = c - '0'; - return 1; -} - -/* A hash is the prefix "::h" followed by 16 lowercase hex digits. The - hex digits must contain at least 5 distinct digits. */ - -static int -is_prefixed_hash (const char *str) -{ - const char *end; - char seen[16]; - size_t i; - int count, nibble; - - if (strncmp (str, hash_prefix, hash_prefix_len)) - return 0; - str += hash_prefix_len; + if (c != '0') + while (ISDIGIT (peek (rdm))) + len = len * 10 + (next (rdm) - '0'); - memset (seen, 0, sizeof(seen)); - for (end = str + hash_len; str < end; str++) + start = rdm->next; + rdm->next += len; + /* Check for overflows. */ + if ((start > rdm->next) || (rdm->next > rdm->sym_len)) { - nibble = parse_lower_hex_nibble (*str); - if (nibble < 0) - return 0; - seen[nibble] = 1; + rdm->errored = 1; + return ident; } - /* Count how many distinct digits seen */ - count = 0; - for (i = 0; i < 16; i++) - if (seen[i]) - count++; + ident.ascii = rdm->sym + start; + ident.ascii_len = len; - return count >= 5; -} + if (ident.ascii_len == 0) + ident.ascii = NULL; -/* - INPUT: sym: symbol for which rust_is_mangled(sym) returned 1. + return ident; +} - The input is demangled in-place because the mangled name is always - longer than the demangled one. */ +/* Printing functions. */ -void -rust_demangle_sym (char *sym) +static void +print_str (struct rust_demangler *rdm, const char *data, size_t len) { - const char *in; - char *out; - const char *end; - char unescaped; - - if (!sym) - return; - - in = sym; - out = sym; - end = sym + strlen (sym) - (hash_prefix_len + hash_len); - - while (in < end) - { - if (*in == '$') - { - unescaped = parse_legacy_escape (&in); - if (unescaped) - *out++ = unescaped; - else - /* unexpected escape sequence, skip the rest of this segment. 
*/ - while (in < end && *in != ':') - *out++ = *in++; - } - else if (*in == '_') - { - /* If this is the start of a path component and the next - character is an escape sequence, ignore the underscore. The - mangler inserts an underscore to make sure the path - component begins with a XID_Start character. */ - if ((in == sym || in[-1] == ':') && in[1] == '$') - in++; - else - *out++ = *in++; - } - else if (*in == '.') - { - if (in[1] == '.') - { - /* ".." becomes "::" */ - *out++ = ':'; - *out++ = ':'; - in += 2; - } - else - { - /* "." becomes "-" */ - *out++ = '-'; - in++; - } - } - else if (*in == ':' || ISALNUM (*in)) - *out++ = *in++; - else - { - /* unexpected character in symbol, not rust_is_mangled. */ - *out++ = '?'; /* This is pretty lame, but it's hard to do better. */ - *out = '\0'; - return; - } - } - - *out = '\0'; + if (!rdm->errored) + rdm->callback (data, len, rdm->callback_opaque); } +#define PRINT(s) print_str (rdm, s, strlen (s)) + /* Return a 0x0-0xf value if the char is 0-9a-f, and -1 otherwise. */ static int -parse_lower_hex_nibble (char nibble) +decode_lower_hex_nibble (char nibble) { if ('0' <= nibble && nibble <= '9') return nibble - '0'; @@ -258,17 +163,17 @@ parse_lower_hex_nibble (char nibble) /* Return the unescaped character for a "$...$" escape, or 0 if invalid. 
*/ static char -parse_legacy_escape (const char **in) +decode_legacy_escape (const char *e, size_t len, size_t *out_len) { char c = 0; - const char *e; size_t escape_len = 0; int lo_nibble = -1, hi_nibble = -1; - if ((*in)[0] != '$') + if (len < 3 || e[0] != '$') return 0; - e = *in + 1; + e++; + len--; if (e[0] == 'C') { @@ -276,7 +181,7 @@ parse_legacy_escape (const char **in) c = ','; } - else + else if (len > 2) { escape_len = 2; @@ -294,14 +199,14 @@ parse_legacy_escape (const char **in) c = '('; else if (e[0] == 'R' && e[1] == 'P') c = ')'; - else if (e[0] == 'u') + else if (e[0] == 'u' && len > 3) { escape_len = 3; - hi_nibble = parse_lower_hex_nibble (e[1]); + hi_nibble = decode_lower_hex_nibble (e[1]); if (hi_nibble < 0) return 0; - lo_nibble = parse_lower_hex_nibble (e[2]); + lo_nibble = decode_lower_hex_nibble (e[2]); if (lo_nibble < 0) return 0; @@ -314,9 +219,314 @@ parse_legacy_escape (const char **in) } } - if (!c || e[escape_len] != '$') + if (!c || len <= escape_len || e[escape_len] != '$') return 0; - *in += 2 + escape_len; + *out_len = 2 + escape_len; return c; } + +static void +print_ident (struct rust_demangler *rdm, struct rust_mangled_ident ident) +{ + char unescaped; + size_t len; + + if (rdm->errored) + return; + + if (rdm->version == -1) + { + /* Ignore leading underscores preceding escape sequences. + The mangler inserts an underscore to make sure the + identifier begins with a XID_Start character. */ + if (ident.ascii_len >= 2 && ident.ascii[0] == '_' + && ident.ascii[1] == '$') + { + ident.ascii++; + ident.ascii_len--; + } + + while (ident.ascii_len > 0) + { + /* Handle legacy escape sequences ("$...$", ".." or "."). */ + if (ident.ascii[0] == '$') + { + unescaped + = decode_legacy_escape (ident.ascii, ident.ascii_len, &len); + if (unescaped) + print_str (rdm, &unescaped, 1); + else + { + /* Unexpected escape sequence, print the rest verbatim. 
*/ + print_str (rdm, ident.ascii, ident.ascii_len); + return; + } + } + else if (ident.ascii[0] == '.') + { + if (ident.ascii_len >= 2 && ident.ascii[1] == '.') + { + /* ".." becomes "::" */ + PRINT ("::"); + len = 2; + } + else + { + /* "." becomes "-" */ + PRINT ("-"); + len = 1; + } + } + else + { + /* Print everything before the next escape sequence, at once. */ + for (len = 0; len < ident.ascii_len; len++) + if (ident.ascii[len] == '$' || ident.ascii[len] == '.') + break; + + print_str (rdm, ident.ascii, len); + } + + ident.ascii += len; + ident.ascii_len -= len; + } + + return; + } +} + +/* A legacy hash is the prefix "h" followed by 16 lowercase hex digits. + The hex digits must contain at least 5 distinct digits. */ +static int +is_legacy_prefixed_hash (struct rust_mangled_ident ident) +{ + uint16_t seen; + int nibble; + size_t i, count; + + if (ident.ascii_len != 17 || ident.ascii[0] != 'h') + return 0; + + seen = 0; + for (i = 0; i < 16; i++) + { + nibble = decode_lower_hex_nibble (ident.ascii[1 + i]); + if (nibble < 0) + return 0; + seen |= (uint16_t)1 << nibble; + } + + /* Count how many distinct digits were seen. */ + count = 0; + while (seen) + { + if (seen & 1) + count++; + seen >>= 1; + } + + return count >= 5; +} + +int +rust_demangle_callback (const char *mangled, int options, + demangle_callbackref callback, void *opaque) +{ + const char *p; + struct rust_demangler rdm; + struct rust_mangled_ident ident; + + rdm.sym = mangled; + rdm.sym_len = 0; + + rdm.callback_opaque = opaque; + rdm.callback = callback; + + rdm.next = 0; + rdm.errored = 0; + rdm.verbose = (options & DMGL_VERBOSE) != 0; + rdm.version = 0; + + /* Rust symbols always start with _ZN (legacy). */ + if (rdm.sym[0] == '_' && rdm.sym[1] == 'Z' && rdm.sym[2] == 'N') + { + rdm.sym += 3; + rdm.version = -1; + } + else + return 0; + + /* Legacy Rust symbols use only [_0-9a-zA-Z.:$] characters. 
*/ + for (p = rdm.sym; *p; p++) + { + rdm.sym_len++; + + if (*p == '_' || ISALNUM (*p)) + continue; + + if (rdm.version == -1 && (*p == '$' || *p == '.' || *p == ':')) + continue; + + return 0; + } + + /* Legacy Rust symbols need to be handled separately. */ + if (rdm.version == -1) + { + /* Legacy Rust symbols always end with E. */ + if (!(rdm.sym_len > 0 && rdm.sym[rdm.sym_len - 1] == 'E')) + return 0; + rdm.sym_len--; + + /* Legacy Rust symbols also always end with a path segment + that encodes a 16 hex digit hash, i.e. '17h[a-f0-9]{16}'. + This early check, before any parse_ident calls, should + quickly filter out most C++ symbols unrelated to Rust. */ + if (!(rdm.sym_len > 19 + && !memcmp (&rdm.sym[rdm.sym_len - 19], "17h", 3))) + return 0; + + do + { + ident = parse_ident (&rdm); + if (rdm.errored || !ident.ascii) + return 0; + } + while (rdm.next < rdm.sym_len); + + /* The last path segment should be the hash. */ + if (!is_legacy_prefixed_hash (ident)) + return 0; + + /* Reset the state for a second pass, to print the symbol. */ + rdm.next = 0; + if (!rdm.verbose && rdm.sym_len > 19) + { + /* Hide the last segment, containing the hash, if not verbose. */ + rdm.sym_len -= 19; + } + + do + { + if (rdm.next > 0) + print_str (&rdm, "::", 2); + + ident = parse_ident (&rdm); + print_ident (&rdm, ident); + } + while (rdm.next < rdm.sym_len); + } + else + return 0; + + return !rdm.errored; +} + +/* Growable string buffers. */ +struct str_buf +{ + char *ptr; + size_t len; + size_t cap; + int errored; +}; + +static void +str_buf_reserve (struct str_buf *buf, size_t extra) +{ + size_t available, min_new_cap, new_cap; + char *new_ptr; + + /* Allocation failed before. */ + if (buf->errored) + return; + + available = buf->cap - buf->len; + + if (extra <= available) + return; + + min_new_cap = buf->cap + (extra - available); + + /* Check for overflows. 
*/ + if (min_new_cap < buf->cap) + { + buf->errored = 1; + return; + } + + new_cap = buf->cap; + + if (new_cap == 0) + new_cap = 4; + + /* Double capacity until sufficiently large. */ + while (new_cap < min_new_cap) + { + new_cap *= 2; + + /* Check for overflows. */ + if (new_cap < buf->cap) + { + buf->errored = 1; + return; + } + } + + new_ptr = (char *)realloc (buf->ptr, new_cap); + if (new_ptr == NULL) + { + free (buf->ptr); + buf->ptr = NULL; + buf->len = 0; + buf->cap = 0; + buf->errored = 1; + } + else + { + buf->ptr = new_ptr; + buf->cap = new_cap; + } +} + +static void +str_buf_append (struct str_buf *buf, const char *data, size_t len) +{ + str_buf_reserve (buf, len); + if (buf->errored) + return; + + memcpy (buf->ptr + buf->len, data, len); + buf->len += len; +} + +static void +str_buf_demangle_callback (const char *data, size_t len, void *opaque) +{ + str_buf_append ((struct str_buf *)opaque, data, len); +} + +char * +rust_demangle (const char *mangled, int options) +{ + struct str_buf out; + int success; + + out.ptr = NULL; + out.len = 0; + out.cap = 0; + out.errored = 0; + + success = rust_demangle_callback (mangled, options, + str_buf_demangle_callback, &out); + + if (!success) + { + free (out.ptr); + return NULL; + } + + str_buf_append (&out, "\0", 1); + return out.ptr; +} diff --git a/libiberty/rust-demangle.h b/libiberty/rust-demangle.h deleted file mode 100644 index abf4c6cde559b..0000000000000 --- a/libiberty/rust-demangle.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Internal demangler interface for the Rust programming language. - Copyright (C) 2016-2019 Free Software Foundation, Inc. - Written by David Tolnay (dtolnay@gmail.com). - -This file is part of the libiberty library. -Libiberty is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public -License as published by the Free Software Foundation; either -version 2 of the License, or (at your option) any later version. 
- -In addition to the permissions in the GNU Library General Public -License, the Free Software Foundation gives you unlimited permission -to link the compiled version of this file into combinations with other -programs, and to distribute those combinations without any restriction -coming from the use of this file. (The Library Public License -restrictions do apply in other respects; for example, they cover -modification of the file, and distribution when not linked into a -combined executable.) - -Libiberty is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with libiberty; see the file COPYING.LIB. -If not, see <http://www.gnu.org/licenses/>. */ - -/* This file provides some definitions shared by cplus-dem.c and - rust-demangle.c. It should not be included by any other files. */ - -/* Returns non-zero iff MANGLED is a rust mangled symbol. MANGLED must - already have been demangled through cplus_demangle_v3. If this function - returns non-zero then MANGLED can be demangled (in-place) using - RUST_DEMANGLE_SYM. */ -extern int -rust_is_mangled (const char *mangled); - -/* Demangles SYM (in-place) if RUST_IS_MANGLED returned non-zero for SYM. - If RUST_IS_MANGLED returned zero for SYM then RUST_DEMANGLE_SYM might - replace characters that cannot be demangled with '?' and might truncate - SYM. After calling RUST_DEMANGLE_SYM SYM might be shorter, but never - larger. */ -extern void -rust_demangle_sym (char *sym);