Loading...
1// SPDX-License-Identifier: GPL-2.0
2#include <string.h>
3#include "util.h"
4#include "debug.h"
5
6#include "demangle-rust.h"
7
/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with a XID_Start character is prefixed with
 * "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */
45
/* Shape of the hash suffix this file looks for: "::h" + 16 hex digits. */
static const char *hash_prefix = "::h";
static const size_t hash_prefix_len = 3;	/* length of "::h" */
static const size_t hash_len = 16;		/* hex digits in the hash */

/* Helpers defined further down in this file. */
static bool is_prefixed_hash(const char *str);
static bool looks_like_rust(const char *str, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
53
54/*
55 * INPUT:
56 * sym: symbol that has been through BFD-demangling
57 *
58 * This function looks for the following indicators:
59 *
60 * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
61 *
62 * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
63 * hex digits. This is true of 99.9998% of hashes so once in your life you
64 * may see a false negative. The point is to notice path components that
65 * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
66 * this case a false positive (non-Rust symbol has an important path
67 * component removed because it looks like a Rust hash) is worse than a
68 * false negative (the rare Rust symbol is not demangled) so this sets the
69 * balance in favor of false negatives.
70 *
71 * 3. There must be no characters other than a-zA-Z0-9 and _.:$
72 *
73 * 4. There must be no unrecognized $-sign sequences.
74 *
75 * 5. There must be no sequence of three or more dots in a row ("...").
76 */
77bool
78rust_is_mangled(const char *sym)
79{
80 size_t len, len_without_hash;
81
82 if (!sym)
83 return false;
84
85 len = strlen(sym);
86 if (len <= hash_prefix_len + hash_len)
87 /* Not long enough to contain "::h" + hash + something else */
88 return false;
89
90 len_without_hash = len - (hash_prefix_len + hash_len);
91 if (!is_prefixed_hash(sym + len_without_hash))
92 return false;
93
94 return looks_like_rust(sym, len_without_hash);
95}
96
97/*
98 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
99 * digits must comprise between 5 and 15 (inclusive) distinct digits.
100 */
101static bool is_prefixed_hash(const char *str)
102{
103 const char *end;
104 bool seen[16];
105 size_t i;
106 int count;
107
108 if (strncmp(str, hash_prefix, hash_prefix_len))
109 return false;
110 str += hash_prefix_len;
111
112 memset(seen, false, sizeof(seen));
113 for (end = str + hash_len; str < end; str++)
114 if (*str >= '0' && *str <= '9')
115 seen[*str - '0'] = true;
116 else if (*str >= 'a' && *str <= 'f')
117 seen[*str - 'a' + 10] = true;
118 else
119 return false;
120
121 /* Count how many distinct digits seen */
122 count = 0;
123 for (i = 0; i < 16; i++)
124 if (seen[i])
125 count++;
126
127 return count >= 5 && count <= 15;
128}
129
/*
 * Check that the first len bytes of str contain only what a mangled Rust
 * path may contain: a-zA-Z0-9, '_', ':', runs of at most two dots, and the
 * known $-escape sequences. Returns false at the first violation.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char * const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *cur = str;
	const char *limit = str + len;

	while (cur < limit) {
		char c = *cur;

		if (c == '$') {
			size_t matched = 0;
			size_t idx;

			/* No escape is a prefix of another, so order is irrelevant */
			for (idx = 0; idx < nr_escapes; idx++) {
				size_t seq_len = strlen(escapes[idx]);

				if (strncmp(cur, escapes[idx], seq_len) == 0) {
					matched = seq_len;
					break;
				}
			}
			if (matched == 0)
				return false;

			cur += matched;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (strncmp(cur, "...", 3) == 0)
				return false;
		} else if (!((c >= 'a' && c <= 'z') ||
			     (c >= 'A' && c <= 'Z') ||
			     (c >= '0' && c <= '9') ||
			     c == '_' || c == ':')) {
			return false;
		}

		cur++;
	}

	return true;
}
174
175/*
176 * INPUT:
177 * sym: symbol for which rust_is_mangled(sym) returns true
178 *
179 * The input is demangled in-place because the mangled name is always longer
180 * than the demangled one.
181 */
182void
183rust_demangle_sym(char *sym)
184{
185 const char *in;
186 char *out;
187 const char *end;
188
189 if (!sym)
190 return;
191
192 in = sym;
193 out = sym;
194 end = sym + strlen(sym) - (hash_prefix_len + hash_len);
195
196 while (in < end)
197 switch (*in) {
198 case '$':
199 if (!(unescape(&in, &out, "$C$", ',')
200 || unescape(&in, &out, "$SP$", '@')
201 || unescape(&in, &out, "$BP$", '*')
202 || unescape(&in, &out, "$RF$", '&')
203 || unescape(&in, &out, "$LT$", '<')
204 || unescape(&in, &out, "$GT$", '>')
205 || unescape(&in, &out, "$LP$", '(')
206 || unescape(&in, &out, "$RP$", ')')
207 || unescape(&in, &out, "$u20$", ' ')
208 || unescape(&in, &out, "$u27$", '\'')
209 || unescape(&in, &out, "$u5b$", '[')
210 || unescape(&in, &out, "$u5d$", ']')
211 || unescape(&in, &out, "$u7e$", '~'))) {
212 pr_err("demangle-rust: unexpected escape sequence");
213 goto done;
214 }
215 break;
216 case '_':
217 /*
218 * If this is the start of a path component and the next
219 * character is an escape sequence, ignore the
220 * underscore. The mangler inserts an underscore to make
221 * sure the path component begins with a XID_Start
222 * character.
223 */
224 if ((in == sym || in[-1] == ':') && in[1] == '$')
225 in++;
226 else
227 *out++ = *in++;
228 break;
229 case '.':
230 if (in[1] == '.') {
231 /* ".." becomes "::" */
232 *out++ = ':';
233 *out++ = ':';
234 in += 2;
235 } else {
236 /* "." becomes "-" */
237 *out++ = '-';
238 in++;
239 }
240 break;
241 case 'a' ... 'z':
242 case 'A' ... 'Z':
243 case '0' ... '9':
244 case ':':
245 *out++ = *in++;
246 break;
247 default:
248 pr_err("demangle-rust: unexpected character '%c' in symbol\n",
249 *in);
250 goto done;
251 }
252
253done:
254 *out = '\0';
255}
256
/*
 * If the input at *in starts with the escape sequence seq, write value to
 * *out, advance *in past the sequence and *out by one, and return true.
 * Otherwise leave both cursors alone and return false.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const char *src = *in;
	char *dst = *out;

	if (strncmp(src, seq, seq_len) != 0)
		return false;

	*dst = value;
	*out = dst + 1;
	*in = src + seq_len;

	return true;
}
1// SPDX-License-Identifier: GPL-2.0
2#include <string.h>
3#include "debug.h"
4
5#include "demangle-rust.h"
6
/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with a XID_Start character is prefixed with
 * "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */
44
/* Shape of the hash suffix this file looks for: "::h" + 16 hex digits. */
static const char *hash_prefix = "::h";
static const size_t hash_prefix_len = 3;	/* length of "::h" */
static const size_t hash_len = 16;		/* hex digits in the hash */

/* Helpers defined further down in this file. */
static bool is_prefixed_hash(const char *str);
static bool looks_like_rust(const char *str, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
52
53/*
54 * INPUT:
55 * sym: symbol that has been through BFD-demangling
56 *
57 * This function looks for the following indicators:
58 *
59 * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
60 *
61 * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
62 * hex digits. This is true of 99.9998% of hashes so once in your life you
63 * may see a false negative. The point is to notice path components that
64 * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
65 * this case a false positive (non-Rust symbol has an important path
66 * component removed because it looks like a Rust hash) is worse than a
67 * false negative (the rare Rust symbol is not demangled) so this sets the
68 * balance in favor of false negatives.
69 *
70 * 3. There must be no characters other than a-zA-Z0-9 and _.:$
71 *
72 * 4. There must be no unrecognized $-sign sequences.
73 *
74 * 5. There must be no sequence of three or more dots in a row ("...").
75 */
76bool
77rust_is_mangled(const char *sym)
78{
79 size_t len, len_without_hash;
80
81 if (!sym)
82 return false;
83
84 len = strlen(sym);
85 if (len <= hash_prefix_len + hash_len)
86 /* Not long enough to contain "::h" + hash + something else */
87 return false;
88
89 len_without_hash = len - (hash_prefix_len + hash_len);
90 if (!is_prefixed_hash(sym + len_without_hash))
91 return false;
92
93 return looks_like_rust(sym, len_without_hash);
94}
95
96/*
97 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
98 * digits must comprise between 5 and 15 (inclusive) distinct digits.
99 */
100static bool is_prefixed_hash(const char *str)
101{
102 const char *end;
103 bool seen[16];
104 size_t i;
105 int count;
106
107 if (strncmp(str, hash_prefix, hash_prefix_len))
108 return false;
109 str += hash_prefix_len;
110
111 memset(seen, false, sizeof(seen));
112 for (end = str + hash_len; str < end; str++)
113 if (*str >= '0' && *str <= '9')
114 seen[*str - '0'] = true;
115 else if (*str >= 'a' && *str <= 'f')
116 seen[*str - 'a' + 10] = true;
117 else
118 return false;
119
120 /* Count how many distinct digits seen */
121 count = 0;
122 for (i = 0; i < 16; i++)
123 if (seen[i])
124 count++;
125
126 return count >= 5 && count <= 15;
127}
128
/*
 * Check that the first len bytes of str contain only what a mangled Rust
 * path may contain: a-zA-Z0-9, '_', ':', runs of at most two dots, and the
 * known $-escape sequences. Returns false at the first violation.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char * const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *cur = str;
	const char *limit = str + len;

	while (cur < limit) {
		char c = *cur;

		if (c == '$') {
			size_t matched = 0;
			size_t idx;

			/* No escape is a prefix of another, so order is irrelevant */
			for (idx = 0; idx < nr_escapes; idx++) {
				size_t seq_len = strlen(escapes[idx]);

				if (strncmp(cur, escapes[idx], seq_len) == 0) {
					matched = seq_len;
					break;
				}
			}
			if (matched == 0)
				return false;

			cur += matched;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (strncmp(cur, "...", 3) == 0)
				return false;
		} else if (!((c >= 'a' && c <= 'z') ||
			     (c >= 'A' && c <= 'Z') ||
			     (c >= '0' && c <= '9') ||
			     c == '_' || c == ':')) {
			return false;
		}

		cur++;
	}

	return true;
}
173
174/*
175 * INPUT:
176 * sym: symbol for which rust_is_mangled(sym) returns true
177 *
178 * The input is demangled in-place because the mangled name is always longer
179 * than the demangled one.
180 */
181void
182rust_demangle_sym(char *sym)
183{
184 const char *in;
185 char *out;
186 const char *end;
187
188 if (!sym)
189 return;
190
191 in = sym;
192 out = sym;
193 end = sym + strlen(sym) - (hash_prefix_len + hash_len);
194
195 while (in < end)
196 switch (*in) {
197 case '$':
198 if (!(unescape(&in, &out, "$C$", ',')
199 || unescape(&in, &out, "$SP$", '@')
200 || unescape(&in, &out, "$BP$", '*')
201 || unescape(&in, &out, "$RF$", '&')
202 || unescape(&in, &out, "$LT$", '<')
203 || unescape(&in, &out, "$GT$", '>')
204 || unescape(&in, &out, "$LP$", '(')
205 || unescape(&in, &out, "$RP$", ')')
206 || unescape(&in, &out, "$u20$", ' ')
207 || unescape(&in, &out, "$u27$", '\'')
208 || unescape(&in, &out, "$u5b$", '[')
209 || unescape(&in, &out, "$u5d$", ']')
210 || unescape(&in, &out, "$u7e$", '~'))) {
211 pr_err("demangle-rust: unexpected escape sequence");
212 goto done;
213 }
214 break;
215 case '_':
216 /*
217 * If this is the start of a path component and the next
218 * character is an escape sequence, ignore the
219 * underscore. The mangler inserts an underscore to make
220 * sure the path component begins with a XID_Start
221 * character.
222 */
223 if ((in == sym || in[-1] == ':') && in[1] == '$')
224 in++;
225 else
226 *out++ = *in++;
227 break;
228 case '.':
229 if (in[1] == '.') {
230 /* ".." becomes "::" */
231 *out++ = ':';
232 *out++ = ':';
233 in += 2;
234 } else {
235 /* "." becomes "-" */
236 *out++ = '-';
237 in++;
238 }
239 break;
240 case 'a' ... 'z':
241 case 'A' ... 'Z':
242 case '0' ... '9':
243 case ':':
244 *out++ = *in++;
245 break;
246 default:
247 pr_err("demangle-rust: unexpected character '%c' in symbol\n",
248 *in);
249 goto done;
250 }
251
252done:
253 *out = '\0';
254}
255
/*
 * If the input at *in starts with the escape sequence seq, write value to
 * *out, advance *in past the sequence and *out by one, and return true.
 * Otherwise leave both cursors alone and return false.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const char *src = *in;
	char *dst = *out;

	if (strncmp(src, seq, seq_len) != 0)
		return false;

	*dst = value;
	*out = dst + 1;
	*in = src + seq_len;

	return true;
}