2 * Wildcard matching engine for use with SFTP-based file transfer
3 * programs (PSFTP, new-look PSCP): since SFTP has no notion of
4 * getting the remote side to do globbing (and rightly so) we have
5 * to do it locally, by retrieving all the filenames in a directory
6 * and checking each against the wildcard pattern.
16 * Definition of wildcard syntax:
18 * - * matches any sequence of characters, including zero.
19 * - ? matches exactly one character which can be anything.
20 * - [abc] matches exactly one character which is a, b or c.
21 * - [a-f] matches anything from a through f.
22 * - [^a-f] matches anything _except_ a through f.
23 * - [-_] matches - or _; [^-_] matches anything else. (The - is
24 * non-special if it occurs immediately after the opening
26 * - [a^] matches an a or a ^. (The ^ is non-special if it does
27 * _not_ occur immediately after the opening bracket.)
28 * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
29 * - All other characters are non-special and match themselves.
33 * The wildcard matching technique we use is very simple and
34 * potentially O(N^2) in running time, but I don't anticipate it
35 * being that bad in reality (particularly since N will be the size
36 * of a filename, which isn't all that much). Perhaps one day, once
37 * PuTTY has grown a regexp matcher for some other reason, I might
38 * come back and reimplement wildcards by translating them into
39 * regexps or directly into NFAs; but for the moment, in the
40 * absence of any other need for the NFA->DFA translation engine,
41 * anything more than the simplest possible wildcard matcher is
42 * vast code-size overkill.
44 * Essentially, these wildcards are much simpler than regexps in
45 * that they consist of a sequence of rigid fragments (? and [...]
46 * can never match more or less than one character) separated by
47 * asterisks. It is therefore extremely simple to look at a rigid
48 * fragment and determine whether or not it begins at a particular
49 * point in the test string; so we can search along the string
50 * until we find each fragment, then search for the next. As long
51 * as we find each fragment in the _first_ place it occurs, there
52 * will never be a danger of having to backpedal and try to find it
53 * again somewhere else.
57 WC_TRAILINGBACKSLASH = 1,
63 * Error reporting is done by returning various negative values
64 * from the wildcard routines. Passing any such value to wc_error
65 * will give a human-readable message.
67 const char *wc_error(int value)
71 case WC_TRAILINGBACKSLASH:
72 return "'\' occurred at end of string (expected another character)";
73 case WC_UNCLOSEDCLASS:
74 return "expected ']' to close character class";
76 return "character range was not terminated (']' just after '-')";
78 return "INTERNAL ERROR: unrecognised wildcard error number";
82 * This is the routine that tests a target string to see if an
83 * initial substring of it matches a fragment. If successful, it
84 * returns 1, and advances both `fragment' and `target' past the
85 * fragment and matching substring respectively. If unsuccessful it
86 * returns zero. If the wildcard fragment suffers a syntax error,
87 * it returns <0 and the precise value indexes into wc_error.
89 static int wc_match_fragment(const char **fragment, const char **target)
96 * The fragment terminates at either the end of the string, or
97 * the first (unescaped) *.
99 while (*f && *f != '*' && *t) {
101 * Extract one character from t, and one character's worth
102 * of pattern from f, and step along both. Return 0 if they
107 * Backslash, which means f[1] is to be treated as a
108 * literal character no matter what it is. It may not
109 * be the end of the string.
112 return -WC_TRAILINGBACKSLASH; /* error */
114 return 0; /* failed to match */
116 } else if (*f == '?') {
118 * Question mark matches anything.
121 } else if (*f == '[') {
125 * Open bracket introduces a character class.
134 f++; /* backslashes still work */
136 return -WC_UNCLOSEDCLASS; /* error again */
138 int lower, upper, ourchr;
139 lower = (unsigned char) *f++;
140 f++; /* eat the minus */
142 return -WC_INVALIDRANGE; /* different error! */
144 f++; /* backslashes _still_ work */
146 return -WC_UNCLOSEDCLASS; /* error again */
147 upper = (unsigned char) *f++;
148 ourchr = (unsigned char) *t;
150 int t = lower; lower = upper; upper = t;
152 if (ourchr >= lower && ourchr <= upper)
155 matched |= (*t == *f++);
158 if (invert == matched)
159 return 0; /* failed to match character class */
163 * Non-special character matches itself.
170 * Now we've done that, increment t past the character we
175 if (!*f || *f == '*') {
177 * We have reached the end of f without finding a mismatch;
178 * so we're done. Update the caller pointers and return 1.
185 * Otherwise, we must have reached the end of t before we
186 * reached the end of f; so we've failed. Return 0.
192 * This is the real wildcard matching routine. It returns 1 for a
193 * successful match, 0 for an unsuccessful match, and <0 for a
194 * syntax error in the wildcard.
196 int wc_match(const char *wildcard, const char *target)
201 * Every time we see a '*' _followed_ by a fragment, we just
202 * search along the string for a location at which the fragment
203 * matches. The only special case is when we see a fragment
204 * right at the start, in which case we just call the matching
205 * routine once and give up if it fails.
207 if (*wildcard != '*') {
208 ret = wc_match_fragment(&wildcard, &target);
210 return ret; /* pass back failure or error alike */
214 assert(*wildcard == '*');
215 while (*wildcard == '*')
219 * It's possible we've just hit the end of the wildcard
220 * after seeing a *, in which case there's no need to
221 * bother searching any more because we've won.
227 * Now `wildcard' points at the next fragment. So we
228 * attempt to match it against `target', and if that fails
229 * we increment `target' and try again, and so on. When we
230 * find we're about to try matching against the empty
231 * string, we give up and return 0.
235 const char *save_w = wildcard, *save_t = target;
237 ret = wc_match_fragment(&wildcard, &target);
240 return ret; /* syntax error */
242 if (ret > 0 && !*wildcard && *target) {
244 * Final special case - literally.
246 * This situation arises when we are matching a
247 * _terminal_ fragment of the wildcard (that is,
248 * there is nothing after it, e.g. "*a"), and it
249 * has matched _too early_. For example, matching
250 * "*a" against "parka" will match the "a" fragment
251 * against the _first_ a, and then (if it weren't
252 * for this special case) matching would fail
253 * because we're at the end of the wildcard but not
254 * at the end of the target string.
256 * In this case what we must do is measure the
257 * length of the fragment in the target (which is
258 * why we saved `target'), jump straight to that
259 * distance from the end of the string using
260 * strlen, and match the same fragment again there
261 * (which is why we saved `wildcard'). Then we
262 * return whatever that operation returns.
264 target = save_t + strlen(save_t) - (target - save_t);
266 return wc_match_fragment(&wildcard, &target);
279 * If we reach here, it must be because we successfully matched
280 * a fragment and then found ourselves right at the end of the
281 * wildcard. Hence, we return 1 if and only if we are also
282 * right at the end of the target.
284 return (*target ? 0 : 1);
288 * Another utility routine that translates a non-wildcard string
289 * into its raw equivalent by removing any escaping backslashes.
290 * Expects a target string buffer of anything up to the length of
291 * the original wildcard. You can also pass NULL as the output
292 * buffer if you're only interested in the return value.
294 * Returns 1 on success, or 0 if a wildcard character was
295 * encountered. In the latter case the output string MAY not be
296 * zero-terminated and you should not use it for anything!
298 int wc_unescape(char *output, const char *wildcard)
301 if (*wildcard == '\\') {
303 /* We are lenient about trailing backslashes in non-wildcards. */
306 *output++ = *wildcard;
309 } else if (*wildcard == '*' || *wildcard == '?' ||
310 *wildcard == '[' || *wildcard == ']') {
311 return 0; /* it's a wildcard! */
314 *output++ = *wildcard;
319 return 1; /* it's clean */
325 const char *wildcard;
330 const struct test fragment_tests[] = {
332 * We exhaustively unit-test the fragment matching routine
333 * itself, which should save us the need to test all its
334 * intricacies during the full wildcard tests.
340 {"ab[cd]", "abc", 1},
341 {"ab[cd]", "abd", 1},
342 {"ab[cd]", "abe", 0},
343 {"ab[^cd]", "abc", 0},
344 {"ab[^cd]", "abd", 0},
345 {"ab[^cd]", "abe", 1},
346 {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
351 {"ab[", "abc", -WC_UNCLOSEDCLASS},
352 {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
353 {"ab[c-]", "abb", -WC_INVALIDRANGE},
354 {"ab[c-e]", "abb", 0},
355 {"ab[c-e]", "abc", 1},
356 {"ab[c-e]", "abd", 1},
357 {"ab[c-e]", "abe", 1},
358 {"ab[c-e]", "abf", 0},
359 {"ab[e-c]", "abb", 0},
360 {"ab[e-c]", "abc", 1},
361 {"ab[e-c]", "abd", 1},
362 {"ab[e-c]", "abe", 1},
363 {"ab[e-c]", "abf", 0},
364 {"ab[^c-e]", "abb", 1},
365 {"ab[^c-e]", "abc", 0},
366 {"ab[^c-e]", "abd", 0},
367 {"ab[^c-e]", "abe", 0},
368 {"ab[^c-e]", "abf", 1},
369 {"ab[^e-c]", "abb", 1},
370 {"ab[^e-c]", "abc", 0},
371 {"ab[^e-c]", "abd", 0},
372 {"ab[^e-c]", "abe", 0},
373 {"ab[^e-c]", "abf", 1},
374 {"ab[a^]", "aba", 1},
375 {"ab[a^]", "ab^", 1},
376 {"ab[a^]", "abb", 0},
377 {"ab[^a^]", "aba", 0},
378 {"ab[^a^]", "ab^", 0},
379 {"ab[^a^]", "abb", 1},
380 {"ab[-c]", "ab-", 1},
381 {"ab[-c]", "abc", 1},
382 {"ab[-c]", "abd", 0},
383 {"ab[^-c]", "ab-", 0},
384 {"ab[^-c]", "abc", 0},
385 {"ab[^-c]", "abd", 1},
386 {"ab[\\[-\\]]", "abZ", 0},
387 {"ab[\\[-\\]]", "ab[", 1},
388 {"ab[\\[-\\]]", "ab\\", 1},
389 {"ab[\\[-\\]]", "ab]", 1},
390 {"ab[\\[-\\]]", "ab^", 0},
391 {"ab[^\\[-\\]]", "abZ", 1},
392 {"ab[^\\[-\\]]", "ab[", 0},
393 {"ab[^\\[-\\]]", "ab\\", 0},
394 {"ab[^\\[-\\]]", "ab]", 0},
395 {"ab[^\\[-\\]]", "ab^", 1},
396 {"ab[a-fA-F]", "aba", 1},
397 {"ab[a-fA-F]", "abF", 1},
398 {"ab[a-fA-F]", "abZ", 0},
401 const struct test full_tests[] = {
405 {"a*", "aardvark", 1},
412 {"?b*r?", "abracadabra", 1},
413 {"?b*r?", "abracadabr", 0},
414 {"?b*r?", "abracadabzr", 0},
424 for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
427 f = fragment_tests[i].wildcard;
428 t = fragment_tests[i].target;
429 eret = fragment_tests[i].expected_result;
430 aret = wc_match_fragment(&f, &t);
432 printf("failed test: /%s/ against /%s/ returned %d not %d\n",
433 fragment_tests[i].wildcard, fragment_tests[i].target,
440 for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
443 f = full_tests[i].wildcard;
444 t = full_tests[i].target;
445 eret = full_tests[i].expected_result;
446 aret = wc_match(f, t);
448 printf("failed test: /%s/ against /%s/ returned %d not %d\n",
449 full_tests[i].wildcard, full_tests[i].target,
456 printf("passed %d, failed %d\n", passes, fails);